Commit 688448db authored by silencealiang's avatar silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import pytest
import torch
import torch.nn.init as init
from megatron.core.models.common.embeddings.relative_pos_embedding import RelativePositionEmbedding
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from tests.unit_tests.test_utilities import Utils
class TestRelativePositionEmbedding:

    def setup_method(self):
        Utils.initialize_model_parallel(1, 1)
        model_parallel_cuda_manual_seed(123)
        self.num_heads = 12
        self.relative_pos_emb = RelativePositionEmbedding(
            bidirectional=True,
            init_method=init.normal_,
            num_attention_heads=self.num_heads,
            relative_attention_num_buckets=32,
            relative_attention_max_distance=128,
        )

    def teardown_method(self, method):
        del self.relative_pos_emb
        Utils.destroy_model_parallel()

    def test_constructor(self):
        assert isinstance(self.relative_pos_emb, RelativePositionEmbedding)

    def test_forward(self):
        self.query_seq_length = 512
        output = self.relative_pos_emb(self.query_seq_length, self.query_seq_length)
        assert output.shape[0] == 1
        assert output.shape[1] == self.num_heads
        assert output.shape[2] == self.query_seq_length
        assert output.shape[3] == self.query_seq_length
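# Illustrative sketch (not part of megatron.core): the test above exercises
# RelativePositionEmbedding with relative_attention_num_buckets=32 and
# relative_attention_max_distance=128. A minimal T5-style bucketing function shows what those
# two parameters usually control; the helper name relative_position_bucket is hypothetical.
import math

import torch


def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    bucket = torch.zeros_like(relative_position)
    if bidirectional:
        # Half of the buckets are reserved for positive offsets, half for negative ones.
        num_buckets //= 2
        bucket += (relative_position > 0).long() * num_buckets
        relative_position = torch.abs(relative_position)
    else:
        relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
    # Small offsets get exact buckets; larger offsets are binned logarithmically up to max_distance.
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    large = max_exact + (
        torch.log(relative_position.float().clamp(min=1) / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).long()
    large = torch.min(large, torch.full_like(large, num_buckets - 1))
    return bucket + torch.where(is_small, relative_position, large)


# The bias asserted in test_forward is then a lookup of these bucket ids in a
# [num_buckets, num_heads] table, permuted to [1, num_heads, query_len, key_len].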
......@@ -196,6 +196,7 @@ class TestRetroAttention:
            config.hidden_size,
        )

    @pytest.mark.flaky_in_dev
    def test_gpu_forward(self):
        for recompute_granularity in (None, 'selective'):
            for use_transformer_engine in (True, False):
......
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import os
from contextlib import nullcontext
import pytest
import torch
from megatron.core import dist_checkpointing
from megatron.core import parallel_state
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.transformer_block import TransformerBlock, get_num_layers_to_build
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer
from tests.unit_tests.test_utilities import Utils
......@@ -134,3 +133,99 @@ class TestParallelTransformerBlock:
        assert hidden_states.shape[0] == sequence_length
        assert hidden_states.shape[1] == micro_batch_size
        assert hidden_states.shape[2] == config.hidden_size
class TestPipelineParallelTransformerBlock:

    @pytest.mark.parametrize(
        "num_layers, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, "
        "include_embedding_in_pipeline_split, include_loss_in_pipeline_split, "
        "first_pipeline_num_layers, last_pipeline_num_layers, should_assert_error",
        [
            # Last pipeline stage has a specified number of layers
            (60, 5, None, False, False, None, 4, False),
            # Uneven PP: 6*[8] + [6] + [6] = 60 (see the layer-split sketch after this test)
            (60, 8, None, False, False, 6, 6, False),
            # Even PP
            (64, 4, None, False, False, None, None, False),
            # Even VPP
            (64, 4, 8, False, False, None, None, False),
            # First pipeline stage has a specified number of layers;
            # the remaining layers should be distributed evenly among the other stages
            (60, 6, None, False, False, 5, None, False),
            # Uneven distribution leading to an assertion error
            (101, 8, None, False, False, 13, 13, True),
            # Include embedding in pipeline split without virtual PP
            (63, 4, None, True, False, None, None, False),
            # Include loss in pipeline split without virtual PP
            (63, 4, None, False, True, None, None, False),
            # Include embedding and loss in pipeline split without virtual PP
            (62, 4, None, True, True, None, None, False),
            # Include embedding and loss with virtual PP
            (62, 4, 2, True, True, None, None, False),
            # num_layers not divisible by pipeline size without embedding/loss
            (65, 4, None, False, False, None, None, True),
            # num_layers not divisible by pipeline size with embedding/loss
            (65, 4, None, True, True, None, None, True),
            # Uneven distribution with specified first-pipeline layers, causing an error
            (61, 4, None, False, False, 12, None, True),
            # Too few layers for the number of pipeline stages
            (2, 4, None, False, False, None, None, True),
            # Uneven PP with embedding included (should assert per the code)
            (60, 6, None, True, False, 5, 5, True),
            # Virtual PP where num_layers is not divisible by the total number of virtual stages
            (50, 2, 7, False, False, None, None, True),
            # Edge case where the number of layers per virtual rank is zero
            (4, 4, 4, False, False, None, None, True),
        ],
    )
    @pytest.mark.flaky
    @pytest.mark.flaky_in_dev
    def test_layer_builder(
        self,
        num_layers,
        pipeline_model_parallel_size,
        virtual_pipeline_model_parallel_size,
        include_embedding_in_pipeline_split,
        include_loss_in_pipeline_split,
        first_pipeline_num_layers,
        last_pipeline_num_layers,
        should_assert_error,
    ):
        Utils.fake_initialize_model_parallel(
            tensor_model_parallel_size=1,
            pipeline_model_parallel_size=pipeline_model_parallel_size,
            virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        )
        context = (
            pytest.raises((AssertionError, ValueError)) if should_assert_error else nullcontext()
        )
        with context:
            transformer_config = TransformerConfig(
                num_layers=num_layers,
                pipeline_model_parallel_size=pipeline_model_parallel_size,
                virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
                include_embedding_in_pipeline_split=include_embedding_in_pipeline_split,
                include_loss_in_pipeline_split=include_loss_in_pipeline_split,
                first_pipeline_num_layers=first_pipeline_num_layers,
                last_pipeline_num_layers=last_pipeline_num_layers,
                pipeline_dtype=torch.bfloat16,
                hidden_size=128,
                num_attention_heads=16,
            )

            total_build_layers = 0
            for i in range(pipeline_model_parallel_size):
                parallel_state.set_pipeline_model_parallel_rank(i)
                if virtual_pipeline_model_parallel_size is not None:
                    for j in range(virtual_pipeline_model_parallel_size):
                        parallel_state.set_virtual_pipeline_model_parallel_rank(j)
                        num_layers_to_build = get_num_layers_to_build(transformer_config)
                        total_build_layers += num_layers_to_build
                else:
                    num_layers_to_build = get_num_layers_to_build(transformer_config)
                    total_build_layers += num_layers_to_build

            if not should_assert_error:
                assert (
                    total_build_layers == num_layers
                ), f"total build layers {total_build_layers} should be equal to num_layers {num_layers}"

        parallel_state.set_pipeline_model_parallel_world_size(None)
        parallel_state.set_virtual_pipeline_model_parallel_world_size(None)
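# Illustrative sketch (not the get_num_layers_to_build implementation): the arithmetic behind
# the uneven-PP cases in the parametrization above. With the first/last stage sizes pinned, the
# remaining layers must divide evenly across the free stages; split_layers is a hypothetical helper.
def split_layers(num_layers, pp_size, first=None, last=None):
    counts = [None] * pp_size
    if first is not None:
        counts[0] = first
    if last is not None:
        counts[-1] = last
    free = [i for i, c in enumerate(counts) if c is None]
    remaining = num_layers - sum(c for c in counts if c is not None)
    if not free:
        assert remaining == 0, "pinned stages must sum to num_layers"
        return counts
    assert remaining % len(free) == 0, "remaining layers must divide evenly across the free stages"
    for i in free:
        counts[i] = remaining // len(free)
    return counts


# split_layers(60, 8, first=6, last=6)  -> [6, 8, 8, 8, 8, 8, 8, 6], i.e. 6*[8] + [6] + [6] = 60
# split_layers(101, 8, first=13, last=13) raises: the remaining 75 layers cannot be split
# evenly over the 6 middle stages, matching the should_assert_error=True case above.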
......@@ -9,7 +9,10 @@ from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTenso
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer
from megatron.core.transformer.transformer_layer import (
    TransformerLayer,
    get_transformer_layer_offset,
)
from tests.unit_tests.test_utilities import Utils
......@@ -56,6 +59,10 @@ class TestParallelTransformerLayer:
        assert hidden_states.shape[1] == micro_batch_size
        assert hidden_states.shape[2] == config.hidden_size

    def test_get_layer_offset(self):
        config = self.parallel_transformer_layer.config
        assert get_transformer_layer_offset(config) == 0

    @pytest.mark.parametrize('order', ['tp-pp-dp', 'tp-dp-pp'])
    @pytest.mark.parametrize('tp_pp', [(4, 2), (1, 1), (8, 1), (2, 2)])
    def test_sharded_state_dict(self, tp_pp, order):
......
......@@ -129,26 +129,39 @@ def main():
                        dest='checking')

    known_args, _ = parser.parse_known_args()

    # Handle old arg values.
    def update_loader_saver(key):
        old_value = getattr(known_args, key)
        if old_value == "megatron":
            setattr(known_args, key, "legacy")
        if old_value == "mcore":
            setattr(known_args, key, "core")

    update_loader_saver("loader")
    update_loader_saver("saver")

    # Load loader/saver plugins.
    loader = load_plugin('loader', known_args.loader)
    saver = load_plugin('saver', known_args.saver)

    # Parse loader/saver args.
    loader.add_arguments(parser)
    saver.add_arguments(parser)
    args = parser.parse_args()

    ctx = mp.get_context("spawn")
    queue = ctx.Queue(maxsize=args.max_queue_size)
    # queue = mp.Queue(maxsize=args.max_queue_size)
    # Initialize queue
    queue = mp.Queue(maxsize=args.max_queue_size)

    # Start saver process.
    print("Starting saver...")
    saver_proc = ctx.Process(target=saver.save_checkpoint, args=(queue, args))
    # saver_proc = mp.Process(target=saver.save_checkpoint, args=(queue, args))
    saver_proc = mp.Process(target=saver.save_checkpoint, args=(queue, args))
    saver_proc.start()

    # Run loader.
    print("Starting loader...")
    loader.load_checkpoint(queue, args)

    # Finish saver process.
    print("Waiting for saver to complete...")
    saver_proc.join()
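# Illustrative sketch (function names are made up): main() above wires a plain producer/consumer
# pipeline. The loader pushes chunks of weights into a bounded queue while a separate saver
# process drains it; the real plugins later signal completion with queue.put("done"), as seen
# further down in this diff.
import multiprocessing as mp


def example_loader(queue):
    for name in ("embeddings", "layer_0", "layer_1"):
        queue.put({"name": name})  # producer: one message per chunk of weights
    queue.put("done")              # sentinel tells the saver to stop


def example_saver(queue):
    while True:
        msg = queue.get()
        if msg == "done":
            break
        print("saving", msg["name"])


if __name__ == "__main__":
    ctx = mp.get_context("spawn")   # spawn avoids inheriting forked CUDA state
    q = ctx.Queue(maxsize=4)        # bounded queue applies back-pressure to the loader
    saver = ctx.Process(target=example_saver, args=(q,))
    saver.start()
    example_loader(q)
    saver.join()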
......
......@@ -6,7 +6,7 @@ import sys
import torch
import types
from schema_mcore import get_model_schema
from schema_core import get_model_schema
from utils import print_memory_usage
......
......@@ -19,10 +19,10 @@ def add_arguments(parser):
# TODO(jbarker): Need assertion to make sure *exactly* one of these is used
    parser.add_argument('--model-size', type=str, required=True,
                        choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3-8B', 'llama3-70B', 'llama3-8Bf', 'llama3-70Bf', 'mistral-7B', 'mistral-7Bf', 'yi-34B', 'qwen2.5-7B', 'qwen2.5-72B', 'qwen2.5-7Bf', 'qwen2.5-72Bf'],
                        help='Model size can be `llama2-7B`, `llama2-13B`, `llama2-70B`, `llama3-8B`, `llama3-70B`, `mistral-7B`, `qwen2.5-7B`, `qwen2.5-72B` (for pretrained models), '
                             'and `llama2-7Bf`, `llama2-13Bf`, `llama2-70Bf`, `llama3-8Bf`, `llama3-70Bf`, `mistral-7Bf`, `qwen2.5-7Bf`, and `qwen2.5-72Bf` (for chat-finetuned models).')
                        choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3', 'mistral', 'yi-34B', 'qwen2.5'],
                        help='Select model size/type')
    parser.add_argument('--checkpoint-type', type=str, required=True,
                        choices=['meta', 'hf'],
                        help='Type of checkpoint to convert, options are "meta" or "hf"')
    parser.add_argument('--bf16', action='store_true', help='Whether to load weights in bf16.')
    parser.add_argument('--fp16', action='store_true', help='Whether to load weights in fp16.')
......@@ -53,17 +53,6 @@ NUM_SHARDS = {
"llama2-13Bf": 2,
"llama2-70B": 8,
"llama2-70Bf": 8,
"llama3-8B": 1,
"llama3-8Bf": 1,
"llama3-70B": 8,
"llama3-70Bf": 8,
"mistral-7B": 1,
"mistral-7Bf": 1,
"yi-34B": 8,
"qwen2.5-7B": 1,
"qwen2.5-7Bf": 1,
"qwen2.5-72B": 8,
"qwen2.5-72Bf": 8,
}
......@@ -84,14 +73,11 @@ def write_json(text, path):
# This conversion is adapted from
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path):
    if "llama2" in model_size:
        from transformers import LlamaConfig as ModelConfig
        from transformers import LlamaTokenizer, LlamaTokenizerFast
    elif "llama3" in model_size:
        from transformers import LlamaConfig as ModelConfig
    elif "mistral" in model_size:
        from transformers import MistralConfig as ModelConfig
    else:
        raise NotImplementedError(f"converting {model_size} is only supported using HuggingFace weights")

    # for backward compatibility, before you needed the repo to be called `my_repo/model_size`
    if not os.path.isfile(os.path.join(input_base_path, "params.json")):
......@@ -112,25 +98,18 @@ def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path):
    if base > 10000.0:
        max_position_embeddings = 32768 if "mistral" in model_size else 16384
    else:
        max_position_embeddings = 4096 if "mistral" in model_size else 2048
        max_position_embeddings = 4096

    if "llama2" in model_size:
        tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
    elif model_size in ["llama3", "mistral"]:
        tokenizer_class = transformers.AutoTokenizer.from_pretrained
    else:
        raise AttributeError(f"model_size={model_size} not supported")

    if tokenizer_path is not None:
        if "llama" in model_size:
            tokenizer = tokenizer_class(tokenizer_path)
        if "llama2" in model_size:
            tokenizer = tokenizer_class(tokenizer_path)
            tokenizer.save_pretrained(model_path)
            vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000
        elif "llama3" in model_size:
            vocab_size = 128256
        elif "mistral" in model_size:
            tokenizer = tokenizer_class.from_file(tokenizer_path)
            vocab_size = 32768
        else:
            raise AttributeError(f"model_size={model_size} is not supported")
......@@ -296,15 +275,21 @@ def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path):
    return model_path


def load_args_from_checkpoint(args):
def load_args_from_checkpoint(args, model_size):

    # Read Llama args.
    model_args_path = os.path.join(args.load, "config.json")
    with open(model_args_path) as f:
        model_args = json.load(f)

    # Update Megatron args.
    args.seq_length = 4096
    if "llama2" in model_size:
        # Correct bug in earlier conversion script.
        args.max_position_embeddings = 4096
    else:
        args.max_position_embeddings = model_args["max_position_embeddings"]
    args.hidden_size = model_args["hidden_size"]
    args.num_attention_heads = model_args["num_attention_heads"]
    args.num_layers = model_args["num_hidden_layers"]
......@@ -315,7 +300,7 @@ def load_args_from_checkpoint(args):
    args.swiglu = True
    args.normalization = "RMSNorm"
    args.add_bias_linear = False
    args.untie_embeddings_and_output_weights = True
    args.untie_embeddings_and_output_weights = not model_args.get("tie_word_embeddings", False)
    args.vocab_size = model_args["vocab_size"]
    args.padded_vocab_size = model_args["vocab_size"]
    args.ffn_hidden_size = model_args["intermediate_size"]
......@@ -334,6 +319,7 @@ def set_preprocess_state(args, model, hf_model):
def set_postprocess_state(args, model, hf_model):
    '''Set output layer & norm params.'''
    model.language_model.encoder.final_norm.weight.data.copy_(hf_model.model.norm.weight)
    if args.untie_embeddings_and_output_weights:
        model.language_model.output_layer.weight.data.copy_(hf_model.lm_head.weight)
......@@ -430,6 +416,7 @@ def _load_checkpoint(queue, args):
if args.checkpoint_type == "meta":
model_tmp_path = convert_to_hf(model_path=os.path.join(args.save_dir, 'tmp'), input_base_path=args.load_dir, model_size=args.model_size, tokenizer_path=args.tokenizer_model)
args.load_dir = model_tmp_path
args.tokenizer_model = model_tmp_path # point to HF tokenizer model
try:
from megatron.training.arguments import parse_args, validate_args
......@@ -466,7 +453,7 @@ def _load_checkpoint(queue, args):
    margs = parse_args()
    margs.tokenizer_model = args.tokenizer_model
    load_args_from_checkpoint(margs)
    load_args_from_checkpoint(margs, args.model_size)

    if "llama2" in args.model_size:
        margs.tokenizer_type = "Llama2Tokenizer"
......@@ -527,7 +514,7 @@ def _load_checkpoint(queue, args):
    mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size)
    mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size)
    mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size)
    # fused_kernels.load(margs)
    fused_kernels.load(margs)

    # Short aliases.
    tp_size = margs.tensor_model_parallel_size
......@@ -661,7 +648,7 @@ def _load_checkpoint(queue, args):
queue.put("done")
if args.checkpoint_type == "meta":
shutil.rmtree(os.path.join(args.save_dir, 'tmp'))
shutil.rmtree(os.path.join(args.load_dir))
def load_checkpoint(queue, args):
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from importlib.metadata import version
import os
from packaging.version import Version as PkgVersion
import sys
import torch
from importlib.metadata import version
from packaging.version import Version as PkgVersion
from schema_mcore import get_model_schema
from schema_core import get_model_schema
def add_arguments(parser):
......
......@@ -206,7 +206,7 @@ def save_checkpoint(queue, args):
    mpu.set_pipeline_model_parallel_world_size(args.target_pipeline_parallel_size)
    mpu.set_tensor_model_parallel_rank(0)
    mpu.set_pipeline_model_parallel_rank(0)
    # fused_kernels.load(margs)
    fused_kernels.load(margs)

    # Embeddings
    # -----------
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Mcore model schemas."""
"""Core model schemas."""
import typing as T
from schema_base import ModelSchema
def get_mcore_transformer_block_key(model_key):
def get_core_transformer_block_key(model_key):
    return {
        "GPT" : "decoder",
        "BERT" : "encoder",
    }[model_key]


class MCoreSchema(ModelSchema):
class CoreSchema(ModelSchema):

    def __init__(self, model_type, layer_schema):
        block_key = get_mcore_transformer_block_key(model_type)
        block_key = get_core_transformer_block_key(model_type)
        super().__init__({
            "embeddings" : {
                "pos" : "embedding.position_embeddings.weight",
......@@ -49,7 +49,7 @@ class MCoreSchema(ModelSchema):
        })


class MCoreLocalSchema(MCoreSchema):
class CoreLocalSchema(CoreSchema):

    def __init__(self, model_type):
        super().__init__(model_type, layer_schema={
......@@ -73,7 +73,7 @@ class MCoreLocalSchema(MCoreSchema):
        })


class MCoreTESchema(MCoreSchema):
class CoreTESchema(CoreSchema):

    def __init__(self, model_type):
        super().__init__(model_type, layer_schema={
......@@ -98,7 +98,7 @@ class MCoreTESchema(MCoreSchema):
        })


class MCoreMoETESchema(MCoreSchema):
class CoreMoETESchema(CoreSchema):

    def __init__(self, model_type, num_experts, expert_model_parallel_size):
        num_local_experts = num_experts // expert_model_parallel_size
......@@ -131,13 +131,13 @@ def get_model_schema(
transformer_impl: T.Literal["transformer_engine", "local"],
num_experts: T.Optional[int] = None,
expert_model_parallel_size: T.Optional[int] = None,
) -> MCoreSchema:
) -> CoreSchema:
if num_experts is not None and num_experts > 0:
# Only support TE setter for MOE
assert transformer_impl == "transformer_engine"
assert isinstance(expert_model_parallel_size, int)
return MCoreMoETESchema(model_type, num_experts, expert_model_parallel_size)
return CoreMoETESchema(model_type, num_experts, expert_model_parallel_size)
return {
"local" : MCoreLocalSchema,
"transformer_engine" : MCoreTESchema,
"local" : CoreLocalSchema,
"transformer_engine" : CoreTESchema,
}[transformer_impl](model_type)
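# Illustrative usage of the factory above (argument values are made up):
#
#   dense_schema = get_model_schema("GPT", transformer_impl="transformer_engine")
#   moe_schema = get_model_schema(
#       "GPT", transformer_impl="transformer_engine", num_experts=8, expert_model_parallel_size=2
#   )
#
# Requesting MoE with transformer_impl="local" trips the assertion above, since only the
# Transformer Engine layout is mapped for expert weights.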
File mode changed from 100644 to 100755
......@@ -110,7 +110,6 @@ def get_gpt_chunk_datasets(config):
            get_blend_from_list(args.valid_data_path),
            get_blend_from_list(args.test_data_path)
        ],
        renormalize_blend_weights=args.renormalize_blend_weights,
        split=config.retro_gpt_split,
        split_preprocessing=config.retro_gpt_split,
        path_to_cache=config.retro_gpt_data_cache_path,
......
File mode changed from 100644 to 100755