Unverified Commit deef9a3d authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks - Add deepseek megatron-lm benchmark (#713)



**Description**
Add deepseek megatron-lm benchmark.

---------
Co-authored-by: default avataryukirora <yuting.jiang@microsoft.com>
Co-authored-by: default avatarHongtao Zhang <garyworkzht@gmail.com>
Co-authored-by: default avatarHongtao Zhang <hongtaozhang@microsoft.com>
parent a56356d8
...@@ -41,25 +41,7 @@ def __init__(self, name, parameters=''): ...@@ -41,25 +41,7 @@ def __init__(self, name, parameters=''):
def add_parser_arguments(self): def add_parser_arguments(self):
"""Add the specified arguments.""" """Add the specified arguments."""
super().add_parser_arguments() super().add_parser_arguments()
self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.') # Model configs
self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.')
self._parser.add_argument(
'--vocab_url',
type=str,
required=False,
default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json',
help='Vocab URL.'
)
self._parser.add_argument(
'--merges_url',
type=str,
required=False,
default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt',
help='Merges URL.'
)
self._parser.add_argument(
'--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.'
)
self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.') self._parser.add_argument('--model_size', type=int, required=False, default=6.7, help='Model size.')
self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.') self._parser.add_argument('--num_layers', type=int, required=False, default=32, help='Number of layers.')
self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.') self._parser.add_argument('--hidden_size', type=int, required=False, default=4096, help='Hidden size.')
...@@ -102,6 +84,8 @@ def add_parser_arguments(self): ...@@ -102,6 +84,8 @@ def add_parser_arguments(self):
self._parser.add_argument( self._parser.add_argument(
'--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.' '--train_tokens', type=int, required=False, default=300000000000, help='Train tokens.'
) )
self._parser.add_argument('--lr_decay_samples', type=int, default=43945312, help='Use lr decay samples.')
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
# lr configs # lr configs
# Parallelism configs # Parallelism configs
self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.') self._parser.add_argument('--zero_stage', type=int, default=1, help='Zero stage.')
...@@ -119,14 +103,133 @@ def add_parser_arguments(self): ...@@ -119,14 +103,133 @@ def add_parser_arguments(self):
self._parser.add_argument( self._parser.add_argument(
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.' '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
) )
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument('--dataset_url', type=str, required=False, default=None, help='Dataset URL.')
self._parser.add_argument( self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' '--vocab_url',
type=str,
required=False,
default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json',
help='Vocab URL.'
)
self._parser.add_argument(
'--merges_url',
type=str,
required=False,
default='https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt',
help='Merges URL.'
) )
self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.') self._parser.add_argument('--data_impl', type=str, default='mmap', help='Data impl.')
self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.') self._parser.add_argument('--data_prefix', type=str, default='dataset_text_document', help='Data prefix.')
self._parser.add_argument('--mock_data', action='store_true', help='Use mock data.')
self._parser.add_argument(
'--dataloader_type',
type=str,
default=None,
help='Data loader type to load data.',
)
self._parser.add_argument(
'--max_padding_length',
type=int,
default=None,
help='Max padding legth to embedding.',
)
self._parser.add_argument(
'--data_cache_path',
type=str,
default=None,
help='Data cache path.',
)
self._parser.add_argument(
'--dataset',
type=str,
default=None,
help='Dataset to use.',
)
# Model architecture
self._parser.add_argument('--ffn_hidden_size', type=int, help='FFN hidden layer size.')
self._parser.add_argument('--swiglu', action='store_true', help='Enable SwiGLU activation.')
self._parser.add_argument('--no_bias_swiglu_fusion', action='store_true', help='Disable bias SwiGLU fusion.')
self._parser.add_argument('--disable_bias_linear', action='store_true', help='Disable bias in linear layers.')
self._parser.add_argument('--normalization', type=str, help='Normalization method.')
self._parser.add_argument('--norm_epsilon', type=float, help='Normalization epsilon.')
self._parser.add_argument(
'--untie_embeddings_and_output_weights', action='store_true', help='Untie embeddings and output weights.'
)
self._parser.add_argument('--extra_vocab_size', type=int, help='Extra vocabulary size.')
self._parser.add_argument('--transformer_impl', type=str, default=None, help='Transformer implementation.')
# Loss settings
self._parser.add_argument('--eod_mask_loss', action='store_true', help='Enable EOD mask loss.')
self._parser.add_argument('--hysteresis', type=int, default=2, help='Hysteresis for loss scale.')
# Optimizer
self._parser.add_argument(
'--optimizer',
type=str,
default='adam',
help='Optimizer to use. Current supported: "adam" and "fused_adam".',
)
self._parser.add_argument(
'--override_opt_param_scheduler', action='store_true', help='Enable the opt_param scheduler.'
)
# LoRA settings
self._parser.add_argument('--kv_lora_rank', type=int, help='KV LoRA rank.')
# MoE configuration
self._parser.add_argument(
'--expert_model_parallel_size',
type=int,
default=None,
help='Expert model parallel size.',
)
self._parser.add_argument(
'--num_experts',
type=int,
default=None,
help='Number of experts.',
)
self._parser.add_argument('--moe_ffn_hidden_size', type=int, help='MoE FFN hidden size.')
self._parser.add_argument('--enable_shared_expert', action='store_true', help='Enable shared expert in MoE.')
self._parser.add_argument('--moe_layer_freq', type=int, help='MoE layer frequency.')
self._parser.add_argument('--num_shared_experts', type=int, help='Number of shared experts.')
self._parser.add_argument('--moe_router_topk', type=int, help='Top-k routing for MoE.')
self._parser.add_argument('--moe_aux_loss_coeff', type=float, help='Auxiliary loss coefficient.')
self._parser.add_argument(
'--moe_router_load_balancing_type', type=str, help='Load balancing type for MoE router.'
)
# Tokenizer & Position Encoding
self._parser.add_argument(
'--tokenizer_type', type=str, required=False, default='GPT2BPETokenizer', help='Tokenizer type.'
)
self._parser.add_argument('--patch_tokenizer_type', type=str, help='Tokenizer type.')
self._parser.add_argument('--position_embedding_type', type=str, help='Position embedding type.')
self._parser.add_argument('--no_rope_fusion', action='store_true', help='Disable RoPE fusion.')
self._parser.add_argument('--rotary_base', type=int, help='Rotary base value.')
self._parser.add_argument('--rotary_scaling_factor', type=int, help='Rotary scaling factor.')
self._parser.add_argument('--qk_nope_head_dim', type=int, help='QK NoPE head dimension.')
self._parser.add_argument('--qk_rope_head_dim', type=int, help='QK RoPE head dimension.')
self._parser.add_argument('--v_head_dim', type=int, help='V head dimension.')
# Checkpoint and loading
self._parser.add_argument('--load', type=str, help='Model to load.')
self._parser.add_argument('--no_load_optim', action='store_true', help='Disable optimizer loading.')
self._parser.add_argument('--no_load_rng', action='store_true', help='Disable RNG loading.')
self._parser.add_argument('--ckpt_format', type=str, help='Checkpoint format.')
# Other settings
self._parser.add_argument('--code_base', type=str, required=False, default='', help='Code base.')
self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
)
self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.') self._parser.add_argument('--deepspeed', action='store_true', help='Use deepspeed.')
self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.') self._parser.add_argument('--extra', type=str, default=None, help='Extra options for Megatron.')
self._parser.add_argument(
'--model',
type=str,
default='gpt',
help='Model to run. Current supported: "gpt" and "deepseek".',
)
self._parser.add_argument(
'--train_mode',
type=str,
default=None,
help='Train mode to run. Current supported: "pretrain" and "finetune".',
)
def _preprocess(self): def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
...@@ -139,8 +242,9 @@ def _preprocess(self): ...@@ -139,8 +242,9 @@ def _preprocess(self):
else: else:
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM') self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
if not os.path.exists(self._args.code_base) or \ if not os.path.exists(self._args.code_base) or not os.path.exists(
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): os.path.join(self._args.code_base, f'pretrain_{self._args.model}.py')
):
logger.error('Code base is not valid.') logger.error('Code base is not valid.')
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT) self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
return False return False
...@@ -207,10 +311,11 @@ def __prepare_deespeed_config(self, precision_megatron): ...@@ -207,10 +311,11 @@ def __prepare_deespeed_config(self, precision_megatron):
'enabled': True, 'enabled': True,
'loss_scale': 0, 'loss_scale': 0,
'loss_scale_window': 500, 'loss_scale_window': 500,
'hysteresis': 2,
'min_loss_scale': 1, 'min_loss_scale': 1,
'initial_scale_power': 11 'initial_scale_power': 11
} }
if self._args.hysteresis is not None:
precision_template['hysteresis'] = self._args.hysteresis
ds_config_template = { ds_config_template = {
'train_batch_size': self._args.batch_size, 'train_batch_size': self._args.batch_size,
...@@ -242,6 +347,129 @@ def __prepare_deespeed_config(self, precision_megatron): ...@@ -242,6 +347,129 @@ def __prepare_deespeed_config(self, precision_megatron):
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
return deepspeed_options return deepspeed_options
def _append_parallel_flags(self, opts):
if self._args.sequence_parallel:
opts += ' --sequence-parallel'
if self._args.no_async_tensor_model_parallel_allreduce:
opts += ' --no-async-tensor-model-parallel-allreduce'
if self._args.pipeline_model_parallel_size > 1:
opts += f' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
return opts
def _append_architecture_flags(self, opts):
if self._args.swiglu:
opts += ' --swiglu'
if self._args.no_bias_swiglu_fusion:
opts += ' --no-bias-swiglu-fusion'
if self._args.disable_bias_linear:
opts += ' --disable-bias-linear'
if self._args.normalization:
opts += f' --normalization {self._args.normalization}'
if self._args.norm_epsilon:
opts += f' --norm-epsilon {self._args.norm_epsilon}'
if self._args.untie_embeddings_and_output_weights:
opts += ' --untie-embeddings-and-output-weights'
if self._args.transformer_impl:
opts += f' --transformer-impl {self._args.transformer_impl}'
if self._args.extra_vocab_size:
opts += f' --extra-vocab-size {self._args.extra_vocab_size}'
if self._args.ffn_hidden_size:
opts += f' --ffn-hidden-size {self._args.ffn_hidden_size}'
return opts
def _append_moe_flags(self, opts):
if self._args.moe_ffn_hidden_size:
opts += f' --moe-ffn-hidden-size {self._args.moe_ffn_hidden_size}'
if self._args.enable_shared_expert:
opts += ' --enable-shared-expert'
if self._args.moe_layer_freq:
opts += f' --moe-layer-freq {self._args.moe_layer_freq}'
if self._args.num_shared_experts:
opts += f' --num-shared-experts {self._args.num_shared_experts}'
if self._args.moe_router_topk:
opts += f' --moe-router-topk {self._args.moe_router_topk}'
if self._args.moe_aux_loss_coeff:
opts += f' --moe-aux-loss-coeff {self._args.moe_aux_loss_coeff}'
if self._args.moe_router_load_balancing_type:
opts += f' --moe-router-load-balancing-type {self._args.moe_router_load_balancing_type}'
if self._args.expert_model_parallel_size:
opts += f' --expert-model-parallel-size {self._args.expert_model_parallel_size}'
if self._args.num_experts:
opts += f' --num-experts {self._args.num_experts}'
return opts
def _append_optimizer_flags(self, opts):
if self._args.optimizer:
opts += f' --optimizer {self._args.optimizer}'
if getattr(self._args, 'override_opt_param_scheduler', True):
opts += ' --override-opt_param-scheduler'
if self._args.hysteresis is not None:
opts += f' --hysteresis {self._args.hysteresis}'
return opts
def _append_checkpoint_flags(self, opts):
if self._args.load:
opts += f' --load {self._args.load}'
if self._args.no_load_optim:
opts += ' --no-load-optim'
if self._args.no_load_rng:
opts += ' --no-load-rng'
if self._args.ckpt_format:
opts += f' --ckpt-format {self._args.ckpt_format}'
return opts
def _append_tokenizer_flags(self, opts):
args = self._args
# map of arg-attribute → flag string
flag_map = {
'tokenizer_type': '--tokenizer-type',
'patch_tokenizer_type': '--patch-tokenizer-type',
'position_embedding_type': '--position-embedding-type',
'rotary_base': '--rotary-base',
'rotary_scaling_factor': '--rotary-scaling-factor',
'qk_nope_head_dim': '--qk-nope-head-dim',
'qk_rope_head_dim': '--qk-rope-head-dim',
'v_head_dim': '--v-head-dim',
'kv_lora_rank': '--kv-lora-rank',
'no_rope_fusion': '--no-rope-fusion',
}
for attr, flag in flag_map.items():
val = getattr(args, attr, None)
if not val:
continue
# boolean flags get no value
if isinstance(val, bool):
opts += f' {flag}'
else:
opts += f' {flag} {val}'
return opts
def _append_misc_flags(self, opts):
if self._args.eod_mask_loss:
opts += ' --eod-mask-loss'
if self._args.use_rotary_position_embeddings:
opts += ' --use-rotary-position-embeddings'
if self._args.no_gradient_accumulation_fusion:
opts += ' --no-gradient-accumulation-fusion'
if self._args.use_flash_attn:
opts += ' --use-flash-attn'
if self._args.no_masked_softmax_fusion:
opts += ' --no-masked-softmax-fusion'
if self._args.no_bias_gelu_fusion:
opts += ' --no-bias-gelu-fusion'
if self._args.no_bias_dropout_fusion:
opts += ' --no-bias-dropout-fusion'
if self._args.train_mode:
opts += f' --train-mode {self._args.train_mode}'
if self._args.max_padding_length:
opts += f' --max-padding-length {self._args.max_padding_length}'
return opts
def _megatron_command(self, precision): # noqa: C901 def _megatron_command(self, precision): # noqa: C901
"""Generate megatron command.""" """Generate megatron command."""
if precision == Precision.FLOAT32: if precision == Precision.FLOAT32:
...@@ -252,12 +480,11 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -252,12 +480,11 @@ def _megatron_command(self, precision): # noqa: C901
precision_megatron = '--bf16' precision_megatron = '--bf16'
megatron_options = f'\ megatron_options = f'\
--override-opt_param-scheduler \
--adam-beta1 0.9 \ --adam-beta1 0.9 \
--adam-beta2 0.95 \ --adam-beta2 0.95 \
--tensor-model-parallel-size {self._args.tensor_model_parallel_size} \ --tensor-model-parallel-size {self._args.tensor_model_parallel_size} \
--init-method-std {self._args.init_std} \ --init-method-std {self._args.init_std} \
--lr-decay-samples 43945312 \ --lr-decay-samples {self._args.lr_decay_samples} \
--lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \ --lr-warmup-samples {self._args.num_warmup * self._args.batch_size} \
--lr-decay-style cosine \ --lr-decay-style cosine \
--micro-batch-size {self._args.micro_batch_size} \ --micro-batch-size {self._args.micro_batch_size} \
...@@ -270,54 +497,38 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -270,54 +497,38 @@ def _megatron_command(self, precision): # noqa: C901
--train-samples {self._args.num_steps * self._args.batch_size} \ --train-samples {self._args.num_steps * self._args.batch_size} \
--lr {self._args.lr} \ --lr {self._args.lr} \
--min-lr {self._args.min_lr} \ --min-lr {self._args.min_lr} \
--split {self._args.split} \
--log-interval {self._args.log_interval} \ --log-interval {self._args.log_interval} \
--eval-interval {self._args.eval_interval} \ --eval-interval {self._args.eval_interval} \
--eval-iters {self._args.eval_iters} \ --eval-iters {self._args.eval_iters} \
--save-interval {self._args.save_interval} \ --save-interval {self._args.save_interval} \
--weight-decay 0.1 \ --weight-decay 0.1 \
--clip-grad 1.0 \ --clip-grad 1.0 \
--hysteresis 2 \
--num-workers {self._args.num_workers} \ --num-workers {self._args.num_workers} \
--attention-dropout 0.0 \ --attention-dropout 0.0 \
--hidden-dropout 0.0 \ --hidden-dropout 0.0 \
--optimizer adam \
--use-distributed-optimizer \ --use-distributed-optimizer \
{precision_megatron} \ {precision_megatron} \
--seed {self._args.seed} \ --seed {self._args.seed} \
--log-throughput' --log-throughput'
if self._args.sequence_parallel: megatron_options = self._append_parallel_flags(megatron_options)
megatron_options = f'{megatron_options} --sequence-parallel' megatron_options = self._append_architecture_flags(megatron_options)
if self._args.no_async_tensor_model_parallel_allreduce: megatron_options = self._append_moe_flags(megatron_options)
megatron_options = f'{megatron_options} --no-async-tensor-model-parallel-allreduce' megatron_options = self._append_optimizer_flags(megatron_options)
if self._args.use_rotary_position_embeddings: megatron_options = self._append_checkpoint_flags(megatron_options)
megatron_options = f'{megatron_options} --use-rotary-position-embeddings' megatron_options = self._append_tokenizer_flags(megatron_options)
if self._args.no_gradient_accumulation_fusion: megatron_options = self._append_misc_flags(megatron_options)
megatron_options = f'{megatron_options} --no-gradient-accumulation-fusion'
if self._args.use_flash_attn:
megatron_options = f'{megatron_options} --use-flash-attn'
if self._args.no_masked_softmax_fusion:
megatron_options = f'{megatron_options} --no-masked-softmax-fusion'
if self._args.no_bias_gelu_fusion:
megatron_options = f'{megatron_options} --no-bias-gelu-fusion'
if self._args.no_bias_dropout_fusion:
megatron_options = f'{megatron_options} --no-bias-dropout-fusion'
if self._args.extra:
megatron_options = f'{megatron_options} {self._args.extra}'
command = '' script_path = os.path.join(self._args.code_base, f'pretrain_{self._args.model}.py')
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
if self._args.deepspeed: if self._args.deepspeed:
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
# No --log-throughput in Megatron-DeepSpeed by 20231219
megatron_options = megatron_options.replace('--log-throughput', '').strip() megatron_options = megatron_options.replace('--log-throughput', '').strip()
if self._num_nodes > 1: if self._num_nodes > 1:
command = f'torchrun {self._distributed_args} ' + \ command = f'torchrun {self._distributed_args} {script_path} \
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' {megatron_options} {self._data_options} {deepspeed_option}'
else: else:
command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}' command = f'deepspeed {script_path} {megatron_options} {self._data_options} {deepspeed_option}'
else: else:
command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}' command = f'torchrun {self._distributed_args} {script_path} {megatron_options} {self._data_options}'
...@@ -425,6 +636,11 @@ def _generate_dataset(self): ...@@ -425,6 +636,11 @@ def _generate_dataset(self):
Return: Return:
True if dataset is created successfully. True if dataset is created successfully.
""" """
self._data_options = ''
if self._args.mock_data:
logger.info('Using mock data.')
self._data_options = '--mock-data'
else:
self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json') self._vocab_path = str(Path(self._args.data_home) / 'gpt2-vocab.json')
download_file(self._args.vocab_url, self._vocab_path) download_file(self._args.vocab_url, self._vocab_path)
self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt') self._merges_path = str(Path(self._args.data_home) / 'gpt2-merges.txt')
...@@ -466,7 +682,15 @@ def _generate_dataset(self): ...@@ -466,7 +682,15 @@ def _generate_dataset(self):
--merge-file {self._merges_path} \ --merge-file {self._merges_path} \
--data-path {self._data_path}' --data-path {self._data_path}'
logger.info('Dataset preparation successfully.') if self._args.dataloader_type:
self._data_options += f' --dataloader-type {self._args.dataloader_type}'
if self._args.split:
self._data_options += f' --split {self._args.split}'
if self._args.data_cache_path:
self._data_options += f' --data-cache-path {self._args.data_cache_path}'
if self._args.dataset:
self._data_options += f' --dataset {self._args.dataset}'
return True return True
def _set_force_fp32(self): def _set_force_fp32(self):
...@@ -521,3 +745,54 @@ def _cal_params_count(self): ...@@ -521,3 +745,54 @@ def _cal_params_count(self):
# Register GPT3 benchmark. # Register GPT3 benchmark.
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA) BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM) BenchmarkRegistry.register_benchmark('megatron-gpt', MegatronGPT, parameters='', platform=Platform.ROCM)
BenchmarkRegistry.register_benchmark(
'megatron-deepseek-v2', MegatronGPT, parameters='--model=deepseek', platform=Platform.ROCM
)
BenchmarkRegistry.register_benchmark(
'megatron-deepseek-v2',
MegatronGPT,
parameters=(
'--model=deepseek '
'--tokenizer_type=DeepSeekV2Tokenizer '
'--transformer_impl=transformer_engine '
'--num_layers=27 '
'--hidden_size=1024 '
'--seq_len=4096 '
'--num_attn_heads=16 '
'--moe_ffn_hidden_size=1408 '
'--enable_shared_expert '
'--moe_layer_freq=1 '
'--num_shared_experts=2 '
'--moe_router_topk=6 '
'--moe_aux_loss_coeff=1e-2 '
'--moe_router_load_balancing_type=aux_loss '
'--num_experts=64 '
'--patch_tokenizer_type=DeepSeekV2Tokenizer '
'--position_embedding_type=rope '
'--no_rope_fusion '
'--rotary_base=10000 '
'--rotary_scaling_factor=40 '
'--qk_nope_head_dim=128 '
'--qk_rope_head_dim=64 '
'--v_head_dim=128 '
'--ffn_hidden_size=10944 '
'--swiglu '
'--normalization=RMSNorm '
'--norm_epsilon=1e-06 '
'--no_bias_swiglu_fusion '
'--disable_bias_linear '
'--untie_embeddings_and_output_weights '
'--extra_vocab_size=2400 '
'--load=deepseek-ai/DeepSeek-V2-Lite '
'--no_load_optim '
'--no_load_rng '
'--ckpt_format=torch '
'--eod_mask_loss '
'--train_mode=pretrain '
'--data_cache_path=/root/cache '
'--max_padding_length=4096 '
'--kv_lora_rank=512 '
'--dataloader_type=cyclic'
),
platform=Platform.ROCM
)
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import os import os
from pathlib import Path from pathlib import Path
import shlex
import statistics import statistics
from unittest import mock from unittest import mock
import unittest import unittest
...@@ -15,6 +16,26 @@ ...@@ -15,6 +16,26 @@
from tests.helper.testcase import BenchmarkTestCase from tests.helper.testcase import BenchmarkTestCase
def normalize_command(cmd):
"""Convert a CLI string into a list of meaningful argument units (key-value or flag)."""
tokens = shlex.split(cmd)
units = []
i = 0
while i < len(tokens):
if tokens[i].startswith('--'):
if i + 1 >= len(tokens) or tokens[i + 1].startswith('--'):
units.append(tokens[i]) # flag-only
i += 1
else:
units.append(f'{tokens[i]} {tokens[i + 1]}') # key-value pair
i += 2
else:
# Include positional args like torchrun, script path, etc.
units.append(tokens[i])
i += 1
return sorted(units)
class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase): class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for IBBenchmark benchmark.""" """Tests for IBBenchmark benchmark."""
@classmethod @classmethod
...@@ -170,17 +191,20 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -170,17 +191,20 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark = benchmark_cls( benchmark = benchmark_cls(
self.benchmark_name, self.benchmark_name,
parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document', --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \
--override_opt_param_scheduler',
) )
mock_generate_dataset.return_value = True mock_generate_dataset.return_value = True
benchmark._preprocess() benchmark._preprocess()
benchmark._data_options = f'\ benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \ --vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \ --merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document' --data-path {self._tmp_dir}/dataset_text_document \
--split 949,50,1'
script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py') script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
expected_command = 'torchrun {distributed_args} {script_path} \ expected_command_template = 'torchrun {distributed_args} {script_path} \
--tokenizer-type GPT2BPETokenizer \
--override-opt_param-scheduler \ --override-opt_param-scheduler \
--adam-beta1 0.9 \ --adam-beta1 0.9 \
--adam-beta2 0.95 \ --adam-beta2 0.95 \
...@@ -199,7 +223,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -199,7 +223,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-samples 20480 \ --train-samples 20480 \
--lr 0.00012 \ --lr 0.00012 \
--min-lr 1e-06 \ --min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \ --log-interval 1 \
--eval-interval 10 \ --eval-interval 10 \
--eval-iters 0 \ --eval-iters 0 \
...@@ -217,54 +240,58 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -217,54 +240,58 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--log-throughput {data_options}' --log-throughput {data_options}'
precision = Precision.FLOAT32 precision = Precision.FLOAT32
command = benchmark._megatron_command(precision) expected_command = expected_command_template.format(
self.assertEqual(
command,
expected_command.format(
precision='', precision='',
data_options=benchmark._data_options, data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args, distributed_args=benchmark._distributed_args,
script_path=script_path script_path=script_path
) )
)
precision = Precision.FLOAT16
command = benchmark._megatron_command(precision) command = benchmark._megatron_command(precision)
self.assertEqual( actual_units = normalize_command(command)
command, expected_units = normalize_command(expected_command)
expected_command.format( self.assertEqual(actual_units, expected_units)
precision = Precision.FLOAT16
expected_command = expected_command_template.format(
precision='--fp16', precision='--fp16',
data_options=benchmark._data_options, data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args, distributed_args=benchmark._distributed_args,
script_path=script_path script_path=script_path
) )
)
precision = Precision.BFLOAT16
command = benchmark._megatron_command(precision) command = benchmark._megatron_command(precision)
self.assertEqual( actual_units = normalize_command(command)
command, expected_units = normalize_command(expected_command)
expected_command.format( self.assertEqual(actual_units, expected_units)
precision = Precision.BFLOAT16
expected_command = expected_command_template.format(
precision='--bf16', precision='--bf16',
data_options=benchmark._data_options, data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args, distributed_args=benchmark._distributed_args,
script_path=script_path script_path=script_path
) )
) command = benchmark._megatron_command(precision)
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
os.environ['OMPI_COMM_WORLD_SIZE'] = '1' os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
benchmark = benchmark_cls( benchmark = benchmark_cls(
self.benchmark_name, self.benchmark_name,
parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed', --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \
--deepspeed --override_opt_param_scheduler',
) )
mock_generate_dataset.return_value = True
benchmark._preprocess() benchmark._preprocess()
benchmark._data_options = f'\ benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \ --vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \ --merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document' --data-path {self._tmp_dir}/dataset_text_document \
--split 949,50,1'
command = benchmark._megatron_command(Precision.BFLOAT16) command = benchmark._megatron_command(Precision.BFLOAT16)
expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \ expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
--tokenizer-type GPT2BPETokenizer \
--adam-beta1 0.9 \ --adam-beta1 0.9 \
--adam-beta2 0.95 \ --adam-beta2 0.95 \
--tensor-model-parallel-size 1 \ --tensor-model-parallel-size 1 \
...@@ -282,7 +309,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -282,7 +309,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-samples 20480 \ --train-samples 20480 \
--lr 0.00012 \ --lr 0.00012 \
--min-lr 1e-06 \ --min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \ --log-interval 1 \
--eval-interval 10 \ --eval-interval 10 \
--eval-iters 0 \ --eval-iters 0 \
...@@ -306,16 +332,174 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -306,16 +332,174 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-tokens 300000000000 \ --train-tokens 300000000000 \
--data-impl mmap --no-pipeline-parallel' --data-impl mmap --no-pipeline-parallel'
self.assertEqual( expected_command = expected_command.format(
command,
expected_command.format(
precision='--bf16', precision='--bf16',
data_options=benchmark._data_options, data_options=benchmark._data_options,
script_path=script_path, deepseed_options=expect_ds_options,
deepseed_options=expect_ds_options script_path=script_path
) )
command = benchmark._megatron_command(Precision.BFLOAT16)
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
    def test_deepseek_v2_command(self):
        """Test the megatron-deepseek-v2 benchmark command generation on ROCm.

        Registers a fake single-rank OpenMPI environment, instantiates the
        benchmark with a full set of DeepSeek-V2 model parameters, and checks
        that the generated torchrun command matches the expected Megatron-LM
        argument list (order-insensitive, via normalize_command).
        """
        # Fake a single-rank OpenMPI launch so _preprocess can derive the
        # distributed args without a real MPI environment.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # Minimal one-host hostfile consumed by the benchmark's launcher setup.
        with open(self.hostfile_path, 'w') as f:
            f.write('host1\n')
        benchmark_name = 'megatron-deepseek-v2'
        # NOTE(review): the registry lookup uses self.benchmark_name rather than
        # the local benchmark_name defined above — confirm this is intentional.
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM)
        assert (benchmark_cls)
        benchmark = benchmark_cls(
            benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} '
            '--num_warmup 0 '
            '--num_steps 10 '
            '--batch_size 256 '
            '--expert_model_parallel_size 8 '
            '--micro_batch_size 2 '
            '--mock_data '
            '--model=deepseek '
            '--tokenizer_type=DeepSeekV2Tokenizer '
            '--transformer_impl=transformer_engine '
            '--num_layers=27 '
            '--hidden_size=1024 '
            '--seq_len=4096 '
            '--ffn_hidden_size=10944 '
            '--num_attn_heads=16 '
            '--moe_ffn_hidden_size=1408 '
            '--enable_shared_expert '
            '--moe_layer_freq=1 '
            '--num_shared_experts=2 '
            '--moe_router_topk=6 '
            '--moe_aux_loss_coeff=0.01 '
            '--moe_router_load_balancing_type=aux_loss '
            '--num_experts=64 '
            '--patch_tokenizer_type=DeepSeekV2Tokenizer '
            '--position_embedding_type=rope '
            '--no_rope_fusion '
            '--rotary_base=10000 '
            '--rotary_scaling_factor=40 '
            '--qk_nope_head_dim=128 '
            '--qk_rope_head_dim=64 '
            '--v_head_dim=128 '
            '--ffn_hidden_size=10944 '
            '--swiglu '
            '--normalization=RMSNorm '
            '--norm_epsilon=1e-06 '
            '--no_bias_swiglu_fusion '
            '--disable_bias_linear '
            '--untie_embeddings_and_output_weights '
            '--extra_vocab_size=2400 '
            '--load=deepseek-ai/DeepSeek-V2-Lite '
            '--no_load_optim '
            '--no_load_rng '
            '--ckpt_format=torch '
            '--eod_mask_loss '
            '--train_mode=pretrain '
            '--data_cache_path=/root/cache '
            '--max_padding_length=4096 '
            '--kv_lora_rank=512 '
            '--dataloader_type=cyclic '
        )
        benchmark._preprocess()
        # Stub out the data options normally produced by dataset preparation,
        # since --mock_data means no real dataset is generated in this test.
        benchmark._data_options = '\
--mock-data \
--dataloader-type cyclic \
--data-cache-path /root/cache \
--dataset LLama-Pretrain-Idxmap'
        precision = Precision.BFLOAT16
        command = benchmark._megatron_command(precision)
        # Expected torchrun invocation; backslash-newlines inside the literal
        # keep it one logical string. {placeholders} are filled via .format below.
        expected_command = (
            'torchrun {script_path} --bf16 \
--init-method-std 0.009 \
--adam-beta1 0.9 \
--hidden-dropout 0.0 \
--min-lr 1e-06 \
--lr 0.00012 \
--optimizer adam \
--log-interval 1 \
--eval-interval 10 \
--seed 1234 \
--eval-iters 0 \
--max-position-embeddings 4096 \
--hysteresis 2 \
--lr-decay-style cosine \
--lr-decay-samples 43945312 \
--clip-grad 1.0 \
--save-interval 10000 \
--adam-beta2 0.95 \
--moe-aux-loss-coeff 0.01 \
--log-throughput \
--num-workers 8 \
--use-distributed-optimizer \
--attention-dropout 0.0 \
--tensor-model-parallel-size 1 \
--lr-warmup-samples 0 \
--weight-decay 0.1 \
--train-samples 2560 \
--no-load-optim \
--load deepseek-ai/DeepSeek-V2-Lite \
--no-load-rng \
--ffn-hidden-size 10944 \
--patch-tokenizer-type DeepSeekV2Tokenizer \
--swiglu \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--no-bias-swiglu-fusion \
--no-rope-fusion \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--ckpt-format torch \
--rotary-base 10000 \
--rotary-scaling-factor 40 \
--eod-mask-loss \
--moe-ffn-hidden-size 1408 \
--enable-shared-expert \
--moe-layer-freq 1 \
--num-shared-experts 2 \
--moe-router-topk 6 \
--kv-lora-rank 512 \
--qk-nope-head-dim 128 \
--qk-rope-head-dim 64 \
--v-head-dim 128 \
--moe-router-load-balancing-type aux_loss \
--train-mode pretrain \
--extra-vocab-size 2400 \
--global-batch-size 256 \
--micro-batch-size 2 \
--num-layers 27 \
--hidden-size 1024 \
--seq-length 4096 \
--num-attention-heads 16 \
--tokenizer-type DeepSeekV2Tokenizer \
--transformer-impl transformer_engine \
--num-experts 64 \
--expert-model-parallel-size 8 \
--max-padding-length 4096 \
{data_options} \
{disitributed_args}'
        ).format(
            script_path=str(Path(self._tmp_dir) / 'pretrain_deepseek.py'),
            data_options=benchmark._data_options,
            disitributed_args=benchmark._distributed_args
        )
        # Compare as normalized argument units so ordering differences between
        # the generated and expected commands do not cause false failures.
        actual_units = normalize_command(command)
        expected_units = normalize_command(expected_command)
        self.assertEqual(actual_units, expected_units)
@decorator.load_data('tests/data/megatron_deepspeed.log') @decorator.load_data('tests/data/megatron_deepspeed.log')
@mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
def test_megatron_parse_log(self, raw_output, mock_generate_dataset): def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
......
...@@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0") ...@@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
# Build targets. # Build targets.
all: cuda rocm all: cuda rocm
cuda_with_msccl: cuda cuda_msccl cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
cpu: common cpu_perftest cpu: common cpu_perftest
common: fio cpu_stream common: fio cpu_stream
...@@ -230,6 +230,18 @@ megatron_deepspeed: ...@@ -230,6 +230,18 @@ megatron_deepspeed:
python -m pip install --no-cache-dir -r requirements.txt && \ python -m pip install --no-cache-dir -r requirements.txt && \
python -m pip install DeepSpeed python -m pip install DeepSpeed
# Build ROCm Megatron-LM and its grouped_gemm dependency for the deepseek benchmark.
rocm_megatron_lm:
	cd Megatron && mkdir -p rocm && cd rocm && \
	if [ ! -d "Megatron-LM" ]; then \
		git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \
	fi
	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
	# Guard the clone so re-running the target does not fail on an existing directory.
	if [ ! -d "grouped_gemm" ]; then \
		git clone https://github.com/caaatch22/grouped_gemm.git ; \
	fi
	# Pin to a known-good commit; keep cd chained since each recipe line runs in its own shell.
	cd grouped_gemm && \
	git checkout 8a9b438 && \
	git submodule update --init --recursive && \
	pip install .
# Instal apex of ROCm due to dependency of Megatron # Instal apex of ROCm due to dependency of Megatron
apex_rocm: apex_rocm:
$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)")) $(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment