Unverified Commit deef9a3d authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks - Add deepseek megatron-lm benchmark (#713)



**Description**
Add deepseek megatron-lm benchmark.

---------
Co-authored-by: yukirora <yuting.jiang@microsoft.com>
Co-authored-by: Hongtao Zhang <garyworkzht@gmail.com>
Co-authored-by: Hongtao Zhang <hongtaozhang@microsoft.com>
parent a56356d8
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import os import os
from pathlib import Path from pathlib import Path
import shlex
import statistics import statistics
from unittest import mock from unittest import mock
import unittest import unittest
...@@ -15,6 +16,26 @@ ...@@ -15,6 +16,26 @@
from tests.helper.testcase import BenchmarkTestCase from tests.helper.testcase import BenchmarkTestCase
def normalize_command(cmd):
    """Split a CLI string into sorted, order-independent argument units.

    A unit is one of:
      * ``--key value`` — an option followed by a non-option token,
      * ``--flag``      — an option with no value (next token is another option or absent),
      * a positional token (launcher name, script path, ...).

    Sorting the units lets two command lines be compared regardless of
    argument order.
    """
    tokens = shlex.split(cmd)
    units = []
    total = len(tokens)
    idx = 0
    while idx < total:
        token = tokens[idx]
        if not token.startswith('--'):
            # Positional argument such as 'torchrun' or the script path.
            units.append(token)
            idx += 1
            continue
        nxt = tokens[idx + 1] if idx + 1 < total else None
        if nxt is None or nxt.startswith('--'):
            # Standalone flag: no value follows.
            units.append(token)
            idx += 1
        else:
            # Option with its value, kept together as one unit.
            units.append(f'{token} {nxt}')
            idx += 2
    return sorted(units)
class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase): class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for IBBenchmark benchmark.""" """Tests for IBBenchmark benchmark."""
@classmethod @classmethod
...@@ -170,17 +191,20 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -170,17 +191,20 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark = benchmark_cls( benchmark = benchmark_cls(
self.benchmark_name, self.benchmark_name,
parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document', --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \
--override_opt_param_scheduler',
) )
mock_generate_dataset.return_value = True mock_generate_dataset.return_value = True
benchmark._preprocess() benchmark._preprocess()
benchmark._data_options = f'\ benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \ --vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \ --merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document' --data-path {self._tmp_dir}/dataset_text_document \
--split 949,50,1'
script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py') script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
expected_command = 'torchrun {distributed_args} {script_path} \ expected_command_template = 'torchrun {distributed_args} {script_path} \
--tokenizer-type GPT2BPETokenizer \
--override-opt_param-scheduler \ --override-opt_param-scheduler \
--adam-beta1 0.9 \ --adam-beta1 0.9 \
--adam-beta2 0.95 \ --adam-beta2 0.95 \
...@@ -199,7 +223,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -199,7 +223,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-samples 20480 \ --train-samples 20480 \
--lr 0.00012 \ --lr 0.00012 \
--min-lr 1e-06 \ --min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \ --log-interval 1 \
--eval-interval 10 \ --eval-interval 10 \
--eval-iters 0 \ --eval-iters 0 \
...@@ -217,54 +240,58 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -217,54 +240,58 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--log-throughput {data_options}' --log-throughput {data_options}'
precision = Precision.FLOAT32 precision = Precision.FLOAT32
command = benchmark._megatron_command(precision) expected_command = expected_command_template.format(
self.assertEqual( precision='',
command, data_options=benchmark._data_options,
expected_command.format( distributed_args=benchmark._distributed_args,
precision='', script_path=script_path
data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args,
script_path=script_path
)
) )
precision = Precision.FLOAT16
command = benchmark._megatron_command(precision) command = benchmark._megatron_command(precision)
self.assertEqual( actual_units = normalize_command(command)
command, expected_units = normalize_command(expected_command)
expected_command.format( self.assertEqual(actual_units, expected_units)
precision='--fp16',
data_options=benchmark._data_options, precision = Precision.FLOAT16
distributed_args=benchmark._distributed_args, expected_command = expected_command_template.format(
script_path=script_path precision='--fp16',
) data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args,
script_path=script_path
) )
precision = Precision.BFLOAT16
command = benchmark._megatron_command(precision) command = benchmark._megatron_command(precision)
self.assertEqual( actual_units = normalize_command(command)
command, expected_units = normalize_command(expected_command)
expected_command.format( self.assertEqual(actual_units, expected_units)
precision='--bf16',
data_options=benchmark._data_options, precision = Precision.BFLOAT16
distributed_args=benchmark._distributed_args, expected_command = expected_command_template.format(
script_path=script_path precision='--bf16',
) data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args,
script_path=script_path
) )
command = benchmark._megatron_command(precision)
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
os.environ['OMPI_COMM_WORLD_SIZE'] = '1' os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
benchmark = benchmark_cls( benchmark = benchmark_cls(
self.benchmark_name, self.benchmark_name,
parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \ parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed', --num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \
--deepspeed --override_opt_param_scheduler',
) )
mock_generate_dataset.return_value = True
benchmark._preprocess() benchmark._preprocess()
benchmark._data_options = f'\ benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \ --vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \ --merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document' --data-path {self._tmp_dir}/dataset_text_document \
--split 949,50,1'
command = benchmark._megatron_command(Precision.BFLOAT16) command = benchmark._megatron_command(Precision.BFLOAT16)
expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \ expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
--tokenizer-type GPT2BPETokenizer \
--adam-beta1 0.9 \ --adam-beta1 0.9 \
--adam-beta2 0.95 \ --adam-beta2 0.95 \
--tensor-model-parallel-size 1 \ --tensor-model-parallel-size 1 \
...@@ -282,7 +309,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -282,7 +309,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-samples 20480 \ --train-samples 20480 \
--lr 0.00012 \ --lr 0.00012 \
--min-lr 1e-06 \ --min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \ --log-interval 1 \
--eval-interval 10 \ --eval-interval 10 \
--eval-iters 0 \ --eval-iters 0 \
...@@ -306,15 +332,173 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -306,15 +332,173 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-tokens 300000000000 \ --train-tokens 300000000000 \
--data-impl mmap --no-pipeline-parallel' --data-impl mmap --no-pipeline-parallel'
self.assertEqual( expected_command = expected_command.format(
command, precision='--bf16',
expected_command.format( data_options=benchmark._data_options,
precision='--bf16', deepseed_options=expect_ds_options,
data_options=benchmark._data_options, script_path=script_path
script_path=script_path,
deepseed_options=expect_ds_options
)
) )
command = benchmark._megatron_command(Precision.BFLOAT16)
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
    def test_deepseek_v2_command(self):
        """Test the generated command line for the megatron-deepseek-v2 benchmark.

        Builds a ROCm DeepSeek-V2 benchmark instance from CLI-style parameters,
        then compares the command produced by _megatron_command() against an
        expected template using normalize_command(), so argument order does not
        affect the comparison.
        """
        # test deepspeed with megatron
        # Single-rank OpenMPI environment so _preprocess() resolves a
        # one-process local launch.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # Single-host hostfile consumed by the benchmark's preprocessing.
        with open(self.hostfile_path, 'w') as f:
            f.write('host1\n')
        benchmark_name = 'megatron-deepseek-v2'
        # NOTE(review): the registry lookup uses self.benchmark_name while the
        # instance below is constructed with the local benchmark_name — confirm
        # the mismatch is intentional (both may map to the same benchmark class).
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM)
        assert (benchmark_cls)
        # DeepSeek-V2-Lite style configuration: MoE options, MLA head dims,
        # rope scaling, and mock data so no real dataset is required.
        benchmark = benchmark_cls(
            benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} '
            '--num_warmup 0 '
            '--num_steps 10 '
            '--batch_size 256 '
            '--expert_model_parallel_size 8 '
            '--micro_batch_size 2 '
            '--mock_data '
            '--model=deepseek '
            '--tokenizer_type=DeepSeekV2Tokenizer '
            '--transformer_impl=transformer_engine '
            '--num_layers=27 '
            '--hidden_size=1024 '
            '--seq_len=4096 '
            '--ffn_hidden_size=10944 '
            '--num_attn_heads=16 '
            '--moe_ffn_hidden_size=1408 '
            '--enable_shared_expert '
            '--moe_layer_freq=1 '
            '--num_shared_experts=2 '
            '--moe_router_topk=6 '
            '--moe_aux_loss_coeff=0.01 '
            '--moe_router_load_balancing_type=aux_loss '
            '--num_experts=64 '
            '--patch_tokenizer_type=DeepSeekV2Tokenizer '
            '--position_embedding_type=rope '
            '--no_rope_fusion '
            '--rotary_base=10000 '
            '--rotary_scaling_factor=40 '
            '--qk_nope_head_dim=128 '
            '--qk_rope_head_dim=64 '
            '--v_head_dim=128 '
            '--ffn_hidden_size=10944 '
            '--swiglu '
            '--normalization=RMSNorm '
            '--norm_epsilon=1e-06 '
            '--no_bias_swiglu_fusion '
            '--disable_bias_linear '
            '--untie_embeddings_and_output_weights '
            '--extra_vocab_size=2400 '
            '--load=deepseek-ai/DeepSeek-V2-Lite '
            '--no_load_optim '
            '--no_load_rng '
            '--ckpt_format=torch '
            '--eod_mask_loss '
            '--train_mode=pretrain '
            '--data_cache_path=/root/cache '
            '--max_padding_length=4096 '
            '--kv_lora_rank=512 '
            '--dataloader_type=cyclic '
        )
        benchmark._preprocess()
        # Override the data options with a fixed mock-data stanza so the
        # expected command below is fully deterministic.
        benchmark._data_options = '\
            --mock-data \
            --dataloader-type cyclic \
            --data-cache-path /root/cache \
            --dataset LLama-Pretrain-Idxmap'
        precision = Precision.BFLOAT16
        command = benchmark._megatron_command(precision)
        # Expected torchrun command. Argument ORDER here is irrelevant:
        # normalize_command() sorts the units before comparison.
        # NOTE(review): 'disitributed_args' is misspelled but used consistently
        # in both the template and .format() kwargs, so it works — consider
        # renaming to 'distributed_args' for consistency with other tests.
        expected_command = (
            'torchrun {script_path} --bf16 \
            --init-method-std 0.009 \
            --adam-beta1 0.9 \
            --hidden-dropout 0.0 \
            --min-lr 1e-06 \
            --lr 0.00012 \
            --optimizer adam \
            --log-interval 1 \
            --eval-interval 10 \
            --seed 1234 \
            --eval-iters 0 \
            --max-position-embeddings 4096 \
            --hysteresis 2 \
            --lr-decay-style cosine \
            --lr-decay-samples 43945312 \
            --clip-grad 1.0 \
            --save-interval 10000 \
            --adam-beta2 0.95 \
            --moe-aux-loss-coeff 0.01 \
            --log-throughput \
            --num-workers 8 \
            --use-distributed-optimizer \
            --attention-dropout 0.0 \
            --tensor-model-parallel-size 1 \
            --lr-warmup-samples 0 \
            --weight-decay 0.1 \
            --train-samples 2560 \
            --no-load-optim \
            --load deepseek-ai/DeepSeek-V2-Lite \
            --no-load-rng \
            --ffn-hidden-size 10944 \
            --patch-tokenizer-type DeepSeekV2Tokenizer \
            --swiglu \
            --normalization RMSNorm \
            --norm-epsilon 1e-06 \
            --no-bias-swiglu-fusion \
            --no-rope-fusion \
            --position-embedding-type rope \
            --untie-embeddings-and-output-weights \
            --disable-bias-linear \
            --ckpt-format torch \
            --rotary-base 10000 \
            --rotary-scaling-factor 40 \
            --eod-mask-loss \
            --moe-ffn-hidden-size 1408 \
            --enable-shared-expert \
            --moe-layer-freq 1 \
            --num-shared-experts 2 \
            --moe-router-topk 6 \
            --kv-lora-rank 512 \
            --qk-nope-head-dim 128 \
            --qk-rope-head-dim 64 \
            --v-head-dim 128 \
            --moe-router-load-balancing-type aux_loss \
            --train-mode pretrain \
            --extra-vocab-size 2400 \
            --global-batch-size 256 \
            --micro-batch-size 2 \
            --num-layers 27 \
            --hidden-size 1024 \
            --seq-length 4096 \
            --num-attention-heads 16 \
            --tokenizer-type DeepSeekV2Tokenizer \
            --transformer-impl transformer_engine \
            --num-experts 64 \
            --expert-model-parallel-size 8 \
            --max-padding-length 4096 \
            {data_options} \
            {disitributed_args}'
        ).format(
            script_path=str(Path(self._tmp_dir) / 'pretrain_deepseek.py'),
            data_options=benchmark._data_options,
            disitributed_args=benchmark._distributed_args
        )
        # Order-insensitive comparison of the generated vs. expected command.
        actual_units = normalize_command(command)
        expected_units = normalize_command(expected_command)
        self.assertEqual(actual_units, expected_units)
@decorator.load_data('tests/data/megatron_deepspeed.log') @decorator.load_data('tests/data/megatron_deepspeed.log')
@mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset') @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
......
...@@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0") ...@@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
# Build targets. # Build targets.
all: cuda rocm all: cuda rocm
cuda_with_msccl: cuda cuda_msccl cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
cpu: common cpu_perftest cpu: common cpu_perftest
common: fio cpu_stream common: fio cpu_stream
...@@ -230,6 +230,18 @@ megatron_deepspeed: ...@@ -230,6 +230,18 @@ megatron_deepspeed:
python -m pip install --no-cache-dir -r requirements.txt && \ python -m pip install --no-cache-dir -r requirements.txt && \
python -m pip install DeepSpeed python -m pip install DeepSpeed
# Build the ROCm fork of Megatron-LM plus the grouped_gemm extension needed
# for the DeepSeek MoE benchmark. Clones Megatron-LM (rocm_dev branch) only
# if not already present, then copies the DeepSeek-V2 pretrain entry script
# to the repo root so the benchmark can launch it directly.
rocm_megatron_lm:
	cd Megatron && mkdir -p rocm && cd rocm && \
	if [ ! -d "Megatron-LM" ]; then \
		git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \
	fi
	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
# NOTE(review): unlike Megatron-LM above, the grouped_gemm clone is not
# guarded by an existence check, so re-running this target fails once the
# directory exists — confirm whether an 'if [ ! -d ... ]' guard is wanted.
	git clone https://github.com/caaatch22/grouped_gemm.git &&\
	cd grouped_gemm &&\
	git checkout 8a9b438 &&\
	git submodule update --init --recursive &&\
	pip install .
# Instal apex of ROCm due to dependency of Megatron # Instal apex of ROCm due to dependency of Megatron
apex_rocm: apex_rocm:
$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)")) $(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment