Unverified Commit deef9a3d authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks - Add deepseek megatron-lm benchmark (#713)



**Description**
Add deepseek megatron-lm benchmark.

---------
Co-authored-by: yukirora <yuting.jiang@microsoft.com>
Co-authored-by: Hongtao Zhang <garyworkzht@gmail.com>
Co-authored-by: Hongtao Zhang <hongtaozhang@microsoft.com>
parent a56356d8
......@@ -5,6 +5,7 @@
import os
from pathlib import Path
import shlex
import statistics
from unittest import mock
import unittest
......@@ -15,6 +16,26 @@
from tests.helper.testcase import BenchmarkTestCase
def normalize_command(cmd):
    """Convert a CLI string into a sorted list of argument units.

    Each unit is either a ``--key value`` pair joined with a space, a lone
    ``--flag``, or a positional token (e.g. ``torchrun`` or a script path).
    Sorting makes two logically-equivalent commands comparable regardless of
    argument order.
    """
    parts = shlex.split(cmd)
    total = len(parts)
    units = []
    pos = 0
    while pos < total:
        token = parts[pos]
        pos += 1
        if not token.startswith('--'):
            # Positional argument: keep it as its own unit.
            units.append(token)
            continue
        # Option token: pair it with the following token unless that token
        # is itself another option (or the command ends here).
        if pos < total and not parts[pos].startswith('--'):
            units.append(token + ' ' + parts[pos])
            pos += 1
        else:
            units.append(token)  # flag-only option
    return sorted(units)
class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for IBBenchmark benchmark."""
@classmethod
......@@ -170,17 +191,20 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark = benchmark_cls(
self.benchmark_name,
parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document',
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \
--override_opt_param_scheduler',
)
mock_generate_dataset.return_value = True
benchmark._preprocess()
benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document'
--data-path {self._tmp_dir}/dataset_text_document \
--split 949,50,1'
script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
expected_command = 'torchrun {distributed_args} {script_path} \
expected_command_template = 'torchrun {distributed_args} {script_path} \
--tokenizer-type GPT2BPETokenizer \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
......@@ -199,7 +223,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-samples 20480 \
--lr 0.00012 \
--min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \
--eval-interval 10 \
--eval-iters 0 \
......@@ -217,54 +240,58 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--log-throughput {data_options}'
precision = Precision.FLOAT32
command = benchmark._megatron_command(precision)
self.assertEqual(
command,
expected_command.format(
expected_command = expected_command_template.format(
precision='',
data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args,
script_path=script_path
)
)
precision = Precision.FLOAT16
command = benchmark._megatron_command(precision)
self.assertEqual(
command,
expected_command.format(
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
precision = Precision.FLOAT16
expected_command = expected_command_template.format(
precision='--fp16',
data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args,
script_path=script_path
)
)
precision = Precision.BFLOAT16
command = benchmark._megatron_command(precision)
self.assertEqual(
command,
expected_command.format(
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
precision = Precision.BFLOAT16
expected_command = expected_command_template.format(
precision='--bf16',
data_options=benchmark._data_options,
distributed_args=benchmark._distributed_args,
script_path=script_path
)
)
command = benchmark._megatron_command(precision)
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
benchmark = benchmark_cls(
self.benchmark_name,
parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed',
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document \
--deepspeed --override_opt_param_scheduler',
)
mock_generate_dataset.return_value = True
benchmark._preprocess()
benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document'
--data-path {self._tmp_dir}/dataset_text_document \
--split 949,50,1'
command = benchmark._megatron_command(Precision.BFLOAT16)
expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
--tokenizer-type GPT2BPETokenizer \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size 1 \
......@@ -282,7 +309,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-samples 20480 \
--lr 0.00012 \
--min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \
--eval-interval 10 \
--eval-iters 0 \
......@@ -306,16 +332,174 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--train-tokens 300000000000 \
--data-impl mmap --no-pipeline-parallel'
self.assertEqual(
command,
expected_command.format(
expected_command = expected_command.format(
precision='--bf16',
data_options=benchmark._data_options,
script_path=script_path,
deepseed_options=expect_ds_options
deepseed_options=expect_ds_options,
script_path=script_path
)
command = benchmark._megatron_command(Precision.BFLOAT16)
actual_units = normalize_command(command)
expected_units = normalize_command(expected_command)
self.assertEqual(actual_units, expected_units)
def test_deepseek_v2_command(self):
    """Test that the megatron-deepseek-v2 benchmark builds the expected launch command.

    Builds the benchmark with a full DeepSeek-V2 parameter set, then compares
    the generated command against a reference command using order-insensitive
    argument units (via normalize_command).
    """
    # test deepspeed with megatron
    # Fake a single-rank launcher environment so _preprocess() can derive
    # distributed args without a real MPI/torchrun launch.
    os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
    os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
    os.environ['OMPI_COMM_WORLD_RANK'] = '0'
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12345'
    # Single-host hostfile keeps the launch local.
    with open(self.hostfile_path, 'w') as f:
        f.write('host1\n')
    benchmark_name = 'megatron-deepseek-v2'
    # NOTE(review): the registry lookup uses self.benchmark_name while the
    # local benchmark_name above is only passed to the constructor — confirm
    # this is intentional (both may resolve to the same benchmark class).
    (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.ROCM)
    assert (benchmark_cls)
    # Full DeepSeek-V2-Lite configuration; underscored CLI options here map
    # to the hyphenated Megatron options asserted in expected_command below.
    benchmark = benchmark_cls(
        benchmark_name,
        parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} '
        '--num_warmup 0 '
        '--num_steps 10 '
        '--batch_size 256 '
        '--expert_model_parallel_size 8 '
        '--micro_batch_size 2 '
        '--mock_data '
        '--model=deepseek '
        '--tokenizer_type=DeepSeekV2Tokenizer '
        '--transformer_impl=transformer_engine '
        '--num_layers=27 '
        '--hidden_size=1024 '
        '--seq_len=4096 '
        '--ffn_hidden_size=10944 '
        '--num_attn_heads=16 '
        '--moe_ffn_hidden_size=1408 '
        '--enable_shared_expert '
        '--moe_layer_freq=1 '
        '--num_shared_experts=2 '
        '--moe_router_topk=6 '
        '--moe_aux_loss_coeff=0.01 '
        '--moe_router_load_balancing_type=aux_loss '
        '--num_experts=64 '
        '--patch_tokenizer_type=DeepSeekV2Tokenizer '
        '--position_embedding_type=rope '
        '--no_rope_fusion '
        '--rotary_base=10000 '
        '--rotary_scaling_factor=40 '
        '--qk_nope_head_dim=128 '
        '--qk_rope_head_dim=64 '
        '--v_head_dim=128 '
        # NOTE(review): --ffn_hidden_size is repeated (same value as above);
        # harmless but worth deduplicating.
        '--ffn_hidden_size=10944 '
        '--swiglu '
        '--normalization=RMSNorm '
        '--norm_epsilon=1e-06 '
        '--no_bias_swiglu_fusion '
        '--disable_bias_linear '
        '--untie_embeddings_and_output_weights '
        '--extra_vocab_size=2400 '
        '--load=deepseek-ai/DeepSeek-V2-Lite '
        '--no_load_optim '
        '--no_load_rng '
        '--ckpt_format=torch '
        '--eod_mask_loss '
        '--train_mode=pretrain '
        '--data_cache_path=/root/cache '
        '--max_padding_length=4096 '
        '--kv_lora_rank=512 '
        '--dataloader_type=cyclic '
    )
    benchmark._preprocess()
    # Override data options with a deterministic mock-data configuration so
    # the expected command below is fully predictable.
    benchmark._data_options = '\
--mock-data \
--dataloader-type cyclic \
--data-cache-path /root/cache \
--dataset LLama-Pretrain-Idxmap'
    precision = Precision.BFLOAT16
    command = benchmark._megatron_command(precision)
    # Reference command; argument order is irrelevant because the comparison
    # goes through normalize_command.
    # NOTE(review): 'disitributed_args' is a misspelled placeholder name, but
    # it matches the .format() keyword below, so substitution still works.
    expected_command = (
        'torchrun {script_path} --bf16 \
--init-method-std 0.009 \
--adam-beta1 0.9 \
--hidden-dropout 0.0 \
--min-lr 1e-06 \
--lr 0.00012 \
--optimizer adam \
--log-interval 1 \
--eval-interval 10 \
--seed 1234 \
--eval-iters 0 \
--max-position-embeddings 4096 \
--hysteresis 2 \
--lr-decay-style cosine \
--lr-decay-samples 43945312 \
--clip-grad 1.0 \
--save-interval 10000 \
--adam-beta2 0.95 \
--moe-aux-loss-coeff 0.01 \
--log-throughput \
--num-workers 8 \
--use-distributed-optimizer \
--attention-dropout 0.0 \
--tensor-model-parallel-size 1 \
--lr-warmup-samples 0 \
--weight-decay 0.1 \
--train-samples 2560 \
--no-load-optim \
--load deepseek-ai/DeepSeek-V2-Lite \
--no-load-rng \
--ffn-hidden-size 10944 \
--patch-tokenizer-type DeepSeekV2Tokenizer \
--swiglu \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--no-bias-swiglu-fusion \
--no-rope-fusion \
--position-embedding-type rope \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--ckpt-format torch \
--rotary-base 10000 \
--rotary-scaling-factor 40 \
--eod-mask-loss \
--moe-ffn-hidden-size 1408 \
--enable-shared-expert \
--moe-layer-freq 1 \
--num-shared-experts 2 \
--moe-router-topk 6 \
--kv-lora-rank 512 \
--qk-nope-head-dim 128 \
--qk-rope-head-dim 64 \
--v-head-dim 128 \
--moe-router-load-balancing-type aux_loss \
--train-mode pretrain \
--extra-vocab-size 2400 \
--global-batch-size 256 \
--micro-batch-size 2 \
--num-layers 27 \
--hidden-size 1024 \
--seq-length 4096 \
--num-attention-heads 16 \
--tokenizer-type DeepSeekV2Tokenizer \
--transformer-impl transformer_engine \
--num-experts 64 \
--expert-model-parallel-size 8 \
--max-padding-length 4096 \
{data_options} \
{disitributed_args}'
    ).format(
        script_path=str(Path(self._tmp_dir) / 'pretrain_deepseek.py'),
        data_options=benchmark._data_options,
        disitributed_args=benchmark._distributed_args
    )
    # Compare as order-insensitive argument units rather than raw strings.
    actual_units = normalize_command(command)
    expected_units = normalize_command(expected_command)
    self.assertEqual(actual_units, expected_units)
@decorator.load_data('tests/data/megatron_deepspeed.log')
@mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
......
......@@ -16,13 +16,13 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
# Build targets.
all: cuda rocm
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
cpu: common cpu_perftest
common: fio cpu_stream
......@@ -230,6 +230,18 @@ megatron_deepspeed:
python -m pip install --no-cache-dir -r requirements.txt && \
python -m pip install DeepSpeed
# Build the ROCm fork of Megatron-LM and the grouped_gemm extension it needs,
# and stage the DeepSeek pretraining entry point where the benchmark runs it.
rocm_megatron_lm:
	cd Megatron && mkdir -p rocm && cd rocm && \
	if [ ! -d "Megatron-LM" ]; then \
		git clone -b rocm_dev https://github.com/ROCm/Megatron-LM.git ; \
	fi
	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
	if [ ! -d "grouped_gemm" ]; then \
		git clone https://github.com/caaatch22/grouped_gemm.git ; \
	fi
	cd grouped_gemm && \
	git checkout 8a9b438 && \
	git submodule update --init --recursive && \
	pip install .
# Install the ROCm build of apex, which Megatron depends on
apex_rocm:
$(eval TORCH_VERSION ?= $(shell python -c "import torch; print(torch.__version__)"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment