Unverified Commit ce1860b9 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Bug Fix - Bug fix for latest megatron-lm benchmark (#600)

**Description**
Bug fix to sync latest megatron-lm code.
parent c2e7a543
......@@ -24,3 +24,9 @@
[submodule "third_party/msccl"]
path = third_party/msccl
url = https://github.com/Azure/msccl
[submodule "third_party/Megatron/Megatron-LM"]
path = third_party/Megatron/Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
[submodule "third_party/Megatron/Megatron-DeepSpeed"]
path = third_party/Megatron/Megatron-DeepSpeed
url = https://github.com/microsoft/Megatron-DeepSpeed.git
......@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%
# Run the setup script to install the visual studio components
RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;"
RUN git config --system core.longpaths true
# Install Superbench
RUN python -m pip install setuptools==65.0.0 && \
python -m pip install --no-cache-dir .[amdworker] && \
......
......@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Check if Open MPI is installed
RUN cd /tmp && \
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
......@@ -145,9 +146,9 @@ RUN cd /opt/ && \
RUN cd /opt/rocm/share/amd_smi && \
python3 -m pip install --user .
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
......
......@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE}
# Lib:
# - torch: 2.0.1
# - rccl: 2.18.3+hip6.0 develop:7e1cbb4
# - hipblaslt: 950ca43
# - hipblaslt: release/rocm-rel-6.0
# - openmpi: 4.1.x
# - apex: 1.0.0
# Intel:
......@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec
# Install OpenMPI
ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Check if Open MPI is installed
RUN cd /tmp && \
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
......@@ -147,9 +148,9 @@ RUN cd /opt/ && \
.. && \
make -j${NUM_MAKE_JOBS}
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
......
......@@ -116,6 +116,9 @@ def add_parser_arguments(self):
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
self._parser.add_argument(
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
)
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
......@@ -128,6 +131,13 @@ def add_parser_arguments(self):
def _preprocess(self):
if not super()._preprocess():
return False
if not self._args.code_base:
if self._args.deepspeed:
self._args.code_base = os.path.join(
os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
)
else:
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
if not os.path.exists(self._args.code_base) or \
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
......@@ -156,35 +166,35 @@ def _preprocess(self):
def _parse_log(self, output):
"""Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B')
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B')
mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
lines = output.splitlines()
tflops = []
mem_allocated = []
max_mem_allocated = []
iteration_times = []
for line in lines:
if 'TFLOPs' in line:
if 'elapsed time per iteration' in line:
tflops_matches = tflops_pattern.search(line)
elapsed_time_match = elapsed_time_pattern.search(line)
if tflops_matches:
tflops_values = float(tflops_matches.group(1))
tflops_values = float(tflops_matches.group(2))
tflops.append(tflops_values)
if elapsed_time_match:
elapsed_time_value = float(elapsed_time_match.group(1))
iteration_times.append(elapsed_time_value)
if 'MaxMemAllocated' in line:
if 'max allocated' in line:
mem_allocated_match = mem_allocated_pattern.search(line)
max_mem_allocated_match = max_mem_allocated_pattern.search(line)
if mem_allocated_match:
mem_allocated_value = float(mem_allocated_match.group(1))
mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
mem_allocated.append(mem_allocated_value)
if max_mem_allocated_match:
max_mem_allocated_value = float(max_mem_allocated_match.group(1))
max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
max_mem_allocated.append(max_mem_allocated_value)
return iteration_times, tflops, mem_allocated, max_mem_allocated
......@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
--deepspeed \
--deepspeed_config {self._config_json_path} \
--zero-stage {self._args.zero_stage} \
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
--train-tokens {self._args.train_tokens} \
--data-impl {self._args.data_impl}'
if self._args.pipeline_model_parallel_size <= 1:
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
......@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
--num-attention-heads {self._args.num_attn_heads} \
--seq-length {self._args.seq_len} \
--max-position-embeddings {self._args.seq_len} \
--train-tokens {self._args.train_tokens} \
--train-samples {self._args.num_steps * self._args.batch_size} \
--lr {self._args.lr} \
--min-lr {self._args.min_lr} \
--split 949,50,1 \
--split {self._args.split} \
--log-interval {self._args.log_interval} \
--eval-interval {self._args.eval_interval} \
--eval-iters {self._args.eval_iters} \
......@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
--optimizer adam \
--use-distributed-optimizer \
{precision_megatron} \
--seed {self._args.seed}'
--seed {self._args.seed} \
--log-throughput'
if self._args.sequence_parallel:
megatron_options = f'{megatron_options} --sequence-parallel'
......@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
if self._args.deepspeed:
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
# No --log-throughput in Megatron-DeepSpeed by 20231219
megatron_options = megatron_options.replace('--log-throughput', '').strip()
if self._num_nodes > 1:
command = f'torchrun {self._distributed_args} ' + \
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
......@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
return False
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
master_addr = 'localhost'
if self._num_nodes > 1:
if not self._args.hostfile:
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
......@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
if self._num_nodes != len(hosts):
logger.error('MPI init failed since hostfile not match the MPI setting.')
return False
master_addr = hosts[0].split()[0]
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0])
port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
addr = os.getenv('MASTER_ADDR', master_addr)
port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
return True
def _generate_dataset(self):
......@@ -448,8 +464,7 @@ def _generate_dataset(self):
self._data_options = f'\
--vocab-file {self._vocab_path} \
--merge-file {self._merges_path} \
--data-path {self._data_path} \
--data-impl {self._args.data_impl}'
--data-path {self._data_path}'
logger.info('Dataset preparation successfully.')
return True
......
......@@ -265,8 +265,8 @@ def __train(self, precision):
# The unit of step time should be millisecond.
step_times = self._train_step(precision)
if isinstance(step_times, tuple):
step_times = step_times[0]
info = step_times[1]
step_times = step_times[0]
self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times:
......
......@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \
--data-impl mmap'
--data-path {self._tmp_dir}/dataset_text_document'
script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
expected_command = 'torchrun {distributed_args} {script_path} \
......@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-tokens 300000000000 \
--train-samples 20480 \
--lr 0.00012 \
--min-lr 1e-06 \
......@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--optimizer adam \
--use-distributed-optimizer \
{precision} \
--seed 1234 {data_options}'
--seed 1234 \
--log-throughput {data_options}'
precision = Precision.FLOAT32
command = benchmark._megatron_command(precision)
......@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \
--data-impl mmap'
--data-path {self._tmp_dir}/dataset_text_document'
command = benchmark._megatron_command(Precision.BFLOAT16)
expected_command = 'deepspeed {script_path} \
--override-opt_param-scheduler \
expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size 1 \
......@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-tokens 300000000000 \
--train-samples 20480 \
--lr 0.00012 \
--min-lr 1e-06 \
......@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--deepspeed \
--deepspeed_config {benchmark._config_json_path} \
--zero-stage 1 \
--pipeline-model-parallel-size 1 --no-pipeline-parallel'
--pipeline-model-parallel-size 1 \
--train-tokens 300000000000 \
--data-impl mmap --no-pipeline-parallel'
self.assertEqual(
command,
......@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
assert (statistics.mean(iteration_times) == 75239.24)
assert (statistics.mean(tflops) == 149.136)
assert (statistics.mean(mem_allocated) == 17.54)
assert (statistics.mean(max_mem_allocated) == 66.97)
assert (statistics.mean(mem_allocated) == 17.535637855529785)
assert (statistics.mean(max_mem_allocated) == 66.9744234085083)
info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
assert (benchmark.result is not None)
assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
......@@ -177,21 +177,17 @@ directx_amf_encoding_latency:
"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
)
# Install Megatron-LM
# Install requirements for Megatron-LM
megatron_lm:
if [ ! -d "Megatron/Megatron-LM" ]; then \
git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
fi
cd Megatron && \
python -m pip install -r requirements.txt
apt install -y python3-mpi4py && \
python -m pip install --no-cache-dir -r requirements.txt
# Install Megatron-DeepSpeed
# Install requirements for Megatron-DeepSpeed
megatron_deepspeed:
if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
fi
cd Megatron && \
python -m pip install -r requirements.txt && \
apt install -y python3-mpi4py && \
python -m pip install --no-cache-dir -r requirements.txt && \
python -m pip install DeepSpeed
# Instal apex of ROCm due to dependency of Megatron
......
Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6
Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
index 76086de..1533648 100644
--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
......@@ -10,4 +10,6 @@ tqdm
sentencepiece
wandb
einops
typing_extensions==4.5.0
typing_extensions==4.9.0
apex
mpi4py
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment