Bug Fix - Bug fix for latest megatron-lm benchmark (#600)

**Description** Bug fix to sync with the latest Megatron-LM code.

Bug Fix - Bug fix for latest megatron-lm benchmark (#600)
**Description** Bug fix to sync with the latest Megatron-LM code.
ce1860b9 · Yuting Jiang · GitHub · c2e7a543 · ce1860b9 · ce1860b9
Unverified Commit ce1860b9 authored Dec 28, 2023 by Yuting Jiang Committed by GitHub Dec 27, 2023
12 changed files
--- a/.gitmodules
+++ b/.gitmodules
@@ -24,3 +24,9 @@
 [submodule "third_party/msccl"]
 	path = third_party/msccl
 	url = https://github.com/Azure/msccl
+[submodule "third_party/Megatron/Megatron-LM"]
+	path = third_party/Megatron/Megatron-LM
+	url = https://github.com/NVIDIA/Megatron-LM.git
+[submodule "third_party/Megatron/Megatron-DeepSpeed"]
+	path = third_party/Megatron/Megatron-DeepSpeed
+	url = https://github.com/microsoft/Megatron-DeepSpeed.git
--- a/dockerfile/directx12.dockerfile
+++ b/dockerfile/directx12.dockerfile
@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%
 # Run the setup script to install the visual studio components
 RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
+RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;"
+RUN git config --system core.longpaths true
 # Install Superbench
 RUN python -m pip install setuptools==65.0.0 && \
    python -m pip install --no-cache-dir .[amdworker] && \

--- a/dockerfile/rocm5.7.x.dockerfile
+++ b/dockerfile/rocm5.7.x.dockerfile
@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
 # Install OpenMPI
 ENV OPENMPI_VERSION=4.1.x
+ENV MPI_HOME=/usr/local/mpi
 # Check if Open MPI is installed
 RUN cd /tmp && \
    git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION}  && \
@@ -145,9 +146,9 @@ RUN cd /opt/ &&  \
 RUN cd /opt/rocm/share/amd_smi && \
    python3 -m pip install --user .
-ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
+ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
    LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
-    LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \

--- a/dockerfile/rocm6.0.x.dockerfile
+++ b/dockerfile/rocm6.0.x.dockerfile
@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE}
 # Lib:
 #   - torch: 2.0.1
 #   - rccl: 2.18.3+hip6.0 develop:7e1cbb4
-#   - hipblaslt: 950ca43
+#   - hipblaslt: release/rocm-rel-6.0
 #   - openmpi: 4.1.x
 #   - apex: 1.0.0
 # Intel:
@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec
 # Install OpenMPI
 ENV OPENMPI_VERSION=4.1.x
+ENV MPI_HOME=/usr/local/mpi
 # Check if Open MPI is installed
 RUN cd /tmp && \
    git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION}  && \
@@ -147,9 +148,9 @@ RUN cd /opt/ &&  \
    .. && \
    make -j${NUM_MAKE_JOBS}
-ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
+ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
    LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
-    LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \
    SB_MICRO_PATH=/opt/superbench \
    ANSIBLE_DEPRECATION_WARNINGS=FALSE \

--- a/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
+++ b/superbench/benchmarks/model_benchmarks/megatron_gpt3.py
@@ -116,6 +116,9 @@ class MegatronGPT(ModelBenchmark):
        self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
        self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
        self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
+        self._parser.add_argument(
+            '--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
+        )
        self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
        self._parser.add_argument(
            '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
@@ -128,6 +131,13 @@ class MegatronGPT(ModelBenchmark):
    def _preprocess(self):
        if not super()._preprocess():
            return False
+        if not self._args.code_base:
+            if self._args.deepspeed:
+                self._args.code_base = os.path.join(
+                    os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
+                )
+            else:
+                self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
        if not os.path.exists(self._args.code_base) or \
                not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
@@ -156,35 +166,35 @@ class MegatronGPT(ModelBenchmark):
    def _parse_log(self, output):
        """Parse log output and get the performance."""
-        tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)')
+        tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
        elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
-        mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B')
+        mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
-        max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B')
+        max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
        lines = output.splitlines()
        tflops = []
        mem_allocated = []
        max_mem_allocated = []
        iteration_times = []
        for line in lines:
-            if 'TFLOPs' in line:
+            if 'elapsed time per iteration' in line:
                tflops_matches = tflops_pattern.search(line)
                elapsed_time_match = elapsed_time_pattern.search(line)
                if tflops_matches:
-                    tflops_values = float(tflops_matches.group(1))
+                    tflops_values = float(tflops_matches.group(2))
                    tflops.append(tflops_values)
                if elapsed_time_match:
                    elapsed_time_value = float(elapsed_time_match.group(1))
                    iteration_times.append(elapsed_time_value)
-            if 'MaxMemAllocated' in line:
+            if 'max allocated' in line:
                mem_allocated_match = mem_allocated_pattern.search(line)
                max_mem_allocated_match = max_mem_allocated_pattern.search(line)
                if mem_allocated_match:
-                    mem_allocated_value = float(mem_allocated_match.group(1))
+                    mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
                    mem_allocated.append(mem_allocated_value)
                if max_mem_allocated_match:
-                    max_mem_allocated_value = float(max_mem_allocated_match.group(1))
+                    max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
                    max_mem_allocated.append(max_mem_allocated_value)
        return iteration_times, tflops, mem_allocated, max_mem_allocated
@@ -224,7 +234,9 @@ class MegatronGPT(ModelBenchmark):
            --deepspeed \
            --deepspeed_config {self._config_json_path} \
            --zero-stage {self._args.zero_stage} \
-            --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}'
+            --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
+            --train-tokens {self._args.train_tokens} \
+            --data-impl {self._args.data_impl}'
        if self._args.pipeline_model_parallel_size <= 1:
            deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
@@ -255,11 +267,10 @@ class MegatronGPT(ModelBenchmark):
            --num-attention-heads {self._args.num_attn_heads} \
            --seq-length {self._args.seq_len} \
            --max-position-embeddings {self._args.seq_len} \
-            --train-tokens {self._args.train_tokens} \
            --train-samples {self._args.num_steps * self._args.batch_size} \
            --lr {self._args.lr} \
            --min-lr {self._args.min_lr} \
-            --split 949,50,1 \
+            --split {self._args.split} \
            --log-interval {self._args.log_interval} \
            --eval-interval {self._args.eval_interval} \
            --eval-iters {self._args.eval_iters} \
@@ -273,7 +284,8 @@ class MegatronGPT(ModelBenchmark):
            --optimizer adam \
            --use-distributed-optimizer \
            {precision_megatron} \
-            --seed {self._args.seed}'
+            --seed {self._args.seed} \
+            --log-throughput'
        if self._args.sequence_parallel:
            megatron_options = f'{megatron_options} --sequence-parallel'
@@ -298,6 +310,8 @@ class MegatronGPT(ModelBenchmark):
        script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
        if self._args.deepspeed:
            deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
+            # Megatron-DeepSpeed does not support --log-throughput (as of 2023-12-19)
+            megatron_options = megatron_options.replace('--log-throughput', '').strip()
            if self._num_nodes > 1:
                command = f'torchrun {self._distributed_args} ' + \
                    f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
@@ -379,6 +393,7 @@ class MegatronGPT(ModelBenchmark):
            return False
        self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
+        master_addr = 'localhost'
        if self._num_nodes > 1:
            if not self._args.hostfile:
                sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
@@ -395,12 +410,13 @@ class MegatronGPT(ModelBenchmark):
            if self._num_nodes != len(hosts):
                logger.error('MPI init failed since hostfile not match the MPI setting.')
                return False
+            master_addr = hosts[0].split()[0]
-            addr = os.getenv('MASTER_ADDR', hosts[0].split()[0])
+        addr = os.getenv('MASTER_ADDR', master_addr)
-            port = os.getenv('MASTER_PORT', '29500')
+        port = os.getenv('MASTER_PORT', '29500')
-            node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
+        node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
-            self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
+        self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
-                f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
+            f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
        return True
    def _generate_dataset(self):
@@ -448,8 +464,7 @@ class MegatronGPT(ModelBenchmark):
        self._data_options = f'\
            --vocab-file {self._vocab_path} \
            --merge-file {self._merges_path} \
-            --data-path {self._data_path} \
+            --data-path {self._data_path}'
-            --data-impl {self._args.data_impl}'
        logger.info('Dataset preparation successfully.')
        return True

--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -265,8 +265,8 @@ class ModelBenchmark(Benchmark):
        # The unit of step time should be millisecond.
        step_times = self._train_step(precision)
        if isinstance(step_times, tuple):
-            step_times = step_times[0]
            info = step_times[1]
+            step_times = step_times[0]
            self._process_info(ModelAction.TRAIN, precision, info)
        step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
        if not step_times:

--- a/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
+++ b/tests/benchmarks/model_benchmarks/test_megatron_gpt.py
@@ -177,8 +177,7 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
-            --data-path {self._tmp_dir}/dataset_text_document \
+            --data-path {self._tmp_dir}/dataset_text_document'
-            --data-impl mmap'
        script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
        expected_command = 'torchrun {distributed_args} {script_path} \
@@ -197,7 +196,6 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
            --num-attention-heads 32 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
-            --train-tokens 300000000000 \
            --train-samples 20480 \
            --lr 0.00012 \
            --min-lr 1e-06 \
@@ -215,7 +213,8 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
            --optimizer adam \
            --use-distributed-optimizer \
            {precision} \
-            --seed 1234 {data_options}'
+            --seed 1234 \
+            --log-throughput {data_options}'
        precision = Precision.FLOAT32
        command = benchmark._megatron_command(precision)
@@ -262,12 +261,10 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
        benchmark._data_options = f'\
            --vocab-file {self._tmp_dir}/gpt2-vocab.json \
            --merge-file {self._tmp_dir}/gpt2-merges.txt \
-            --data-path {self._tmp_dir}/dataset_text_document \
+            --data-path {self._tmp_dir}/dataset_text_document'
-            --data-impl mmap'
        command = benchmark._megatron_command(Precision.BFLOAT16)
-        expected_command = 'deepspeed {script_path} \
+        expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
-            --override-opt_param-scheduler \
            --adam-beta1 0.9 \
            --adam-beta2 0.95 \
            --tensor-model-parallel-size 1 \
@@ -282,7 +279,6 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
            --num-attention-heads 32 \
            --seq-length 2048 \
            --max-position-embeddings 2048 \
-            --train-tokens 300000000000 \
            --train-samples 20480 \
            --lr 0.00012 \
            --min-lr 1e-06 \
@@ -306,7 +302,9 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
            --deepspeed \
            --deepspeed_config {benchmark._config_json_path} \
            --zero-stage 1 \
-            --pipeline-model-parallel-size 1 --no-pipeline-parallel'
+            --pipeline-model-parallel-size 1 \
+            --train-tokens 300000000000 \
+            --data-impl mmap --no-pipeline-parallel'
        self.assertEqual(
            command,
@@ -346,12 +344,12 @@ class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
        iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
        assert (statistics.mean(iteration_times) == 75239.24)
        assert (statistics.mean(tflops) == 149.136)
-        assert (statistics.mean(mem_allocated) == 17.54)
+        assert (statistics.mean(mem_allocated) == 17.535637855529785)
-        assert (statistics.mean(max_mem_allocated) == 66.97)
+        assert (statistics.mean(max_mem_allocated) == 66.9744234085083)
        info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
        benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
        assert (benchmark.result is not None)
        assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
-        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
+        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
-        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
+        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -177,21 +177,17 @@ directx_amf_encoding_latency:
 		"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
 	)
-# Install Megatron-LM
+# Install requirements for Megatron-LM
 megatron_lm:
-	if [ ! -d "Megatron/Megatron-LM" ]; then \
-        git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
-    fi
 	cd Megatron && \
-	python -m pip install -r requirements.txt
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt
-# Install Megatron-DeepSpeed
+# Install requirements for Megatron-DeepSpeed
 megatron_deepspeed:
-	if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
-        git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
-    fi
 	cd Megatron && \
-	python -m pip install -r requirements.txt && \
+	apt install -y python3-mpi4py && \
+	python -m pip install --no-cache-dir -r requirements.txt && \
 	python -m pip install DeepSpeed
 # Install apex of ROCm due to dependency of Megatron

--- a/Megatron-DeepSpeed @ 71e8407c
+++ b/Megatron-DeepSpeed @ 71e8407c
+Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6
--- a/Megatron-LM @ 52b7a18a
+++ b/Megatron-LM @ 52b7a18a
+Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
--- a/third_party/Megatron/megatron_deepspeed_rocm6.patch
+++ b/third_party/Megatron/megatron_deepspeed_rocm6.patch
-diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
+diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
-index 76086de..1533648 100644
+index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
+--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
-+++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
 @@ -4,7 +4,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 -#ifndef __HIP_PLATFORM_HCC__
 +#ifndef __HIP_PLATFORM_AMD__
 #include <cuda_profiler_api.h>
 #endif
 #include <ATen/cuda/CUDAContext.h>
-diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
+diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
-index 90e1c9f..d217aec 100644
+index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
-+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
 @@ -4,7 +4,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 -#ifndef __HIP_PLATFORM_HCC__
 +#ifndef __HIP_PLATFORM_AMD__
 #include <cuda_profiler_api.h>
 #endif
 #include <ATen/cuda/CUDAContext.h>
-diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
-index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
-+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
-@@ -4,7 +4,7 @@
- #include <cuda.h>
- #include <cuda_runtime.h>
- #include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
-+#ifndef __HIP_PLATFORM_AMD__
- #include <cuda_profiler_api.h>
- #endif
- #include <ATen/cuda/CUDAContext.h>
--- a/third_party/Megatron/requirements.txt
+++ b/third_party/Megatron/requirements.txt
@@ -10,4 +10,6 @@ tqdm
 sentencepiece
 wandb
 einops
-typing_extensions==4.5.0
+typing_extensions==4.9.0
+apex
+mpi4py