Unverified Commit ce1860b9 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Bug Fix - Bug fix for latest megatron-lm benchmark (#600)

**Description**
Bug fix to sync with the latest Megatron-LM code.
parent c2e7a543
...@@ -24,3 +24,9 @@ ...@@ -24,3 +24,9 @@
[submodule "third_party/msccl"] [submodule "third_party/msccl"]
path = third_party/msccl path = third_party/msccl
url = https://github.com/Azure/msccl url = https://github.com/Azure/msccl
[submodule "third_party/Megatron/Megatron-LM"]
path = third_party/Megatron/Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
[submodule "third_party/Megatron/Megatron-DeepSpeed"]
path = third_party/Megatron/Megatron-DeepSpeed
url = https://github.com/microsoft/Megatron-DeepSpeed.git
...@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "% ...@@ -54,6 +54,8 @@ RUN curl -s -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe -o "%
# Run the setup script to install the visual studio components # Run the setup script to install the visual studio components
RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat" RUN "%SB_HOME%\\dockerfile\\directx\\install-components.bat"
RUN powershell -Command "Set-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem -Name LongPathsEnabled -Value 1;"
RUN git config --system core.longpaths true
# Install Superbench # Install Superbench
RUN python -m pip install setuptools==65.0.0 && \ RUN python -m pip install setuptools==65.0.0 && \
python -m pip install --no-cache-dir .[amdworker] && \ python -m pip install --no-cache-dir .[amdworker] && \
......
...@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10 ...@@ -109,6 +109,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942\ngfx10
# Install OpenMPI # Install OpenMPI
ENV OPENMPI_VERSION=4.1.x ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Check if Open MPI is installed # Check if Open MPI is installed
RUN cd /tmp && \ RUN cd /tmp && \
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
...@@ -145,9 +146,9 @@ RUN cd /opt/ && \ ...@@ -145,9 +146,9 @@ RUN cd /opt/ && \
RUN cd /opt/rocm/share/amd_smi && \ RUN cd /opt/rocm/share/amd_smi && \
python3 -m pip install --user . python3 -m pip install --user .
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \ SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \
......
...@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE} ...@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE}
# Lib: # Lib:
# - torch: 2.0.1 # - torch: 2.0.1
# - rccl: 2.18.3+hip6.0 develop:7e1cbb4 # - rccl: 2.18.3+hip6.0 develop:7e1cbb4
# - hipblaslt: 950ca43 # - hipblaslt: release/rocm-rel-6.0
# - openmpi: 4.1.x # - openmpi: 4.1.x
# - apex: 1.0.0 # - apex: 1.0.0
# Intel: # Intel:
...@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec ...@@ -115,6 +115,7 @@ RUN bash -c 'echo -e "gfx90a:xnack-\ngfx90a:xnac+\ngfx940\ngfx941\ngfx942:sramec
# Install OpenMPI # Install OpenMPI
ENV OPENMPI_VERSION=4.1.x ENV OPENMPI_VERSION=4.1.x
ENV MPI_HOME=/usr/local/mpi
# Check if Open MPI is installed # Check if Open MPI is installed
RUN cd /tmp && \ RUN cd /tmp && \
git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \ git clone --recursive https://github.com/open-mpi/ompi.git -b v${OPENMPI_VERSION} && \
...@@ -147,9 +148,9 @@ RUN cd /opt/ && \ ...@@ -147,9 +148,9 @@ RUN cd /opt/ && \
.. && \ .. && \
make -j${NUM_MAKE_JOBS} make -j${NUM_MAKE_JOBS}
ENV PATH="/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \ ENV PATH="/usr/local/mpi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \ LD_PRELOAD="/opt/rccl/build/librccl.so:$LD_PRELOAD" \
LD_LIBRARY_PATH="/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \ LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \ SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \
......
...@@ -116,6 +116,9 @@ def add_parser_arguments(self): ...@@ -116,6 +116,9 @@ def add_parser_arguments(self):
self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.') self._parser.add_argument('--data_home', type=str, default='/tmp', help='Data home.')
self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.') self._parser.add_argument('--vocab_path', type=str, default='/tmp/gpt2-vocab.json', help='Vocab path.')
self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.') self._parser.add_argument('--merge_path', type=str, default='/tmp/gpt2-merges.txt', help='Merge path.')
self._parser.add_argument(
'--split', type=str, default='949,50,1', help='Split dataset ratio for train/val/test.'
)
self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.') self._parser.add_argument('--prescale_grad', action='store_true', help='Prescale grad.')
self._parser.add_argument( self._parser.add_argument(
'--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.' '--hostfile', type=str, default=None, help='Hostfile to run the mutli-node benchmark.'
...@@ -128,6 +131,13 @@ def add_parser_arguments(self): ...@@ -128,6 +131,13 @@ def add_parser_arguments(self):
def _preprocess(self): def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
if not self._args.code_base:
if self._args.deepspeed:
self._args.code_base = os.path.join(
os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-DeepSpeed/'
)
else:
self._args.code_base = os.path.join(os.getenv('SB_MICRO_PATH'), 'third_party/Megatron/Megatron-LM')
if not os.path.exists(self._args.code_base) or \ if not os.path.exists(self._args.code_base) or \
not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')): not os.path.exists(os.path.join(self._args.code_base, 'pretrain_gpt.py')):
...@@ -156,35 +166,35 @@ def _preprocess(self): ...@@ -156,35 +166,35 @@ def _preprocess(self):
def _parse_log(self, output): def _parse_log(self, output):
"""Parse log output and get the performance.""" """Parse log output and get the performance."""
tflops_pattern = re.compile(r'TFLOPs: (\d+\.\d+)') tflops_pattern = re.compile(r'(TFLOPs|TFLOP/s/GPU\)): (\d+\.\d+)')
elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)') elapsed_time_pattern = re.compile(r'elapsed time per iteration \(ms\): (\d+\.\d+)')
mem_allocated_pattern = re.compile(r'MemAllocated=([\d.]+)[KMGTPEZY]?B') mem_allocated_pattern = re.compile(r'allocated: (\d+\.\d+)')
max_mem_allocated_pattern = re.compile(r'MaxMemAllocated=([\d.]+)[KMGTPEZY]?B') max_mem_allocated_pattern = re.compile(r'max allocated: (\d+\.\d+)')
lines = output.splitlines() lines = output.splitlines()
tflops = [] tflops = []
mem_allocated = [] mem_allocated = []
max_mem_allocated = [] max_mem_allocated = []
iteration_times = [] iteration_times = []
for line in lines: for line in lines:
if 'TFLOPs' in line: if 'elapsed time per iteration' in line:
tflops_matches = tflops_pattern.search(line) tflops_matches = tflops_pattern.search(line)
elapsed_time_match = elapsed_time_pattern.search(line) elapsed_time_match = elapsed_time_pattern.search(line)
if tflops_matches: if tflops_matches:
tflops_values = float(tflops_matches.group(1)) tflops_values = float(tflops_matches.group(2))
tflops.append(tflops_values) tflops.append(tflops_values)
if elapsed_time_match: if elapsed_time_match:
elapsed_time_value = float(elapsed_time_match.group(1)) elapsed_time_value = float(elapsed_time_match.group(1))
iteration_times.append(elapsed_time_value) iteration_times.append(elapsed_time_value)
if 'MaxMemAllocated' in line: if 'max allocated' in line:
mem_allocated_match = mem_allocated_pattern.search(line) mem_allocated_match = mem_allocated_pattern.search(line)
max_mem_allocated_match = max_mem_allocated_pattern.search(line) max_mem_allocated_match = max_mem_allocated_pattern.search(line)
if mem_allocated_match: if mem_allocated_match:
mem_allocated_value = float(mem_allocated_match.group(1)) mem_allocated_value = float(mem_allocated_match.group(1)) / 1024
mem_allocated.append(mem_allocated_value) mem_allocated.append(mem_allocated_value)
if max_mem_allocated_match: if max_mem_allocated_match:
max_mem_allocated_value = float(max_mem_allocated_match.group(1)) max_mem_allocated_value = float(max_mem_allocated_match.group(1)) / 1024
max_mem_allocated.append(max_mem_allocated_value) max_mem_allocated.append(max_mem_allocated_value)
return iteration_times, tflops, mem_allocated, max_mem_allocated return iteration_times, tflops, mem_allocated, max_mem_allocated
...@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron): ...@@ -224,7 +234,9 @@ def __prepare_deespeed_config(self, precision_megatron):
--deepspeed \ --deepspeed \
--deepspeed_config {self._config_json_path} \ --deepspeed_config {self._config_json_path} \
--zero-stage {self._args.zero_stage} \ --zero-stage {self._args.zero_stage} \
--pipeline-model-parallel-size {self._args.pipeline_model_parallel_size}' --pipeline-model-parallel-size {self._args.pipeline_model_parallel_size} \
--train-tokens {self._args.train_tokens} \
--data-impl {self._args.data_impl}'
if self._args.pipeline_model_parallel_size <= 1: if self._args.pipeline_model_parallel_size <= 1:
deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel' deepspeed_options = f'{deepspeed_options} --no-pipeline-parallel'
...@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -255,11 +267,10 @@ def _megatron_command(self, precision): # noqa: C901
--num-attention-heads {self._args.num_attn_heads} \ --num-attention-heads {self._args.num_attn_heads} \
--seq-length {self._args.seq_len} \ --seq-length {self._args.seq_len} \
--max-position-embeddings {self._args.seq_len} \ --max-position-embeddings {self._args.seq_len} \
--train-tokens {self._args.train_tokens} \
--train-samples {self._args.num_steps * self._args.batch_size} \ --train-samples {self._args.num_steps * self._args.batch_size} \
--lr {self._args.lr} \ --lr {self._args.lr} \
--min-lr {self._args.min_lr} \ --min-lr {self._args.min_lr} \
--split 949,50,1 \ --split {self._args.split} \
--log-interval {self._args.log_interval} \ --log-interval {self._args.log_interval} \
--eval-interval {self._args.eval_interval} \ --eval-interval {self._args.eval_interval} \
--eval-iters {self._args.eval_iters} \ --eval-iters {self._args.eval_iters} \
...@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -273,7 +284,8 @@ def _megatron_command(self, precision): # noqa: C901
--optimizer adam \ --optimizer adam \
--use-distributed-optimizer \ --use-distributed-optimizer \
{precision_megatron} \ {precision_megatron} \
--seed {self._args.seed}' --seed {self._args.seed} \
--log-throughput'
if self._args.sequence_parallel: if self._args.sequence_parallel:
megatron_options = f'{megatron_options} --sequence-parallel' megatron_options = f'{megatron_options} --sequence-parallel'
...@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901 ...@@ -298,6 +310,8 @@ def _megatron_command(self, precision): # noqa: C901
script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py') script_path = os.path.join(self._args.code_base, 'pretrain_gpt.py')
if self._args.deepspeed: if self._args.deepspeed:
deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--')) deepspeed_option = self.__prepare_deespeed_config(precision_megatron.lstrip('--'))
# Megatron-DeepSpeed has no --log-throughput option as of 2023-12-19
megatron_options = megatron_options.replace('--log-throughput', '').strip()
if self._num_nodes > 1: if self._num_nodes > 1:
command = f'torchrun {self._distributed_args} ' + \ command = f'torchrun {self._distributed_args} ' + \
f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}' f'{script_path} {megatron_options} {self._data_options} {deepspeed_option}'
...@@ -379,6 +393,7 @@ def _init_distributed_setting(self): ...@@ -379,6 +393,7 @@ def _init_distributed_setting(self):
return False return False
self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE')) self._num_nodes = int(os.getenv('OMPI_COMM_WORLD_SIZE')) // int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
master_addr = 'localhost'
if self._num_nodes > 1: if self._num_nodes > 1:
if not self._args.hostfile: if not self._args.hostfile:
sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile') sb_hostfile = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'hostfile')
...@@ -395,12 +410,13 @@ def _init_distributed_setting(self): ...@@ -395,12 +410,13 @@ def _init_distributed_setting(self):
if self._num_nodes != len(hosts): if self._num_nodes != len(hosts):
logger.error('MPI init failed since hostfile not match the MPI setting.') logger.error('MPI init failed since hostfile not match the MPI setting.')
return False return False
master_addr = hosts[0].split()[0]
addr = os.getenv('MASTER_ADDR', hosts[0].split()[0]) addr = os.getenv('MASTER_ADDR', master_addr)
port = os.getenv('MASTER_PORT', '29500') port = os.getenv('MASTER_PORT', '29500')
node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE']) node_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) // int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \ self._distributed_args = f'--nproc_per_node {self._args.num_gpus} --nnodes {self._num_nodes} ' + \
f'--node_rank {node_rank} --master_addr {addr} --master_port {port}' f'--node_rank {node_rank} --master_addr {addr} --master_port {port}'
return True return True
def _generate_dataset(self): def _generate_dataset(self):
...@@ -448,8 +464,7 @@ def _generate_dataset(self): ...@@ -448,8 +464,7 @@ def _generate_dataset(self):
self._data_options = f'\ self._data_options = f'\
--vocab-file {self._vocab_path} \ --vocab-file {self._vocab_path} \
--merge-file {self._merges_path} \ --merge-file {self._merges_path} \
--data-path {self._data_path} \ --data-path {self._data_path}'
--data-impl {self._args.data_impl}'
logger.info('Dataset preparation successfully.') logger.info('Dataset preparation successfully.')
return True return True
......
...@@ -265,8 +265,8 @@ def __train(self, precision): ...@@ -265,8 +265,8 @@ def __train(self, precision):
# The unit of step time should be millisecond. # The unit of step time should be millisecond.
step_times = self._train_step(precision) step_times = self._train_step(precision)
if isinstance(step_times, tuple): if isinstance(step_times, tuple):
step_times = step_times[0]
info = step_times[1] info = step_times[1]
step_times = step_times[0]
self._process_info(ModelAction.TRAIN, precision, info) self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times: if not step_times:
......
...@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -177,8 +177,7 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark._data_options = f'\ benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \ --vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \ --merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \ --data-path {self._tmp_dir}/dataset_text_document'
--data-impl mmap'
script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py') script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
expected_command = 'torchrun {distributed_args} {script_path} \ expected_command = 'torchrun {distributed_args} {script_path} \
...@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -197,7 +196,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--num-attention-heads 32 \ --num-attention-heads 32 \
--seq-length 2048 \ --seq-length 2048 \
--max-position-embeddings 2048 \ --max-position-embeddings 2048 \
--train-tokens 300000000000 \
--train-samples 20480 \ --train-samples 20480 \
--lr 0.00012 \ --lr 0.00012 \
--min-lr 1e-06 \ --min-lr 1e-06 \
...@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -215,7 +213,8 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--optimizer adam \ --optimizer adam \
--use-distributed-optimizer \ --use-distributed-optimizer \
{precision} \ {precision} \
--seed 1234 {data_options}' --seed 1234 \
--log-throughput {data_options}'
precision = Precision.FLOAT32 precision = Precision.FLOAT32
command = benchmark._megatron_command(precision) command = benchmark._megatron_command(precision)
...@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -262,12 +261,10 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
benchmark._data_options = f'\ benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \ --vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \ --merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \ --data-path {self._tmp_dir}/dataset_text_document'
--data-impl mmap'
command = benchmark._megatron_command(Precision.BFLOAT16) command = benchmark._megatron_command(Precision.BFLOAT16)
expected_command = 'deepspeed {script_path} \ expected_command = 'deepspeed {script_path} --override-opt_param-scheduler \
--override-opt_param-scheduler \
--adam-beta1 0.9 \ --adam-beta1 0.9 \
--adam-beta2 0.95 \ --adam-beta2 0.95 \
--tensor-model-parallel-size 1 \ --tensor-model-parallel-size 1 \
...@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -282,7 +279,6 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--num-attention-heads 32 \ --num-attention-heads 32 \
--seq-length 2048 \ --seq-length 2048 \
--max-position-embeddings 2048 \ --max-position-embeddings 2048 \
--train-tokens 300000000000 \
--train-samples 20480 \ --train-samples 20480 \
--lr 0.00012 \ --lr 0.00012 \
--min-lr 1e-06 \ --min-lr 1e-06 \
...@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset): ...@@ -306,7 +302,9 @@ def test_megatron_gpt_command(self, mock_generate_dataset):
--deepspeed \ --deepspeed \
--deepspeed_config {benchmark._config_json_path} \ --deepspeed_config {benchmark._config_json_path} \
--zero-stage 1 \ --zero-stage 1 \
--pipeline-model-parallel-size 1 --no-pipeline-parallel' --pipeline-model-parallel-size 1 \
--train-tokens 300000000000 \
--data-impl mmap --no-pipeline-parallel'
self.assertEqual( self.assertEqual(
command, command,
...@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset): ...@@ -346,12 +344,12 @@ def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output) iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
assert (statistics.mean(iteration_times) == 75239.24) assert (statistics.mean(iteration_times) == 75239.24)
assert (statistics.mean(tflops) == 149.136) assert (statistics.mean(tflops) == 149.136)
assert (statistics.mean(mem_allocated) == 17.54) assert (statistics.mean(mem_allocated) == 17.535637855529785)
assert (statistics.mean(max_mem_allocated) == 66.97) assert (statistics.mean(max_mem_allocated) == 66.9744234085083)
info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated} info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info) benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
assert (benchmark.result is not None) assert (benchmark.result is not None)
assert (benchmark.result['fp16_train_tflops'][0] == 149.136) assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54) assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.535637855529785)
assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97) assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.9744234085083)
...@@ -177,21 +177,17 @@ directx_amf_encoding_latency: ...@@ -177,21 +177,17 @@ directx_amf_encoding_latency:
"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \ "C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
) )
# Install Megatron-LM # Install requirements for Megatron-LM
megatron_lm: megatron_lm:
if [ ! -d "Megatron/Megatron-LM" ]; then \
git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"; \
fi
cd Megatron && \ cd Megatron && \
python -m pip install -r requirements.txt apt install -y python3-mpi4py && \
python -m pip install --no-cache-dir -r requirements.txt
# Install Megatron-DeepSpeed # Install requirements for Megatron-DeepSpeed
megatron_deepspeed: megatron_deepspeed:
if [ ! -d "Megatron/Megatron-DeepSpeed" ]; then \
git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"; \
fi
cd Megatron && \ cd Megatron && \
python -m pip install -r requirements.txt && \ apt install -y python3-mpi4py && \
python -m pip install --no-cache-dir -r requirements.txt && \
python -m pip install DeepSpeed python -m pip install DeepSpeed
# Install apex for ROCm, which Megatron depends on
......
Subproject commit 71e8407c98bacacb002823ea587c321fe58b28a6
Subproject commit 52b7a18a00bced8b3670eededfd58ee0c4bd7d06
diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu
index 76086de..1533648 100644 index 90e1c9f..d217aec 100644
--- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu --- a/megatron/fused_kernels/scaled_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_softmax_cuda.cu
@@ -4,7 +4,7 @@ @@ -4,7 +4,7 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__ -#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__ +#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#endif #endif
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_softmax_cuda.cu b/megatron/fused_kernels/scaled_softmax_cuda.cu diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 90e1c9f..d217aec 100644 index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_softmax_cuda.cu --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@ @@ -4,7 +4,7 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__ -#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__ +#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#endif #endif
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
index 74c9f3d..03b5fc8 100644
--- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
+++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu
@@ -4,7 +4,7 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
-#ifndef __HIP_PLATFORM_HCC__
+#ifndef __HIP_PLATFORM_AMD__
#include <cuda_profiler_api.h>
#endif
#include <ATen/cuda/CUDAContext.h>
...@@ -10,4 +10,6 @@ tqdm ...@@ -10,4 +10,6 @@ tqdm
sentencepiece sentencepiece
wandb wandb
einops einops
typing_extensions==4.5.0 typing_extensions==4.9.0
apex
mpi4py
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment