Unverified Commit dd5a6329 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Add benchmark: Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark (#582)

**Description**
Megatron-LM/Megatron-Deepspeed GPT pretrain benchmark
parent 254ea7fe
...@@ -41,6 +41,7 @@ RUN apt-get update && \ ...@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \ libtinfo5 \
libtool \ libtool \
lshw \ lshw \
python3-mpi4py \
net-tools \ net-tools \
openssh-client \ openssh-client \
openssh-server \ openssh-server \
......
...@@ -41,6 +41,7 @@ RUN apt-get update && \ ...@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \ libtinfo5 \
libtool \ libtool \
lshw \ lshw \
python3-mpi4py \
net-tools \ net-tools \
openssh-client \ openssh-client \
openssh-server \ openssh-server \
......
...@@ -41,6 +41,7 @@ RUN apt-get update && \ ...@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \ libtinfo5 \
libtool \ libtool \
lshw \ lshw \
python3-mpi4py \
net-tools \ net-tools \
numactl \ numactl \
openssh-client \ openssh-client \
...@@ -136,7 +137,7 @@ RUN echo PATH="$PATH" > /etc/environment && \ ...@@ -136,7 +137,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME} WORKDIR ${SB_HOME}
ADD third_party third_party ADD third_party third_party
RUN make -C third_party rocm -o rocm_hipblaslt RUN make -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm
ADD . . ADD . .
RUN python3 -m pip install --upgrade setuptools==65.7 && \ RUN python3 -m pip install --upgrade setuptools==65.7 && \
......
...@@ -41,6 +41,7 @@ RUN apt-get update && \ ...@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \ libtinfo5 \
libtool \ libtool \
lshw \ lshw \
python3-mpi4py \
net-tools \ net-tools \
numactl \ numactl \
openssh-client \ openssh-client \
...@@ -141,7 +142,7 @@ RUN echo PATH="$PATH" > /etc/environment && \ ...@@ -141,7 +142,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME} WORKDIR ${SB_HOME}
ADD third_party third_party ADD third_party third_party
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm
ADD . . ADD . .
RUN python3 -m pip install --no-cache-dir .[amdworker] && \ RUN python3 -m pip install --no-cache-dir .[amdworker] && \
......
...@@ -37,8 +37,29 @@ For inference, supported percentiles include ...@@ -37,8 +37,29 @@ For inference, supported percentiles include
| Name | Unit | Description | | Name | Unit | Description |
|-----------------------------------------------------------------------------------------|------------------------|------------------------------------------------------------------------------| |-----------------------------------------------------------------------------------------|------------------------|------------------------------------------------------------------------------|
| model-benchmarks/pytorch-${model_name}/${precision}_train_step_time | time (ms) | The average training step time with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_train_step_time | time (ms) | The average training step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision per GPU. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time | time (ms) | The average inference step time with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time | time (ms) | The average inference step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput | throughput (samples/s) | The average inference throughput with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput | throughput (samples/s) | The average inference throughput with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time\_${percentile} | time (ms) | The n<sup>th</sup> percentile inference step time with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time\_${percentile} | time (ms) | The n<sup>th</sup> percentile inference step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput\_${percentile} | throughput (samples/s) | The n<sup>th</sup> percentile inference throughput with fp32/fp16 precision. | | model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput\_${percentile} | throughput (samples/s) | The n<sup>th</sup> percentile inference throughput with fp32/fp16 precision. |
## Megatron Model benchmarks
### `megatron-gpt`
#### Introduction
Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) or [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed).
`tips: batch_size in this benchmark represents the global batch size; the batch size on each GPU instance is micro_batch_size.`
#### Metrics
| Name | Unit | Description |
|---------------------------------------------------|------------------------|---------------------------------------------------------|
| megatron-gpt/${precision}_train_step_time | time (ms) | The average training step time per iteration. |
| megatron-gpt/${precision}_train_throughput | throughput (samples/s) | The average training throughput per iteration. |
| megatron-gpt/${precision}_train_tflops | tflops/s | The average training tflops per second per iteration. |
| megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. |
| megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
...@@ -177,6 +177,7 @@ def run(self): ...@@ -177,6 +177,7 @@ def run(self):
'xlrd>=2.0.1', 'xlrd>=2.0.1',
'xlsxwriter>=1.3.8', 'xlsxwriter>=1.3.8',
'xmltodict>=0.12.0', 'xmltodict>=0.12.0',
'types-requests',
], ],
extras_require=( extras_require=(
lambda x: { lambda x: {
......
...@@ -8,5 +8,6 @@ ...@@ -8,5 +8,6 @@
from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import PytorchGPT2 from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import PytorchGPT2
from superbench.benchmarks.model_benchmarks.pytorch_cnn import PytorchCNN from superbench.benchmarks.model_benchmarks.pytorch_cnn import PytorchCNN
from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM
from superbench.benchmarks.model_benchmarks.megatron_gpt3 import MegatronGPT
__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM'] __all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT']
This diff is collapsed.
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
import time import time
import statistics import statistics
from abc import abstractmethod from abc import abstractmethod
from typing import Union
from superbench.common.utils import logger, stdout_logger from superbench.common.utils import logger, stdout_logger
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
...@@ -263,6 +264,10 @@ def __train(self, precision): ...@@ -263,6 +264,10 @@ def __train(self, precision):
# The unit of step time should be millisecond. # The unit of step time should be millisecond.
step_times = self._train_step(precision) step_times = self._train_step(precision)
if isinstance(step_times, tuple):
    # _train_step may return a (step_times, info) tuple. Read `info` BEFORE
    # rebinding `step_times`; otherwise `step_times[1]` would index into the
    # step-time sequence itself and hand a single step time to _process_info.
    info = step_times[1]
    step_times = step_times[0]
    self._process_info(ModelAction.TRAIN, precision, info)
step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times) step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
if not step_times: if not step_times:
self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT) self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
...@@ -302,7 +307,7 @@ def __inference(self, precision): ...@@ -302,7 +307,7 @@ def __inference(self, precision):
return True return True
@abstractmethod @abstractmethod
def _train_step(self, precision): def _train_step(self, precision) -> Union[list, tuple]:
"""Define the training process. """Define the training process.
Args: Args:
...@@ -418,6 +423,7 @@ def __process_model_result(self, model_action, precision, step_times): ...@@ -418,6 +423,7 @@ def __process_model_result(self, model_action, precision, step_times):
precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'} precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'float64': 'fp64', 'bfloat16': 'bf16'}
if precision.value in precision_metric.keys(): if precision.value in precision_metric.keys():
precision = precision_metric[precision.value] precision = precision_metric[precision.value]
metric_s = '{}_{}_step_time'.format(precision, model_action) metric_s = '{}_{}_step_time'.format(precision, model_action)
metric_t = '{}_{}_throughput'.format(precision, model_action) metric_t = '{}_{}_throughput'.format(precision, model_action)
# The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec. # The unit of step time is millisecond, use it to calculate the throughput with the unit samples/sec.
...@@ -428,7 +434,7 @@ def __process_model_result(self, model_action, precision, step_times): ...@@ -428,7 +434,7 @@ def __process_model_result(self, model_action, precision, step_times):
if model_action == ModelAction.TRAIN: if model_action == ModelAction.TRAIN:
step_times = self._sync_result(step_times) step_times = self._sync_result(step_times)
if not step_times: if not step_times or statistics.mean(step_times) < 0:
return None return None
if self._local_rank is None or self._global_rank == 0: if self._local_rank is None or self._global_rank == 0:
self._result.add_result(metric_s, statistics.mean(step_times)) self._result.add_result(metric_s, statistics.mean(step_times))
...@@ -468,3 +474,13 @@ def _log_step_time(self, curr_step, precision, duration): ...@@ -468,3 +474,13 @@ def _log_step_time(self, curr_step, precision, duration):
step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \ step_time = statistics.mean(duration) if len(duration) < self._args.log_n_steps \
else statistics.mean(duration[-self._args.log_n_steps:]) else statistics.mean(duration[-self._args.log_n_steps:])
stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n') stdout_logger.log(f'{self._name} - {precision.value}: step {curr_step}, step time {step_time}\n')
def _process_info(self, model_action, precision, info):
"""Process other info.
Args:
model_action (ModelAction): train or inference.
precision (Precision): precision of model.
info (dict): other info.
"""
pass
...@@ -207,6 +207,23 @@ superbench: ...@@ -207,6 +207,23 @@ superbench:
seq_length: 224 seq_length: 224
batch_size: 1 batch_size: 1
precision: int8 precision: int8
megatron-gpt:
modes:
- name: mpi
proc_num: 1
node_num: all
parameters:
code_base: /opt/superbench/third_party/Megatron/Megatron-DeepSpeed/
dataset_url: https://huggingface.co/datasets/suolyer/pile_bookcorpus2/raw/main/test.json
batch_size: 2048
num_warmup: 0
num_steps: 10
precision:
- float16
- bfloat16
deepspeed: yes
sequence_parallel: yes
use_rotary_position_embeddings: yes
gpt_models: gpt_models:
<<: *default_pytorch_mode <<: *default_pytorch_mode
models: models:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for the megatron-gpt model benchmark."""
import os
from pathlib import Path
import statistics
from unittest import mock
import unittest
from superbench.benchmarks.context import ModelAction, Precision
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from tests.helper.testcase import BenchmarkTestCase
class MegatronGPTTest(BenchmarkTestCase, unittest.TestCase):
    """Tests for the megatron-gpt benchmark.

    Every test looks the benchmark class up in the registry for the CUDA platform and then
    calls its private helpers directly (`_preprocess`, `_generate_dataset`,
    `_megatron_command`, `_parse_log`, `_process_info`).

    NOTE(review): the tests assign to os.environ directly and do not restore the previous
    values afterwards.
    """
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.benchmark_name = 'megatron-gpt'
        cls.createMockEnvs(cls)
        # Path of the hostfile under cls._tmp_dir; the tests that need it write it themselves.
        cls.hostfile_path = os.path.join(cls._tmp_dir, 'hostfile')

    @classmethod
    def tearDownClass(cls):
        """Hook method for deconstructing the class fixture after running all tests in the class."""
        # Delete each of the listed files under cls._tmp_dir if it exists as a regular file.
        for p in [
            Path(cls._tmp_dir) / 'pretrain_gpt.py',
            Path(cls._tmp_dir) / 'customdataset_text_document.bin',
            Path(cls._tmp_dir) / 'customdataset_text_document.idx',
            Path(cls._tmp_dir) / 'hostfile'
        ]:
            if p.is_file():
                p.unlink()
        super().tearDownClass()

    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
    def test_megatron_gpt_preprocess(self, mock_generate_dataset):
        """Test _preprocess(): distributed-setting checks and argument validation.

        _generate_dataset is patched out and always returns True in this test.
        """
        # Check registry.
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--hostfile {self.hostfile_path} --batch_size 2048',
        )
        # Check init distributed setting.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # The hostfile lists three hosts while OMPI_COMM_WORLD_SIZE is 2 and the local size is 1;
        # _preprocess is expected to fail (presumably because of this mismatch -- confirm).
        with open(self.hostfile_path, 'w') as f:
            f.write('host1\n')
            f.write('host2\n')
            f.write('host3\n')
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code == ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
        # 'xxx' is presumably a hostfile path that does not exist; the same failure code is expected.
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters='--hostfile xxx --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code == ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
        # With a world size of 3 the three-host hostfile is expected to yield three nodes.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '3'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--hostfile {self.hostfile_path} --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        self.assertEqual(benchmark._num_nodes, 3)
        # node_rank, master_addr and master_port mirror the environment variables set above.
        self.assertEqual(
            benchmark._distributed_args,
            '--nproc_per_node {0} --nnodes {1} --node_rank {2} --master_addr {3} --master_port {4}'.format(
                benchmark._args.num_gpus, benchmark._num_nodes, 0, 'localhost', '12345'
            )
        )
        # Check preprocessing.
        # Negative cases
        # no code_base (pretrain_gpt.py has not been created under cls._tmp_dir yet)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        # From here on the code_base contains a mock pretrain_gpt.py.
        self.createMockFiles(['pretrain_gpt.py'])
        # invalid micro batch size
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --micro_batch_size -1',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --micro_batch_size 4096',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        # invalid precision
        # NOTE(review): the backslash continuation makes the next line's leading whitespace part
        # of the parameter string -- confirm the literal's indentation against the repository.
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--batch_size 2048 --precision int8',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is False)
        # Positive cases
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        ret = benchmark._preprocess()
        assert (ret is True)

    def test_megatron_gpt_dataset(self):
        """Test dataset generation when the dataset files already exist."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # use existing dataset
        self.createMockFiles(['customdataset_text_document.bin', 'customdataset_text_document.idx'])
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base /root/Megatron-DeepSpeed --data_home {self._tmp_dir} \
--batch_size 2048 --data_prefix customdataset_text_document',
        )
        # The result of _preprocess() is overwritten on the next line; only the return value of
        # _generate_dataset() is asserted.
        ret = benchmark._preprocess()
        ret = benchmark._generate_dataset()
        assert (ret is True)

    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
    def test_megatron_gpt_command(self, mock_generate_dataset):
        """Test command generation for the torchrun and deepspeed launchers.

        NOTE(review): the expected commands are backslash-continued string literals, so the
        leading whitespace of every continuation line is part of the compared string --
        confirm the literals' indentation against the repository.
        """
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        with open(self.hostfile_path, 'w') as f:
            f.write('host1\n')
            f.write('host2\n')
        # use url to process dataset
        # (_generate_dataset itself is mocked in this test, so it is not actually executed.)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        # Set the data options by hand; the expected commands below are formatted with this value.
        benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \
--data-impl mmap'
        script_path = str(Path(self._tmp_dir) / 'pretrain_gpt.py')
        # Template of the expected torchrun command; {precision} is filled per case below.
        expected_command = 'torchrun {distributed_args} {script_path} \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size 1 \
--init-method-std 0.009 \
--lr-decay-samples 43945312 \
--lr-warmup-samples 0 \
--lr-decay-style cosine \
--micro-batch-size 2 \
--global-batch-size 2048 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-tokens 300000000000 \
--train-samples 20480 \
--lr 0.00012 \
--min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \
--eval-interval 10 \
--eval-iters 0 \
--save-interval 10000 \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers 8 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--optimizer adam \
--use-distributed-optimizer \
{precision} \
--seed 1234 {data_options}'
        # float32 is expected to add no precision flag.
        precision = Precision.FLOAT32
        command = benchmark._megatron_command(precision)
        self.assertEqual(
            command,
            expected_command.format(
                precision='',
                data_options=benchmark._data_options,
                distributed_args=benchmark._distributed_args,
                script_path=script_path
            )
        )
        # float16 is expected to add --fp16.
        precision = Precision.FLOAT16
        command = benchmark._megatron_command(precision)
        self.assertEqual(
            command,
            expected_command.format(
                precision='--fp16',
                data_options=benchmark._data_options,
                distributed_args=benchmark._distributed_args,
                script_path=script_path
            )
        )
        # bfloat16 is expected to add --bf16.
        precision = Precision.BFLOAT16
        command = benchmark._megatron_command(precision)
        self.assertEqual(
            command,
            expected_command.format(
                precision='--bf16',
                data_options=benchmark._data_options,
                distributed_args=benchmark._distributed_args,
                script_path=script_path
            )
        )
        # Second scenario: world size 1 with --deepspeed; the expected launcher is `deepspeed`
        # and the command carries no distributed arguments.
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --hostfile {self.hostfile_path} \
--num_warmup 0 --num_steps 10 --batch_size 2048 --data_prefix dataset_text_document --deepspeed',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \
--data-impl mmap'
        command = benchmark._megatron_command(Precision.BFLOAT16)
        expected_command = 'deepspeed {script_path} \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size 1 \
--init-method-std 0.009 \
--lr-decay-samples 43945312 \
--lr-warmup-samples 0 \
--lr-decay-style cosine \
--micro-batch-size 2 \
--global-batch-size 2048 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-tokens 300000000000 \
--train-samples 20480 \
--lr 0.00012 \
--min-lr 1e-06 \
--split 949,50,1 \
--log-interval 1 \
--eval-interval 10 \
--eval-iters 0 \
--save-interval 10000 \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers 8 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--optimizer adam \
--use-distributed-optimizer \
{precision} \
--seed 1234 {data_options} {deepseed_options}'
        # DeepSpeed-specific options expected at the end of the command; the config path is
        # read from the benchmark instance.
        expect_ds_options = f'\
--deepspeed \
--deepspeed_config {benchmark._config_json_path} \
--zero-stage 1 \
--pipeline-model-parallel-size 1 --no-pipeline-parallel'
        self.assertEqual(
            command,
            expected_command.format(
                precision='--bf16',
                data_options=benchmark._data_options,
                script_path=script_path,
                deepseed_options=expect_ds_options
            )
        )

    @decorator.load_data('tests/data/megatron_deepspeed.log')
    @mock.patch('superbench.benchmarks.model_benchmarks.MegatronGPT._generate_dataset')
    def test_megatron_parse_log(self, raw_output, mock_generate_dataset):
        """Test parse log function.

        raw_output is supplied by the load_data decorator; _generate_dataset is mocked.
        """
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        assert (benchmark_cls)
        os.environ['OMPI_COMM_WORLD_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'] = '1'
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12345'
        # use url to process dataset
        # (_generate_dataset itself is mocked in this test, so it is not actually executed.)
        benchmark = benchmark_cls(
            self.benchmark_name,
            parameters=f'--code_base {self._tmp_dir} --num_warmup 0 --num_steps 10 --batch_size 2048',
        )
        mock_generate_dataset.return_value = True
        benchmark._preprocess()
        benchmark._data_options = f'\
--vocab-file {self._tmp_dir}/gpt2-vocab.json \
--merge-file {self._tmp_dir}/gpt2-merges.txt \
--data-path {self._tmp_dir}/dataset_text_document \
--data-impl mmap'
        # _parse_log returns four sequences; their means are compared with fixed expected values.
        iteration_times, tflops, mem_allocated, max_mem_allocated = benchmark._parse_log(raw_output)
        assert (statistics.mean(iteration_times) == 75239.24)
        assert (statistics.mean(tflops) == 149.136)
        assert (statistics.mean(mem_allocated) == 17.54)
        assert (statistics.mean(max_mem_allocated) == 66.97)
        # Feed the parsed values to _process_info and check the fp16 train metrics it records.
        info = {'tflops': tflops, 'mem_allocated': mem_allocated, 'max_mem_allocated': max_mem_allocated}
        benchmark._process_info(ModelAction.TRAIN, Precision.FLOAT16, info)
        assert (benchmark.result is not None)
        assert (benchmark.result['fp16_train_tflops'][0] == 149.136)
        assert (benchmark.result['fp16_train_mem_allocated'][0] == 17.54)
        assert (benchmark.result['fp16_train_max_mem_allocated'][0] == 66.97)
This diff is collapsed.
...@@ -11,12 +11,12 @@ HPCX_HOME ?= /opt/hpcx ...@@ -11,12 +11,12 @@ HPCX_HOME ?= /opt/hpcx
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2) CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt .PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed
# Build all targets. # Build all targets.
all: cuda rocm all: cuda rocm
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed
cpu: common cpu_perftest cpu: common cpu_perftest
common: cpu_hpl cpu_stream fio common: cpu_hpl cpu_stream fio
directx_amd: directx_amf_encoding_latency directx_amd: directx_amf_encoding_latency
...@@ -171,3 +171,20 @@ directx_amf_encoding_latency: ...@@ -171,3 +171,20 @@ directx_amf_encoding_latency:
del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \ del vs_buildtools.exe && echo "Deleted vs_buildtools.exe" && \
"C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \ "C:\temp\BuildTools\MSBuild\Current\Bin\MSBuild.exe" "AMF\amf\public\samples\CPPSamples_vs2019.sln" /t:EncoderLatency /p:Platform=x64 /p:Configuration=Release /p:OutDir="%SB_MICRO_PATH%\bin" \
) )
# Install Megatron-LM
# Clone the upstream repository into Megatron/Megatron-LM unless that directory
# already exists, then install the packages listed in Megatron/requirements.txt.
megatron_lm:
	[ -d "Megatron/Megatron-LM" ] || \
		git clone "https://github.com/NVIDIA/Megatron-LM.git" "Megatron/Megatron-LM"
	cd Megatron && python -m pip install -r requirements.txt
# Install Megatron-DeepSpeed
# Clone the upstream repository into Megatron/Megatron-DeepSpeed unless that
# directory already exists, then install the packages listed in
# Megatron/requirements.txt followed by DeepSpeed.
megatron_deepspeed:
	[ -d "Megatron/Megatron-DeepSpeed" ] || \
		git clone "https://github.com/microsoft/Megatron-DeepSpeed.git" "Megatron/Megatron-DeepSpeed"
	cd Megatron && python -m pip install -r requirements.txt && python -m pip install DeepSpeed
nltk
parameterized
pybind11
regex
six
# versions from HF transformers
black==21.4b0
isort>=5.5.4
tqdm
sentencepiece
wandb
einops
typing_extensions==4.5.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment