Unverified Commit e083a598 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Add Benchmark - Add NCCL performance benchmark (#113)

**Description**
Add NCCL performance microbenchmark.

**Major Revision**
- Add microbenchmark, example, test, config for NCCL
parent b0c5addc
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for NCCL Bus bandwidth performance.
Commands to run:
python3 examples/benchmarks/nccl_bw_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger
if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'nccl-bw', platform=Platform.CUDA, parameters='--operations allreduce'
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
logger.info(
'benchmark: {}, return code: {}, result: {}'.format(
benchmark.name, benchmark.return_code, benchmark.result
)
)
......@@ -13,8 +13,10 @@
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
__all__ = [
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark'
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark',
'CudaNcclBwBenchmark'
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the NCCL performance benchmarks."""
import os
import re
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
"""The NCCL bus bandwidth performance benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
Args:
name (str): benchmark name.
parameters (str): benchmark parameters.
"""
super().__init__(name, parameters)
self._bin_name = 'all_reduce_perf'
self.__operations = {
'allreduce': 'all_reduce_perf',
'allgather': 'all_gather_perf',
'broadcast': 'broadcast_perf',
'reduce': 'reduce_perf',
'reducescatter': 'reduce_scatter_perf',
'alltoall': 'alltoall_perf'
}
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
self._parser.add_argument(
'--operations',
type=str,
nargs='+',
default=list(self.__operations.keys()),
help='Nccl operations to benchmark, e.g., {}.'.format(' '.join(list(self.__operations.keys()))),
)
self._parser.add_argument(
'--ngpus',
type=int,
default=8,
help='Number of gpus per thread to run the nccl test.',
)
self._parser.add_argument(
'--maxbytes',
type=str,
default='8G',
help='Max size in bytes to run the nccl test. E.g. 8G.',
)
self._parser.add_argument(
'--minbytes',
type=str,
default='8',
help='Min size in bytes to run the nccl test. E.g. 1.',
)
self._parser.add_argument(
'--stepfactor',
type=int,
default=2,
help='Increment factor, multiplication factor between sizes. E.g. 2.',
)
self._parser.add_argument(
'--check',
type=int,
default=0,
help='Check correctness of results. This can be quite slow on large numbers of GPUs. E.g. 0 or 1.',
)
self._parser.add_argument(
'--iters',
type=int,
default=20,
help='Number of iterations. Default: 20.',
)
self._parser.add_argument(
'--warmup_iters',
type=int,
default=5,
help='Number of warmup iterations. Default: 5.',
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
# Format the arguments
self._args.operations = [p.lower() for p in self._args.operations]
# Check the arguments and generate the commands
for op in self._args.operations:
if op not in self.__operations:
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
'Unsupported operation of NCCL test - benchmark: {}, operation: {}, expected: {}.'.format(
self._name, op, ' '.join(list(self.__operations.keys()))
)
)
return False
else:
self._bin_name = self.__operations[op]
if not self._set_binary_path():
return False
command = os.path.join(self._args.bin_dir, self._bin_name)
command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus),
str(self._args.check), str(self._args.iters), str(self._args.warmup_iters)
)
self._commands.append(command)
return True
def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
"""Function to parse raw results and save the summarized results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.
Return:
True if the raw output string is valid and result can be extracted.
"""
# If it's invoked by MPI and rank is not 0, empty content is expected
if os.getenv('OMPI_COMM_WORLD_RANK'):
rank = int(os.getenv('OMPI_COMM_WORLD_RANK'))
if rank > 0:
return True
self._result.add_raw_data('raw_output_' + self._args.operations[cmd_idx], raw_output)
content = raw_output.splitlines()
size = -1
busbw_out = -1
time_out = -1
algbw_out = -1
try:
# Filter useless output
out_of_place_index = -1
out_of_bound_index = -1
for index, line in enumerate(content):
if 'out-of-place' in line:
out_of_place_index = index
if 'Out of bounds values' in line:
out_of_bound_index = index
content = content[out_of_place_index + 1:out_of_bound_index]
# Parse max out of bound bus bw as the result
size_index = -1
time_index = -1
busbw_index = -1
algbw_index = -1
for line in content:
if 'time' in line and 'busbw' in line:
# Get index of selected column
line = line[1:].strip(' ')
line = re.sub(r' +', ' ', line).split(' ')
# Get first index of condition in list, if it not existing, raise exception
size_index = line.index('size')
time_index = line.index('time') - len(line)
busbw_index = line.index('busbw') - len(line)
algbw_index = line.index('algbw') - len(line)
break
if size_index != -1 and busbw_index != -1 and time_index != -1 and algbw_index != -1:
for line in content:
line = line.strip(' ')
line = re.sub(r' +', ' ', line).split(' ')
# Filter line not started with number
if len(line) == 0 or not re.match(r'\d+', line[0]):
continue
size = int(line[size_index])
if size != 0:
busbw_out = float(line[busbw_index])
time_out = float(line[time_index])
algbw_out = float(line[algbw_index])
self._result.add_result(
'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_busbw', busbw_out
)
self._result.add_result(
'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_algbw', algbw_out
)
self._result.add_result(
'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_time', time_out
)
except BaseException as e:
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
self._curr_run_index, self._name, raw_output, str(e)
)
)
return False
if out_of_place_index == -1 or out_of_bound_index == -1 or busbw_out == -1:
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
self._curr_run_index, self._name, raw_output
)
)
return False
return True
BenchmarkRegistry.register_benchmark('nccl-bw', CudaNcclBwBenchmark, platform=Platform.CUDA)
......@@ -28,6 +28,11 @@ superbench:
model_action:
- train
benchmarks:
nccl-bw:
enable: true
modes:
- name: local
prefix: NCCL_DEBUG=INFO NCCL_IB_DISABLE=1
ib-loopback:
enable: true
modes:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment