Unverified Commit 60762518 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Code Revision - Revise arguments of nccl/rccl to support mpi mode...

Benchmarks: Code Revision - Revise arguments of nccl/rccl to support mpi mode and rename metric (#189)

**Description**
Revise the arguments of the nccl/rccl benchmarks to support MPI mode (MPI mode cannot currently be used with nccl/rccl because multiple operations are run in sequence without a barrier between them), and rename the metrics.

**Major Revision**
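- revise the `--operations` argument, which accepted a list of operations, into `--operation`, which accepts a single operation (default: `allreduce`)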

**Minor Revision**
- rename the metrics to remove the benchmark name prefix (e.g., `NCCL_allreduce_<size>_busbw` becomes `allreduce_<size>_busbw`)
- change the default value of the `--ngpus` argument from 8 to 1
parent 4e431f11
......@@ -41,16 +41,15 @@ def add_parser_arguments(self):
super().add_parser_arguments()
self._parser.add_argument(
'--operations',
'--operation',
type=str,
nargs='+',
default=list(self.__operations.keys()),
help='Nccl operations to benchmark, e.g., {}.'.format(' '.join(list(self.__operations.keys()))),
default='allreduce',
help='NCCL operation to benchmark, e.g., {}.'.format(' '.join(list(self.__operations.keys()))),
)
self._parser.add_argument(
'--ngpus',
type=int,
default=8,
default=1,
help='Number of gpus per thread to run the nccl test.',
)
self._parser.add_argument(
......@@ -100,29 +99,29 @@ def _preprocess(self):
return False
# Format the arguments
self._args.operations = [p.lower() for p in self._args.operations]
self._args.operation = self._args.operation.lower()
# Check the arguments and generate the commands
for op in self._args.operations:
if op not in self.__operations:
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
'Unsupported operation of NCCL test - benchmark: {}, operation: {}, expected: {}.'.format(
self._name, op, ' '.join(list(self.__operations.keys()))
)
op = self._args.operation
if op not in self.__operations:
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
'Unsupported operation of NCCL test - benchmark: {}, operation: {}, expected: {}.'.format(
self._name, op, ' '.join(list(self.__operations.keys()))
)
)
return False
else:
self._bin_name = self.__operations[op]
if not self._set_binary_path():
return False
else:
self._bin_name = self.__operations[op]
if not self._set_binary_path():
return False
command = os.path.join(self._args.bin_dir, self._bin_name)
command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus),
str(self._args.check), str(self._args.iters), str(self._args.warmup_iters)
)
self._commands.append(command)
command = os.path.join(self._args.bin_dir, self._bin_name)
command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus),
str(self._args.check), str(self._args.iters), str(self._args.warmup_iters)
)
self._commands.append(command)
return True
......@@ -144,7 +143,7 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
if rank > 0:
return True
self._result.add_raw_data('raw_output_' + self._args.operations[cmd_idx], raw_output)
self._result.add_raw_data('raw_output_' + self._args.operation, raw_output)
content = raw_output.splitlines()
size = -1
......@@ -189,15 +188,9 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
busbw_out = float(line[busbw_index])
time_out = float(line[time_index])
algbw_out = float(line[algbw_index])
self._result.add_result(
'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_busbw', busbw_out
)
self._result.add_result(
'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_algbw', algbw_out
)
self._result.add_result(
'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_time', time_out
)
self._result.add_result(self._args.operation + '_' + str(size) + '_busbw', busbw_out)
self._result.add_result(self._args.operation + '_' + str(size) + '_algbw', algbw_out)
self._result.add_result(self._args.operation + '_' + str(size) + '_time', time_out)
except BaseException as e:
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
......
......@@ -39,7 +39,7 @@ def test_nccl_bw_performance(self):
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
assert (benchmark_class)
benchmark = benchmark_class(benchmark_name)
benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8')
ret = benchmark._preprocess()
assert (ret is True)
......@@ -51,11 +51,7 @@ def test_nccl_bw_performance(self):
assert (benchmark.type == BenchmarkType.MICRO)
# Check parameters specified in BenchmarkContext.
assert (
benchmark._args.operations == [
'allreduce', 'allgather', 'broadcast', 'reduce', 'reducescatter', 'alltoall'
]
)
assert (benchmark._args.operation == 'allreduce')
assert (benchmark._args.ngpus == 8)
assert (benchmark._args.minbytes == '8')
assert (benchmark._args.maxbytes == '8G')
......@@ -70,10 +66,9 @@ def test_nccl_bw_performance(self):
'alltoall_perf'
]
for i in range(len(benchmark._args.operations)):
command = bin_names[i] + benchmark._commands[i].split(bin_names[i])[1]
expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(bin_names[i])
assert (command == expected_command)
command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(bin_names[0])
assert (command == expected_command)
# Check results and metrics.
# Case with no raw_output
......@@ -411,18 +406,20 @@ def test_nccl_bw_performance(self):
"""
for i, op in enumerate(benchmark._args.operations):
assert (benchmark._process_raw_result(i, raw_output[op]))
for op in raw_output.keys():
benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op]))
for name in ['time', 'algbw', 'busbw']:
for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
metric = 'NCCL_' + op + '_' + size + '_' + name
metric = op + '_' + size + '_' + name
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (benchmark.result['NCCL_allreduce_8589934592_time'][0] == 63896.0)
assert (benchmark.result['NCCL_allreduce_8589934592_algbw'][0] == 134.44)
assert (benchmark.result['NCCL_allreduce_8589934592_busbw'][0] == 235.26)
assert (benchmark.result['NCCL_alltoall_8589934592_time'][0] == 33508.0)
assert (benchmark.result['NCCL_alltoall_8589934592_algbw'][0] == 256.36)
assert (benchmark.result['NCCL_alltoall_8589934592_busbw'][0] == 224.31)
assert (benchmark.result['allreduce_8589934592_time'][0] == 63896.0)
assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.44)
assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.26)
assert (benchmark.result['alltoall_8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_8589934592_busbw'][0] == 224.31)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment