Unverified commit 27374ad5, authored by Ziyue Yang and committed by GitHub
Browse files

Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark (#591)

**Description**
Add in-place metrics for NCCL/RCCL benchmark for latency measurement.
parent 606ff191
...@@ -283,6 +283,7 @@ Measure the performance of NCCL/RCCL operations under multi nodes' traffic patte ...@@ -283,6 +283,7 @@ Measure the performance of NCCL/RCCL operations under multi nodes' traffic patte
performed by [nccl-tests](https://github.com/NVIDIA/nccl-tests/tree/44df0bf010dcc95e840ca0fb7466c67cff3f1f0f) performed by [nccl-tests](https://github.com/NVIDIA/nccl-tests/tree/44df0bf010dcc95e840ca0fb7466c67cff3f1f0f)
or [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests/tree/dc1ad4853d7ec738387d42a75a58a98d7af00c7b). or [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests/tree/dc1ad4853d7ec738387d42a75a58a98d7af00c7b).
Support the following operations currently: allreduce, allgather, broadcast, reduce, reducescatter, alltoall. Support the following operations currently: allreduce, allgather, broadcast, reduce, reducescatter, alltoall.
Support both in-place and out-of-place measurements.
Support the following traffic patterns: Support the following traffic patterns:
* `all-nodes`, validate the NCCL/RCCL performance across all VM nodes simultaneously. * `all-nodes`, validate the NCCL/RCCL performance across all VM nodes simultaneously.
......
...@@ -94,6 +94,11 @@ def add_parser_arguments(self): ...@@ -94,6 +94,11 @@ def add_parser_arguments(self):
default=0, default=0,
help='Number of graph launch iterations. Set to 0 to disable graph mode. Default: 0.', help='Number of graph launch iterations. Set to 0 to disable graph mode. Default: 0.',
) )
self._parser.add_argument(
'--in_place',
action='store_true',
help='If specified, collect in-place numbers, else collect out-of-place numbers.',
)
def _preprocess(self): def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking. """Preprocess/preparation operations before the benchmarking.
...@@ -171,9 +176,9 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901 ...@@ -171,9 +176,9 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
content = content[out_of_place_index + 1:out_of_bound_index] content = content[out_of_place_index + 1:out_of_bound_index]
# Parse max out of bound bus bw as the result # Parse max out of bound bus bw as the result
size_index = -1 size_index = -1
time_index = -1 time_index = None
busbw_index = -1 busbw_index = None
algbw_index = -1 algbw_index = None
for line in content: for line in content:
if 'time' in line and 'busbw' in line: if 'time' in line and 'busbw' in line:
# Get index of selected column # Get index of selected column
...@@ -181,11 +186,17 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901 ...@@ -181,11 +186,17 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
line = re.sub(r' +', ' ', line).split(' ') line = re.sub(r' +', ' ', line).split(' ')
# Get first index of condition in list, if it not existing, raise exception # Get first index of condition in list, if it not existing, raise exception
size_index = line.index('size') size_index = line.index('size')
time_index = line.index('time') - len(line) # Need index from the end because sometimes previous fields (like redop) can be empty
busbw_index = line.index('busbw') - len(line) if self._args.in_place:
algbw_index = line.index('algbw') - len(line) time_index = -1 - list(reversed(line)).index('time')
busbw_index = -1 - list(reversed(line)).index('busbw')
algbw_index = -1 - list(reversed(line)).index('algbw')
else:
time_index = line.index('time') - len(line)
busbw_index = line.index('busbw') - len(line)
algbw_index = line.index('algbw') - len(line)
break break
if size_index != -1 and busbw_index != -1 and time_index != -1 and algbw_index != -1: if size_index != -1 and busbw_index is not None and time_index is not None and algbw_index is not None:
for line in content: for line in content:
line = line.strip(' ') line = line.strip(' ')
line = re.sub(r' +', ' ', line).split(' ') line = re.sub(r' +', ' ', line).split(' ')
......
...@@ -66,6 +66,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -66,6 +66,7 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark._args.iters == 20) assert (benchmark._args.iters == 20)
assert (benchmark._args.warmup_iters == 5) assert (benchmark._args.warmup_iters == 5)
assert (benchmark._args.graph_iters == 0) assert (benchmark._args.graph_iters == 0)
assert (benchmark._args.in_place is False)
# Check command list # Check command list
bin_names = [ bin_names = [
...@@ -91,6 +92,11 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -91,6 +92,11 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
'alltoall': alltoall, 'alltoall': alltoall,
} }
if 'SB_MODE_SERIAL_INDEX' in os.environ:
os.environ.pop('SB_MODE_SERIAL_INDEX')
if 'SB_MODE_PARALLEL_INDEX' in os.environ:
os.environ.pop('SB_MODE_PARALLEL_INDEX')
for op in raw_output.keys(): for op in raw_output.keys():
benchmark._args.operation = op benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op])) assert (benchmark._process_raw_result(0, raw_output[op]))
...@@ -131,3 +137,48 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu ...@@ -131,3 +137,48 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0) assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36) assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31) assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
@decorator.load_data('tests/data/nccl_allreduce.log')
@decorator.load_data('tests/data/nccl_alltoall.log')
def test_nccl_bw_performance_in_place_parsing(self, allreduce, alltoall):
    """Verify that nccl-bw result parsing works when ``--in_place`` is set.

    The benchmark is built with ``--ngpus 8 --in_place``, preprocessed, and
    then fed one raw log per operation. For every operation the time, algbw
    and busbw metrics must exist for a fixed set of message sizes, and the
    values at the largest size are pinned to known numbers.

    Args:
        allreduce: raw allreduce output (presumably the content of
            tests/data/nccl_allreduce.log supplied by the decorator).
        alltoall: raw alltoall output (presumably the content of
            tests/data/nccl_alltoall.log supplied by the decorator).
    """
    bench_name = 'nccl-bw'
    selection = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(bench_name, Platform.CUDA)
    (benchmark_class, _) = selection
    assert (benchmark_class)

    benchmark = benchmark_class(bench_name, parameters='--ngpus 8 --in_place')

    # Preprocessing must succeed and the flag must be reflected in the parsed args.
    preprocessed = benchmark._preprocess()
    assert (preprocessed is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark._args.in_place is True)

    # Drop the mode index variables; the metric names checked below carry no
    # index suffix.
    for env_key in ('SB_MODE_SERIAL_INDEX', 'SB_MODE_PARALLEL_INDEX'):
        os.environ.pop(env_key, None)

    # Case with valid raw_output
    logs_by_operation = {
        'allreduce': allreduce,
        'alltoall': alltoall,
    }
    metric_kinds = ['time', 'algbw', 'busbw']
    message_sizes = ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']

    for operation, log_text in logs_by_operation.items():
        benchmark._args.operation = operation
        assert (benchmark._process_raw_result(0, log_text))

        for kind in metric_kinds:
            for message_size in message_sizes:
                metric = '_'.join((operation, message_size, kind))
                assert (metric in benchmark.result)
                values = benchmark.result[metric]
                assert (len(values) == 1)
                assert (isinstance(values[0], numbers.Number))

    # Pinned values for the largest message size of each operation.
    pinned = (
        ('allreduce_8589934592_time', 63959.0),
        ('allreduce_8589934592_algbw', 134.30),
        ('allreduce_8589934592_busbw', 235.03),
        ('alltoall_8589934592_time', 33234.0),
        ('alltoall_8589934592_algbw', 258.47),
        ('alltoall_8589934592_busbw', 226.16),
    )
    for metric, expected_value in pinned:
        assert (benchmark.result[metric][0] == expected_value)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment