"vscode:/vscode.git/clone" did not exist on "946f897da07a2fea6867dce584b5132e0e7e654c"
Unverified Commit ccccd988 authored by Yang Wang's avatar Yang Wang Committed by GitHub
Browse files

Benchmarks - Support topo-aware, pair-wise, and K-batch pattern in nccl-bw benchmark (#454)

Support traffic patterns under the different devices in NCCL/RCCL test
* Change the metrics format if a traffic pattern is specified
parent 8e748d56
......@@ -242,11 +242,17 @@ Measure the InfiniBand loopback verbs bandwidth, performed by
#### Introduction
Measure the performance of NCCL/RCCL operations,
Measure the performance of NCCL/RCCL operations under multi-node traffic patterns,
performed by [nccl-tests](https://github.com/NVIDIA/nccl-tests/tree/44df0bf010dcc95e840ca0fb7466c67cff3f1f0f)
or [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests/tree/dc1ad4853d7ec738387d42a75a58a98d7af00c7b).
Support the following operations currently: allreduce, allgather, broadcast, reduce, reducescatter, alltoall.
Support the following traffic patterns:
* `all-nodes`, validate the NCCL/RCCL performance across all VM nodes simultaneously.
* `pair-wise`, validate the NCCL/RCCL performance across VM pairs with all possible combinations in parallel.
* `k-batch`, validate the NCCL/RCCL performance across VM groups with a specified batch scale.
* `topo-aware`, validate the NCCL/RCCL performance across VM pairs with different distances/hops as a quick test.
#### Metrics
| Name | Unit | Description |
......@@ -258,6 +264,9 @@ Support the following operations currently: allreduce, allgather, broadcast, red
| rccl-bw/${operation}_${msg_size}_algbw | bandwidth (GB/s) | RCCL operation algorithm bandwidth with given message size. |
| rccl-bw/${operation}_${msg_size}_busbw | bandwidth (GB/s) | RCCL operation bus bandwidth with given message size. |
If a traffic pattern is specified, the metric name format changes to `nccl-bw/${operation}_${serial_index}_${parallel_index}:${msg_size}_time` (and likewise for the `_algbw` and `_busbw` metrics).
- `serial_index` is the index of the host group among the groups that are run serially (one after another).
- `parallel_index` is the index of the host list among the host lists that are run in parallel.
### `tcp-connectivity`
#### Introduction
......
......@@ -128,7 +128,7 @@ def _preprocess(self):
def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
"""Function to parse raw results and save the summarized results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
......@@ -150,6 +150,9 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
busbw_out = -1
time_out = -1
algbw_out = -1
serial_index = os.environ.get('SB_MODE_SERIAL_INDEX', -1)
parallel_index = os.environ.get('SB_MODE_PARALLEL_INDEX', -1)
try:
# Filter useless output
out_of_place_index = -1
......@@ -188,9 +191,13 @@ def _process_raw_result(self, cmd_idx, raw_output): # noqa: C901
busbw_out = float(line[busbw_index])
time_out = float(line[time_index])
algbw_out = float(line[algbw_index])
self._result.add_result(self._args.operation + '_' + str(size) + '_busbw', busbw_out)
self._result.add_result(self._args.operation + '_' + str(size) + '_algbw', algbw_out)
self._result.add_result(self._args.operation + '_' + str(size) + '_time', time_out)
exec_index = '_{}_{}:'.format(
serial_index, parallel_index
) if serial_index != -1 and parallel_index != -1 else '_'
prefix_name = '{}{}{}_'.format(self._args.operation, exec_index, size)
self._result.add_result(prefix_name + 'busbw', busbw_out)
self._result.add_result(prefix_name + 'algbw', algbw_out)
self._result.add_result(prefix_name + 'time', time_out)
except BaseException as e:
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
......
......@@ -3,6 +3,7 @@
"""Tests for nccl-bw benchmark."""
import os
import numbers
import unittest
......@@ -106,3 +107,26 @@ def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, redu
assert (benchmark.result['alltoall_8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_8589934592_busbw'][0] == 224.31)
# Check with exec index info
os.environ['SB_MODE_SERIAL_INDEX'] = '0'
os.environ['SB_MODE_PARALLEL_INDEX'] = '0'
exec_index = '0_0'
for op in raw_output.keys():
benchmark._args.operation = op
assert (benchmark._process_raw_result(0, raw_output[op]))
for name in ['time', 'algbw', 'busbw']:
for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
metric = op + '_' + exec_index + ':' + size + '_' + name
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
assert (benchmark.result['allreduce_0_0:8589934592_time'][0] == 63896.0)
assert (benchmark.result['allreduce_0_0:8589934592_algbw'][0] == 134.44)
assert (benchmark.result['allreduce_0_0:8589934592_busbw'][0] == 235.26)
assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment