Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark (#591)

**Description** Add in-place metrics to the NCCL/RCCL benchmark for latency measurement.

Benchmarks: Microbenchmark - Support in-place for NCCL/RCCL benchmark (#591)
**Description** Add in-place metrics to the NCCL/RCCL benchmark for latency measurement.
27374ad5 · Ziyue Yang · GitHub · 606ff191 · 27374ad5 · 27374ad5
Unverified Commit 27374ad5 authored Dec 14, 2023 by Ziyue Yang Committed by GitHub Dec 14, 2023
3 changed files
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -283,6 +283,7 @@ Measure the performance of NCCL/RCCL operations under multi nodes' traffic patte
 performed by [nccl-tests](https://github.com/NVIDIA/nccl-tests/tree/44df0bf010dcc95e840ca0fb7466c67cff3f1f0f)
 or [rccl-tests](https://github.com/ROCmSoftwarePlatform/rccl-tests/tree/dc1ad4853d7ec738387d42a75a58a98d7af00c7b).
 Support the following operations currently: allreduce, allgather, broadcast, reduce, reducescatter, alltoall.
+Support both in-place and out-of-place measurements.

 Support the following traffic patterns:
 * `all-nodes`, validate the NCCL/RCCL performance across all VM nodes simultaneously.

--- a/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py
@@ -94,6 +94,11 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
            default=0,
            help='Number of graph launch iterations. Set to 0 to disable graph mode. Default: 0.',
        )
+        self._parser.add_argument(
+            '--in_place',
+            action='store_true',
+            help='If specified, collect in-place numbers, else collect out-of-place numbers.',
+        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.
@@ -171,9 +176,9 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
            content = content[out_of_place_index + 1:out_of_bound_index]
            # Parse max out of bound bus bw as the result
            size_index = -1
-            time_index = -1
-            busbw_index = -1
-            algbw_index = -1
+            time_index = None
+            busbw_index = None
+            algbw_index = None
            for line in content:
                if 'time' in line and 'busbw' in line:
                    # Get index of selected column
@@ -181,11 +186,17 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
                    line = re.sub(r' +', ' ', line).split(' ')
                    # Get first index of condition in list, if it not existing, raise exception
                    size_index = line.index('size')
-                    time_index = line.index('time') - len(line)
-                    busbw_index = line.index('busbw') - len(line)
-                    algbw_index = line.index('algbw') - len(line)
+                    # Need index from the end because sometimes previous fields (like redop) can be empty
+                    if self._args.in_place:
+                        time_index = -1 - list(reversed(line)).index('time')
+                        busbw_index = -1 - list(reversed(line)).index('busbw')
+                        algbw_index = -1 - list(reversed(line)).index('algbw')
+                    else:
+                        time_index = line.index('time') - len(line)
+                        busbw_index = line.index('busbw') - len(line)
+                        algbw_index = line.index('algbw') - len(line)
                    break
-            if size_index != -1 and busbw_index != -1 and time_index != -1 and algbw_index != -1:
+            if size_index != -1 and busbw_index is not None and time_index is not None and algbw_index is not None:
                for line in content:
                    line = line.strip(' ')
                    line = re.sub(r' +', ' ', line).split(' ')

--- a/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
@@ -66,6 +66,7 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
        assert (benchmark._args.iters == 20)
        assert (benchmark._args.warmup_iters == 5)
        assert (benchmark._args.graph_iters == 0)
+        assert (benchmark._args.in_place is False)

        # Check command list
        bin_names = [
@@ -91,6 +92,11 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
            'alltoall': alltoall,
        }

+        if 'SB_MODE_SERIAL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_SERIAL_INDEX')
+        if 'SB_MODE_PARALLEL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_PARALLEL_INDEX')
+
        for op in raw_output.keys():
            benchmark._args.operation = op
            assert (benchmark._process_raw_result(0, raw_output[op]))
@@ -131,3 +137,48 @@ class CudaNcclBwBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
        assert (benchmark.result['alltoall_0_0:8589934592_time'][0] == 33508.0)
        assert (benchmark.result['alltoall_0_0:8589934592_algbw'][0] == 256.36)
        assert (benchmark.result['alltoall_0_0:8589934592_busbw'][0] == 224.31)
+
+    @decorator.load_data('tests/data/nccl_allreduce.log')
+    @decorator.load_data('tests/data/nccl_alltoall.log')
+    def test_nccl_bw_performance_in_place_parsing(self, allreduce, alltoall):
+        """Test nccl-bw benchmark in-place parsing."""
+        benchmark_name = 'nccl-bw'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8 --in_place')
+
+        ret = benchmark._preprocess()
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark._args.in_place is True)
+
+        # Case with valid raw_output
+        raw_output = {
+            'allreduce': allreduce,
+            'alltoall': alltoall,
+        }
+
+        if 'SB_MODE_SERIAL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_SERIAL_INDEX')
+        if 'SB_MODE_PARALLEL_INDEX' in os.environ:
+            os.environ.pop('SB_MODE_PARALLEL_INDEX')
+
+        for op in raw_output.keys():
+            benchmark._args.operation = op
+            assert (benchmark._process_raw_result(0, raw_output[op]))
+
+            for name in ['time', 'algbw', 'busbw']:
+                for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
+                    metric = op + '_' + size + '_' + name
+                    assert (metric in benchmark.result)
+                    assert (len(benchmark.result[metric]) == 1)
+                    assert (isinstance(benchmark.result[metric][0], numbers.Number))
+
+        assert (benchmark.result['allreduce_8589934592_time'][0] == 63959.0)
+        assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.30)
+        assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.03)
+        assert (benchmark.result['alltoall_8589934592_time'][0] == 33234.0)
+        assert (benchmark.result['alltoall_8589934592_algbw'][0] == 258.47)
+        assert (benchmark.result['alltoall_8589934592_busbw'][0] == 226.16)