Benchmarks: Code Revision - Revise arguments of nccl/rccl to support mpi mode...

Benchmarks: Code Revision - Revise arguments of nccl/rccl to support mpi mode and rename metric (#189) **Description** Revise the arguments of the nccl/rccl benchmarks to support mpi mode (mpi mode cannot be used with nccl/rccl because multiple operations run in sequence without a barrier) and rename the metrics. **Major Revision** - revise the `--operations` argument to accept a single operation (`--operation`) **Minor Revision** - rename the metrics to remove the benchmark name info (the `NCCL_` prefix) - change the default value of the `--ngpus` argument to 1

Benchmarks: Code Revision - Revise arguments of nccl/rccl to support mpi mode...
Benchmarks: Code Revision - Revise arguments of nccl/rccl to support mpi mode and rename metric (#189) **Description** Revise the arguments of the nccl/rccl benchmarks to support mpi mode (mpi mode cannot be used with nccl/rccl because multiple operations run in sequence without a barrier) and rename the metrics. **Major Revision** - revise the `--operations` argument to accept a single operation (`--operation`) **Minor Revision** - rename the metrics to remove the benchmark name info (the `NCCL_` prefix) - change the default value of the `--ngpus` argument to 1
60762518 · Yuting Jiang · GitHub · 4e431f11 · 60762518 · 60762518
Unverified Commit 60762518 authored Sep 03, 2021 by Yuting Jiang Committed by GitHub Sep 03, 2021
2 changed files
--- a/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cuda_nccl_bw_performance.py
@@ -41,16 +41,15 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
        super().add_parser_arguments()
        self._parser.add_argument(
-            '--operations',
+            '--operation',
            type=str,
-            nargs='+',
+            default='allreduce',
-            default=list(self.__operations.keys()),
+            help='NCCL operation to benchmark, e.g., {}.'.format(' '.join(list(self.__operations.keys()))),
-            help='Nccl operations to benchmark, e.g., {}.'.format(' '.join(list(self.__operations.keys()))),
        )
        self._parser.add_argument(
            '--ngpus',
            type=int,
-            default=8,
+            default=1,
            help='Number of gpus per thread to run the nccl test.',
        )
        self._parser.add_argument(
@@ -100,29 +99,29 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
            return False
        # Format the arguments
-        self._args.operations = [p.lower() for p in self._args.operations]
+        self._args.operation = self._args.operation.lower()
        # Check the arguments and generate the commands
-        for op in self._args.operations:
+        op = self._args.operation
-            if op not in self.__operations:
+        if op not in self.__operations:
-                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
-                logger.error(
+            logger.error(
-                    'Unsupported operation of NCCL test - benchmark: {}, operation: {}, expected: {}.'.format(
+                'Unsupported operation of NCCL test - benchmark: {}, operation: {}, expected: {}.'.format(
-                        self._name, op, ' '.join(list(self.__operations.keys()))
+                    self._name, op, ' '.join(list(self.__operations.keys()))
-                    )
                )
+            )
+            return False
+        else:
+            self._bin_name = self.__operations[op]
+            if not self._set_binary_path():
                return False
-            else:
-                self._bin_name = self.__operations[op]
+            command = os.path.join(self._args.bin_dir, self._bin_name)
-                if not self._set_binary_path():
+            command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
-                    return False
+                self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus),
+                str(self._args.check), str(self._args.iters), str(self._args.warmup_iters)
-                command = os.path.join(self._args.bin_dir, self._bin_name)
+            )
-                command += ' -b {} -e {} -f {} -g {} -c {} -n {} -w {}'.format(
+            self._commands.append(command)
-                    self._args.minbytes, self._args.maxbytes, str(self._args.stepfactor), str(self._args.ngpus),
-                    str(self._args.check), str(self._args.iters), str(self._args.warmup_iters)
-                )
-                self._commands.append(command)
        return True
@@ -144,7 +143,7 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
            if rank > 0:
                return True
-        self._result.add_raw_data('raw_output_' + self._args.operations[cmd_idx], raw_output)
+        self._result.add_raw_data('raw_output_' + self._args.operation, raw_output)
        content = raw_output.splitlines()
        size = -1
@@ -189,15 +188,9 @@ class CudaNcclBwBenchmark(MicroBenchmarkWithInvoke):
                        busbw_out = float(line[busbw_index])
                        time_out = float(line[time_index])
                        algbw_out = float(line[algbw_index])
-                        self._result.add_result(
+                        self._result.add_result(self._args.operation + '_' + str(size) + '_busbw', busbw_out)
-                            'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_busbw', busbw_out
+                        self._result.add_result(self._args.operation + '_' + str(size) + '_algbw', algbw_out)
-                        )
+                        self._result.add_result(self._args.operation + '_' + str(size) + '_time', time_out)
-                        self._result.add_result(
-                            'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_algbw', algbw_out
-                        )
-                        self._result.add_result(
-                            'NCCL_' + self._args.operations[cmd_idx] + '_' + str(size) + '_time', time_out
-                        )
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(

--- a/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cuda_nccl_bw_performance.py
@@ -39,7 +39,7 @@ class CudaNcclBwBenchmarkTest(unittest.TestCase):
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
        assert (benchmark_class)
-        benchmark = benchmark_class(benchmark_name)
+        benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8')
        ret = benchmark._preprocess()
        assert (ret is True)
@@ -51,11 +51,7 @@ class CudaNcclBwBenchmarkTest(unittest.TestCase):
        assert (benchmark.type == BenchmarkType.MICRO)
        # Check parameters specified in BenchmarkContext.
-        assert (
+        assert (benchmark._args.operation == 'allreduce')
-            benchmark._args.operations == [
-                'allreduce', 'allgather', 'broadcast', 'reduce', 'reducescatter', 'alltoall'
-            ]
-        )
        assert (benchmark._args.ngpus == 8)
        assert (benchmark._args.minbytes == '8')
        assert (benchmark._args.maxbytes == '8G')
@@ -70,10 +66,9 @@ class CudaNcclBwBenchmarkTest(unittest.TestCase):
            'alltoall_perf'
        ]
-        for i in range(len(benchmark._args.operations)):
+        command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
-            command = bin_names[i] + benchmark._commands[i].split(bin_names[i])[1]
+        expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(bin_names[0])
-            expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(bin_names[i])
+        assert (command == expected_command)
-            assert (command == expected_command)
        # Check results and metrics.
        # Case with no raw_output
@@ -411,18 +406,20 @@ hostname:3442:3442 [0] NCCL INFO Launch mode Parallel
 """
-        for i, op in enumerate(benchmark._args.operations):
+        for op in raw_output.keys():
-            assert (benchmark._process_raw_result(i, raw_output[op]))
+            benchmark._args.operation = op
+            assert (benchmark._process_raw_result(0, raw_output[op]))
            for name in ['time', 'algbw', 'busbw']:
                for size in ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']:
-                    metric = 'NCCL_' + op + '_' + size + '_' + name
+                    metric = op + '_' + size + '_' + name
                    assert (metric in benchmark.result)
                    assert (len(benchmark.result[metric]) == 1)
                    assert (isinstance(benchmark.result[metric][0], numbers.Number))
-        assert (benchmark.result['NCCL_allreduce_8589934592_time'][0] == 63896.0)
+        assert (benchmark.result['allreduce_8589934592_time'][0] == 63896.0)
-        assert (benchmark.result['NCCL_allreduce_8589934592_algbw'][0] == 134.44)
+        assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.44)
-        assert (benchmark.result['NCCL_allreduce_8589934592_busbw'][0] == 235.26)
+        assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.26)
-        assert (benchmark.result['NCCL_alltoall_8589934592_time'][0] == 33508.0)
+        assert (benchmark.result['alltoall_8589934592_time'][0] == 33508.0)
-        assert (benchmark.result['NCCL_alltoall_8589934592_algbw'][0] == 256.36)
+        assert (benchmark.result['alltoall_8589934592_algbw'][0] == 256.36)
-        assert (benchmark.result['NCCL_alltoall_8589934592_busbw'][0] == 224.31)
+        assert (benchmark.result['alltoall_8589934592_busbw'][0] == 224.31)