Unverified Commit 8ffaddfa authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Fix Bug - Fix gemm kernel bug for nvidia v100. (#105)

* fix bug for nvidia v100
* hard code the supported dict for different arch.
parent f22bb3f2
...@@ -24,16 +24,25 @@ def __init__(self, name, parameters=''): ...@@ -24,16 +24,25 @@ def __init__(self, name, parameters=''):
self._bin_name = 'cutlass_profiler' self._bin_name = 'cutlass_profiler'
            # TODO - To support more architectures, currently only support compute capability = 7.0 and 8.0
self.__kernel_map = { self.__kernel_map = {
'FP64': 'cutlass_simt_dgemm_128x128_8x2_*', 7.0: {
'FP32': 'cutlass_simt_sgemm_128x128_8x2_*', 'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
'FP16': 'cutlass_simt_hgemm_256x128_8x2_*', 'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*', 'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*', 'FP16_TC': 'cutlass_tensorop_h884gemm_256x128_32x2_*',
'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*', },
'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*', 8.0: {
'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*', 'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*', 'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
}
} }
def add_parser_arguments(self): def add_parser_arguments(self):
...@@ -72,8 +81,8 @@ def add_parser_arguments(self): ...@@ -72,8 +81,8 @@ def add_parser_arguments(self):
'--precision', '--precision',
type=str, type=str,
nargs='+', nargs='+',
default=list(self.__kernel_map.keys()), default=list(),
help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map.keys()))), help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map[8.0].keys()))),
) )
def _preprocess(self): def _preprocess(self):
...@@ -85,32 +94,9 @@ def _preprocess(self): ...@@ -85,32 +94,9 @@ def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
self._args.precision = [p.upper() for p in self._args.precision] # Reset kernels according to compute capability.
for p in self._args.precision:
if p not in list(self.__kernel_map.keys()):
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, list(self.__kernel_map.keys())
)
)
return False
else:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --warmup-iterations=' + str(self._args.num_warmup))
command += (' --operation=gemm')
command += (' --n=' + str(self._args.n))
command += (' --k=' + str(self._args.k))
command += (' --m=' + str(self._args.m))
command += (' --kernels=' + self.__kernel_map[p])
self._commands.append(command)
        # TODO - To support more architectures, currently only support compute capability = 7.0 or 8.0
capability = nv_helper.get_device_compute_capability() capability = nv_helper.get_device_compute_capability()
if capability == 7.0: if capability not in self.__kernel_map:
self.__kernel_map['FP16_TC'] = 'cutlass_tensorop_h884gemm_256x128_32x2_*'
if capability not in [7.0, 8.0]:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE) self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
logger.error( logger.error(
'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format( 'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
...@@ -119,6 +105,36 @@ def _preprocess(self): ...@@ -119,6 +105,36 @@ def _preprocess(self):
) )
return False return False
self.__precision_need_to_run = list()
if len(self._args.precision) == 0:
self.__precision_need_to_run = list(self.__kernel_map[capability].keys())
else:
self._args.precision = [p.upper() for p in self._args.precision]
for p in self._args.precision:
if p not in list(self.__kernel_map.keys()):
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.warning(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, list(self.__kernel_map[capability].keys())
)
)
else:
self.__precision_need_to_run.append(p)
if len(self.__precision_need_to_run) == 0:
self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
return False
for p in self.__precision_need_to_run:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --warmup-iterations=' + str(self._args.num_warmup))
command += (' --operation=gemm')
command += (' --n=' + str(self._args.n))
command += (' --k=' + str(self._args.k))
command += (' --m=' + str(self._args.m))
command += (' --kernels=' + self.__kernel_map[capability][p])
self._commands.append(command)
return True return True
def _process_raw_result(self, cmd_idx, raw_output): def _process_raw_result(self, cmd_idx, raw_output):
...@@ -133,7 +149,7 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -133,7 +149,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return: Return:
True if the raw output string is valid and result can be extracted. True if the raw output string is valid and result can be extracted.
""" """
precision = self._args.precision[cmd_idx] precision = self.__precision_need_to_run[cmd_idx]
self._result.add_raw_data('raw_output_' + precision, raw_output) self._result.add_raw_data('raw_output_' + precision, raw_output)
valid = True valid = True
......
...@@ -60,15 +60,7 @@ def test_flops_performance_cuda(self): ...@@ -60,15 +60,7 @@ def test_flops_performance_cuda(self):
assert (benchmark._args.k == 512) assert (benchmark._args.k == 512)
assert (benchmark._args.m == 2048) assert (benchmark._args.m == 2048)
assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']) assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
benchmark._GemmFlopsCuda__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
# Check the command list.
for i in range(len(benchmark._args.precision)):
command = '{} --warmup-iterations={} --operation=gemm --n={} --k={} --m={} --kernels={}'.format(
benchmark._bin_name, benchmark._args.num_warmup, benchmark._args.n, benchmark._args.k,
benchmark._args.m, benchmark._GemmFlopsCuda__kernel_map[benchmark._args.precision[i]]
)
expected_cmd = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_cmd)
# Check results and metrics. # Check results and metrics.
raw_output_FP32 = """ raw_output_FP32 = """
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment