Unverified Commit b0df66f7 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Code Revision - Extract base class for gemm flops microbenchmark (#165)

**Description**
Extract base class for gemm flops microbenchmark.

**Major Revision**
- extract base class for gemm flops microbenchmark and add related test.
- revise gemm_flops_performance for cuda.
parent 35114bae
......@@ -9,7 +9,8 @@
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance_base import GemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.memory_bw_performance_base import MemBwBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
......@@ -19,6 +20,6 @@
__all__ = [
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'MemBwBenchmark', 'CudaMemBwBenchmark', 'DiskBenchmark',
'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark'
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark'
]
......@@ -8,10 +8,10 @@
from superbench.common.utils import logger
from superbench.common.utils import nv_helper
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
from superbench.benchmarks.micro_benchmarks import GemmFlopsBenchmark
class GemmFlopsCuda(MicroBenchmarkWithInvoke):
class CudaGemmFlopsBenchmark(GemmFlopsBenchmark):
"""The GEMM FLOPs performance benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -44,59 +44,30 @@ def __init__(self, name, parameters=''):
'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
}
}
self.__parse_logline = [
'gemm,cutlass_simt_dgemm_128x128_8x2', 'gemm,cutlass_simt_sgemm_128x128_8x2',
'gemm,cutlass_simt_hgemm_256x128_8x2', 'gemm,cutlass_tensorop_d884gemm_128x128_16x3',
'gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3',
'gemm,cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3', 'gemm,cutlass_tensorop_h16816gemm_256x128_32x3',
'gemm,cutlass_tensorop_h884gemm_256x128_32x2', 'gemm,cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3',
'gemm,cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3'
]
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
self._parser.add_argument(
'--num_warmup',
type=int,
default=5,
required=False,
help='The number of warmup step.',
)
self._parser.add_argument(
'--n',
type=int,
default=16384,
required=False,
help='The N dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--k',
type=int,
default=16384,
required=False,
help='The K dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--m',
type=int,
default=16384,
required=False,
help='The M dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--precision',
type=str,
nargs='+',
default=list(),
help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map[8.0].keys()))),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
# Reset kernels according to compute capability.
capability = nv_helper.get_device_compute_capability()
if capability not in self.__kernel_map:
# After preprocess() self._result.return_code can be generated
super()._preprocess()
self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
logger.error(
'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
......@@ -105,27 +76,13 @@ def _preprocess(self):
)
return False
self.__precision_need_to_run = list()
if len(self._args.precision) == 0:
self.__precision_need_to_run = list(self.__kernel_map[capability].keys())
else:
self._args.precision = [p.upper() for p in self._args.precision]
for p in self._args.precision:
if p not in self.__kernel_map[capability]:
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.warning(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, list(self.__kernel_map[capability].keys())
)
)
else:
self.__precision_need_to_run.append(p)
if len(self.__precision_need_to_run) == 0:
self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
self._support_precisions = list(self.__kernel_map[capability].keys())
# 'support_precisions' are precise only after getting capability,
# and then using super.preprocess() to check if the precision in arguments are supported to run
if not super()._preprocess():
return False
for p in self.__precision_need_to_run:
for p in self._precision_need_to_run:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --warmup-iterations=' + str(self._args.num_warmup))
command += (' --operation=gemm')
......@@ -149,7 +106,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
precision = self.__precision_need_to_run[cmd_idx]
precision = self._precision_need_to_run[cmd_idx]
self._result.add_raw_data('raw_output_' + precision, raw_output)
valid = True
......@@ -157,17 +114,9 @@ def _process_raw_result(self, cmd_idx, raw_output):
content = raw_output.splitlines()
try:
for line in content:
if 'gemm,cutlass_simt_dgemm_128x128_8x2' in line or \
'gemm,cutlass_simt_sgemm_128x128_8x2' in line or \
'gemm,cutlass_simt_hgemm_256x128_8x2' in line or \
'gemm,cutlass_tensorop_d884gemm_128x128_16x3' in line or \
'gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3' in line or \
'gemm,cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3' in line or \
'gemm,cutlass_tensorop_h16816gemm_256x128_32x3' in line or \
'gemm,cutlass_tensorop_h884gemm_256x128_32x2' in line or \
'gemm,cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3' in line or \
'gemm,cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3' in line:
flops.append(float(line.split(',')[-1]))
for item in self.__parse_logline:
if item in line:
flops.append(float(line.split(',')[-1]))
except BaseException:
valid = False
finally:
......@@ -184,4 +133,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True
BenchmarkRegistry.register_benchmark('gemm-flops', GemmFlopsCuda, platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('gemm-flops', CudaGemmFlopsBenchmark, platform=Platform.CUDA)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the FLOPs performance benchmark base class."""
from superbench.common.utils import logger
from superbench.benchmarks import ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GemmFlopsBenchmark(MicroBenchmarkWithInvoke):
    """The GEMM FLOPs performance benchmark base class.

    Subclasses are expected to populate self._commands (one command per entry of
    self._precision_need_to_run) and to parse the tool's raw output. They may
    also narrow self._support_precisions (e.g. based on detected hardware)
    before calling super()._preprocess().
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # All precisions supported by default; subclasses may replace this.
        self._support_precisions = [
            'FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC'
        ]
        # Precisions that will actually be benchmarked, resolved in _preprocess().
        self._precision_need_to_run = list()

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=5,
            required=False,
            help='The number of warmup steps.',
        )
        self._parser.add_argument(
            '--n',
            type=int,
            default=16384,
            required=False,
            help='The N dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--k',
            type=int,
            default=16384,
            required=False,
            help='The K dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--m',
            type=int,
            default=16384,
            required=False,
            help='The M dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--precision',
            type=str,
            nargs='+',
            default=list(),
            help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Resolves self._precision_need_to_run from the --precision argument,
        warning on (and skipping) unsupported precisions.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if len(self._args.precision) == 0:
            # Copy instead of aliasing - otherwise mutating one list would
            # silently mutate the other.
            self._precision_need_to_run = list(self._support_precisions)
        else:
            self._args.precision = [p.upper() for p in self._args.precision]
            for p in self._args.precision:
                if p not in self._support_precisions:
                    logger.warning(
                        'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
                            self._name, p, self._support_precisions
                        )
                    )
                else:
                    self._precision_need_to_run.append(p)

        if len(self._precision_need_to_run) == 0:
            self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
            return False

        return True
......@@ -12,8 +12,8 @@
from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform, BenchmarkType
class GemmFlopsCudaTest(unittest.TestCase):
"""Tests for GemmFlopsCuda benchmark."""
class CudaGemmFlopsBenchmarkTest(unittest.TestCase):
"""Tests for CudaGemmFlopsBenchmark benchmark."""
def setUp(self):
"""Method called to prepare the test fixture."""
# Create fake binary file just for testing.
......@@ -60,7 +60,7 @@ def test_flops_performance_cuda(self):
assert (benchmark._args.k == 512)
assert (benchmark._args.m == 2048)
assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
benchmark._GemmFlopsCuda__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
# Check results and metrics.
raw_output_FP32 = """
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for GemmFlopsBenchmark modules."""
import os
from superbench.benchmarks import BenchmarkType, ReturnCode
from superbench.benchmarks.micro_benchmarks import GemmFlopsBenchmark
class FakeGemmFlopsBenchmark(GemmFlopsBenchmark):
    """Fake benchmark inherit from GemmFlopsBenchmark."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name: benchmark name.
            parameters: benchmark parameters.
        """
        super().__init__(name, parameters)
        # Use 'echo' so each command simply prints its own arguments back.
        self._bin_name = 'echo'

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Build one echo command per precision; the echoed arguments are
        # verified later in _process_raw_result().
        binary = os.path.join(self._args.bin_dir, self._bin_name)
        for precision in self._precision_need_to_run:
            arg_text = ' --m {} --n {} --k {} --num_warmup {}'.format(
                self._args.m, self._args.n, self._args.k, self._args.num_warmup
            )
            self._commands.append(binary + ' "--precision ' + precision + arg_text + '"')

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to process raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)

        expected_precision = self._precision_need_to_run[cmd_idx]
        try:
            # Each echoed '--key value' pair is checked; the precision echoed
            # back must match the precision this command was built for.
            for segment in raw_output.strip('\n').split('--')[1:]:
                tokens = segment.split()
                if tokens[0] == 'precision' and tokens[1] != expected_precision:
                    return False
            metric = expected_precision
        except BaseException:
            return False

        self._result.add_result(metric, 0)
        return True
def test_gemm_flops_performance_base():
    """Test GemmFlopsBenchmark."""
    # Positive case - all supported precisions run by default.
    benchmark = FakeGemmFlopsBenchmark('fake')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run())
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Check command list.
    expected_command = [
        'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP64_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision TF32_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision BF16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision INT8_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision INT4_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"'
    ]
    for i in range(len(expected_command)):
        # Strip the bin_dir prefix so the comparison is path-independent.
        command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
        assert (command == expected_command[i])

    for metric in ['FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC']:
        assert (metric in benchmark.result)
        assert (len(benchmark.result[metric]) == 1)

    # Positive case - only the precisions given in arguments run.
    benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 FP32 FP16')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run())
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Check command list.
    expected_command = [
        'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"'
    ]
    for i in range(len(expected_command)):
        command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
        assert (command == expected_command[i])

    for metric in ['FP64', 'FP32', 'FP16']:
        assert (metric in benchmark.result)
        assert (len(benchmark.result[metric]) == 1)

    # Positive case - an unsupported precision is skipped with a warning
    # as long as at least one supported precision remains.
    benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 BF64')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run() is True)

    # Negative case - NO_SUPPORTED_PRECISION when every requested precision is unsupported.
    benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision BF64')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run() is False)
    assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment