Benchmarks - Integrate cublaslt micro-benchmark (#455)

Integrate cublaslt-gemm micro-benchmark #451.

Benchmarks - Integrate cublaslt micro-benchmark (#455)
Integrate cublaslt-gemm micro-benchmark #451.
616e7a5a · Yifan Xiong · GitHub · 75573f59 · 616e7a5a · 616e7a5a
Unverified Commit 616e7a5a authored Jan 03, 2023 by Yifan Xiong Committed by GitHub Jan 03, 2023
4 changed files
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -58,6 +58,18 @@ Large scale matmul operation using `torch.matmul` with one GPU.
 |--------------------------------|-----------|--------------------------------|
 | pytorch-matmul/nosharding_time | time (ms) | Time of pure matmul operation. |
+### `cublaslt-gemm`
+#### Introduction
+Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul).
+#### Metrics
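+| Name                            | Unit           | Description                                                                                       |
+|---------------------------------|----------------|---------------------------------------------------------------------------------------------------|
+| cublaslt-gemm/dtype_m_n_k_flops | FLOPS (TFLOPS) | TFLOPS of the measured GEMM kernel, where `dtype` is the input type and `m`, `n`, `k` the shape. |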
 ### `cublas-function`
 #### Introduction

--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -9,6 +9,7 @@ from superbench.benchmarks.micro_benchmarks.memory_bw_performance_base import Me
 from superbench.benchmarks.micro_benchmarks.computation_communication_overlap import ComputationCommunicationOverlap
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
+from superbench.benchmarks.micro_benchmarks.cublaslt_function import CublasLtBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
@@ -30,17 +31,18 @@ from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance impor
 __all__ = [
    'ComputationCommunicationOverlap',
+    'CpuMemBwLatencyBenchmark',
    'CublasBenchmark',
+    'CublasLtBenchmark',
    'CudaGemmFlopsBenchmark',
    'CudaMemBwBenchmark',
    'CudaNcclBwBenchmark',
    'CudnnBenchmark',
    'DiskBenchmark',
-    'CpuMemBwLatencyBenchmark',
    'GPCNetBenchmark',
    'GemmFlopsBenchmark',
-    'GpuCopyBwBenchmark',
    'GpuBurnBenchmark',
+    'GpuCopyBwBenchmark',
    'IBBenchmark',
    'IBLoopbackBenchmark',
    'KernelLaunch',

--- a/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
+++ b/superbench/benchmarks/micro_benchmarks/cublaslt_function.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+"""Module of the cuBLASLt GEMM benchmark."""
+import os
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+class CublasLtBenchmark(MicroBenchmarkWithInvoke):
+    """The cuBLASLt GEMM benchmark class.
+    Builds one command line for the `cublaslt_fp8_gemm` binary per requested shape and
+    records the last field of each command's output as an `<in_type>_<m>_<n>_<k>_flops` result.
+    """
+    def __init__(self, name, parameters=''):
+        """Constructor.
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        # Name of the executable, joined with the bin_dir argument in _preprocess().
+        self._bin_name = 'cublaslt_fp8_gemm'
+        # Input data types accepted by --in_type; anything else is rejected in _preprocess().
+        self._in_types = ['fp16', 'fp8e4m3', 'fp8e5m2']
+    def add_parser_arguments(self):
+        """Add the benchmark specific arguments on top of the ones added by the parent class.
+        Adds --shapes, --batch, --num_warmup, --num_steps and --in_type.
+        """
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--shapes',
+            type=str,
+            nargs='+',
+            default=[f'{x},{x},{x}' for x in [2048, 4096, 8192]],
+            help='Shapes in m,n,k format.',
+        )
+        self._parser.add_argument(
+            '--batch',
+            type=int,
+            default=0,
+            required=False,
+            help='Batch size for strided batch GEMM, set 0 to disable.',
+        )
+        self._parser.add_argument(
+            '--num_warmup',
+            type=int,
+            default=20,
+            required=False,
+            help='Number of warm up steps.',
+        )
+        self._parser.add_argument(
+            '--num_steps',
+            type=int,
+            default=50,
+            required=False,
+            help='Number of steps to measure.',
+        )
+        self._parser.add_argument(
+            '--in_type',
+            type=str,
+            default='fp8e4m3',
+            required=False,
+            help='Input data type, supports {}.'.format(' '.join(self._in_types)),
+        )
+    def _preprocess(self):
+        """Validate the arguments and build one command line per requested shape.
+        Return:
+            True if the parent preprocessing succeeds, the input type is supported and
+            every shape consists of exactly three digit-only values; False otherwise.
+        """
+        if not super()._preprocess():
+            return False
+        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+        if self._args.in_type not in self._in_types:
+            logger.error(f'Invalid input type {self._args.in_type}.')
+            return False
+        self._commands = []
+        for shape in self._args.shapes:
+            # Commas are turned into spaces first, so both 'm,n,k' and 'm n k' split into three fields.
+            shape_list = shape.replace(',', ' ').split()
+            if len(shape_list) != 3 or not all(x.isdigit() for x in shape_list):
+                logger.error(f'Invalid shape {shape}.')
+                return False
+            self._commands.append(
+                f'{self.__bin_path} -m {shape_list[0]} -n {shape_list[1]} -k {shape_list[2]} '
+                f'-b {self._args.batch} -w {self._args.num_warmup} -i {self._args.num_steps} -t {self._args.in_type}'
+            )
+        return True
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+        The output must consist of exactly six whitespace-separated fields, the first four of
+        which are digit-only. The first three fields become part of the metric name and the
+        last field is stored as the metric value.
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        try:
+            fields = raw_output.strip().split()
+            if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
+                raise ValueError('Invalid result.')
+            self._result.add_result(f'{self._args.in_type}_{"_".join(fields[:3])}_flops', float(fields[-1]))
+        # Catch Exception rather than BaseException so that KeyboardInterrupt and SystemExit
+        # propagate instead of being reported as a result parsing failure.
+        except Exception as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+        return True
+# Register the benchmark class under the name 'cublaslt-gemm' for the CUDA platform.
+BenchmarkRegistry.register_benchmark('cublaslt-gemm', CublasLtBenchmark, platform=Platform.CUDA)
--- a/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
+++ b/tests/benchmarks/micro_benchmarks/test_cublaslt_function.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""Tests for cublaslt-gemm benchmark."""
+import unittest
+from types import SimpleNamespace
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+from superbench.benchmarks.result import BenchmarkResult
+class CublasLtBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
+    """Test cases covering registration and result parsing of the cublaslt-gemm benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Set up the class fixture: benchmark name, mock environment and mock binary file."""
+        super().setUpClass()
+        cls.benchmark_name = 'cublaslt-gemm'
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/cublaslt_fp8_gemm'])
+    def test_cublaslt_gemm_cls(self):
+        """Check that the benchmark class is selected for CUDA and for no other platform."""
+        select_benchmark = BenchmarkRegistry._BenchmarkRegistry__select_benchmark
+        for platform in Platform:
+            selected_cls, _ = select_benchmark(self.benchmark_name, platform)
+            if platform is not Platform.CUDA:
+                self.assertIsNone(selected_cls)
+                continue
+            self.assertIsNotNone(selected_cls)
+    def test_cublaslt_gemm_result_parsing(self):
+        """Check that valid raw output yields flops results and invalid raw output is rejected."""
+        select_benchmark = BenchmarkRegistry._BenchmarkRegistry__select_benchmark
+        selected_cls, _ = select_benchmark(self.benchmark_name, Platform.CUDA)
+        benchmark = selected_cls(self.benchmark_name, parameters='')
+        # Inject the arguments and the result holder directly instead of running the preprocess step.
+        benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_type='fp8e4m3', log_raw_data=False)
+        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
+        # Positive case - one valid raw output line per shape, in the same order as the shapes.
+        valid_outputs = [
+            '16   16    16    0       1.111      2.222',
+            '32   64    128    0       1.111      2.222',
+        ]
+        for cmd_idx, raw_output in enumerate(valid_outputs):
+            self.assertTrue(benchmark._process_raw_result(cmd_idx, raw_output))
+        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
+        # NOTE(review): 3 is presumably the two flops metrics plus one built-in entry - confirm against BenchmarkResult.
+        self.assertEqual(3, len(benchmark.result))
+        for shape in benchmark._args.shapes:
+            metric = 'fp8e4m3_' + '_'.join(shape.split(',')) + '_flops'
+            self.assertEqual(2.222, benchmark.result[metric][0])
+        # Negative case - raw output that is not a result line must be rejected.
+        self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))