Unverified commit 616e7a5a authored by Yifan Xiong, committed by GitHub

Benchmarks - Integrate cublaslt micro-benchmark (#455)

Integrate cublaslt-gemm micro-benchmark #451.
parent 75573f59
@@ -58,6 +58,18 @@ Large scale matmul operation using `torch.matmul` with one GPU.
 |--------------------------------|-----------|--------------------------------|
 | pytorch-matmul/nosharding_time | time (ms) | Time of pure matmul operation. |
+
+### `cublaslt-gemm`
+
+#### Introduction
+
+Measure the GEMM performance of [`cublasLtMatmul`](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul).
+
+#### Metrics
+
+| Name                            | Unit           | Description                     |
+|---------------------------------|----------------|---------------------------------|
+| cublaslt-gemm/dtype_m_n_k_flops | FLOPS (TFLOPS) | TFLOPS of measured GEMM kernel. |
 
 ### `cublas-function`
 #### Introduction
......
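Note on the metric name: `dtype`, `m`, `n` and `k` are placeholders filled in per run from the benchmark arguments; for example, the default `fp8e4m3` input type with a 4096x4096x4096 shape is reported as `cublaslt-gemm/fp8e4m3_4096_4096_4096_flops`.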
superbench/benchmarks/micro_benchmarks/__init__.py

@@ -9,6 +9,7 @@
 from superbench.benchmarks.micro_benchmarks.computation_communication_overlap import ComputationCommunicationOverlap
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
+from superbench.benchmarks.micro_benchmarks.cublaslt_function import CublasLtBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
 from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
@@ -30,17 +31,18 @@
 __all__ = [
     'ComputationCommunicationOverlap',
+    'CpuMemBwLatencyBenchmark',
     'CublasBenchmark',
+    'CublasLtBenchmark',
     'CudaGemmFlopsBenchmark',
     'CudaMemBwBenchmark',
     'CudaNcclBwBenchmark',
     'CudnnBenchmark',
     'DiskBenchmark',
-    'CpuMemBwLatencyBenchmark',
     'GPCNetBenchmark',
     'GemmFlopsBenchmark',
-    'GpuCopyBwBenchmark',
     'GpuBurnBenchmark',
+    'GpuCopyBwBenchmark',
     'IBBenchmark',
     'IBLoopbackBenchmark',
     'KernelLaunch',
......
superbench/benchmarks/micro_benchmarks/cublaslt_function.py (new file)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the cuBLASLt GEMM benchmark."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class CublasLtBenchmark(MicroBenchmarkWithInvoke):
    """The cuBLASLt GEMM benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'cublaslt_fp8_gemm'
        self._in_types = ['fp16', 'fp8e4m3', 'fp8e5m2']
    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--shapes',
            type=str,
            nargs='+',
            default=[f'{x},{x},{x}' for x in [2048, 4096, 8192]],
            help='Shapes in m,n,k format.',
        )
        self._parser.add_argument(
            '--batch',
            type=int,
            default=0,
            required=False,
            help='Batch size for strided batch GEMM, set 0 to disable.',
        )
        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=20,
            required=False,
            help='Number of warm up steps.',
        )
        self._parser.add_argument(
            '--num_steps',
            type=int,
            default=50,
            required=False,
            help='Number of steps to measure.',
        )
        self._parser.add_argument(
            '--in_type',
            type=str,
            default='fp8e4m3',
            required=False,
            help='Input data type, supports {}.'.format(' '.join(self._in_types)),
        )
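        # The default `--shapes` comprehension above expands to
        # ['2048,2048,2048', '4096,4096,4096', '8192,8192,8192'].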
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

        if self._args.in_type not in self._in_types:
            logger.error(f'Invalid input type {self._args.in_type}.')
            return False

        self._commands = []
        for shape in self._args.shapes:
            shape_list = shape.replace(',', ' ').split()
            if len(shape_list) != 3 or not all(x.isdigit() for x in shape_list):
                logger.error(f'Invalid shape {shape}.')
                return False
            self._commands.append(
                f'{self.__bin_path} -m {shape_list[0]} -n {shape_list[1]} -k {shape_list[2]} '
                f'-b {self._args.batch} -w {self._args.num_warmup} -i {self._args.num_steps} -t {self._args.in_type}'
            )

        return True
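    # Example (assuming the default arguments and a hypothetical bin_dir of
    # /opt/superbench/bin): each shape yields one command such as
    #   /opt/superbench/bin/cublaslt_fp8_gemm -m 2048 -n 2048 -k 2048 -b 0 -w 20 -i 50 -t fp8e4m3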
    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)

        try:
            fields = raw_output.strip().split()
            if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]):
                raise ValueError('Invalid result.')
            self._result.add_result(f'{self._args.in_type}_{"_".join(fields[:3])}_flops', float(fields[-1]))
        except BaseException as e:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True
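    # Example (inferred from the parser above): a valid raw output line looks like
    #   '4096 4096 4096 0 1.234 987.6'
    # i.e. m, n, k and batch as integers followed by two floating-point fields,
    # the last of which is recorded as '<in_type>_<m>_<n>_<k>_flops'.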

BenchmarkRegistry.register_benchmark('cublaslt-gemm', CublasLtBenchmark, platform=Platform.CUDA)
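For reference, a minimal usage sketch (not part of this commit) showing how the benchmark registered above could be launched through the existing BenchmarkRegistry API, assuming the cublaslt_fp8_gemm binary has been built and is available in the configured bin directory:

    from superbench.benchmarks import BenchmarkRegistry, Platform

    # Create a context for the registered benchmark, overriding the default
    # shapes and input type via the parameter string.
    context = BenchmarkRegistry.create_benchmark_context(
        'cublaslt-gemm',
        platform=Platform.CUDA,
        parameters='--shapes 4096,4096,4096 --in_type fp16',
    )

    # launch_benchmark() runs _preprocess(), invokes the binary once per
    # generated command, and parses the raw output into benchmark.result.
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        print(benchmark.result)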
New test file:

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Tests for cublaslt-gemm benchmark."""

import unittest
from types import SimpleNamespace

from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
from superbench.benchmarks.result import BenchmarkResult
class CublasLtBenchmarkTestCase(BenchmarkTestCase, unittest.TestCase):
    """Class for cublaslt-gemm benchmark test cases."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.benchmark_name = 'cublaslt-gemm'
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/cublaslt_fp8_gemm'])
    def test_cublaslt_gemm_cls(self):
        """Test cublaslt-gemm benchmark class."""
        for platform in Platform:
            (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, platform)
            if platform is Platform.CUDA:
                self.assertIsNotNone(benchmark_cls)
            else:
                self.assertIsNone(benchmark_cls)
    def test_cublaslt_gemm_result_parsing(self):
        """Test cublaslt-gemm benchmark result parsing."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
        benchmark = benchmark_cls(self.benchmark_name, parameters='')
        benchmark._args = SimpleNamespace(shapes=['16,16,16', '32,64,128'], in_type='fp8e4m3', log_raw_data=False)
        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)

        # Positive case - valid raw output.
        self.assertTrue(benchmark._process_raw_result(0, '16 16 16 0 1.111 2.222'))
        self.assertTrue(benchmark._process_raw_result(1, '32 64 128 0 1.111 2.222'))
        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

        self.assertEqual(3, len(benchmark.result))
        for shape in benchmark._args.shapes:
            self.assertEqual(2.222, benchmark.result[f'fp8e4m3_{shape.replace(",", "_")}_flops'][0])

        # Negative case - invalid raw output.
        self.assertFalse(benchmark._process_raw_result(1, 'cuBLAS API failed'))