# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the FLOPs performance benchmarks."""

import os

from superbench.common.utils import logger
from superbench.common.utils import nv_helper
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class GemmFlopsCuda(MicroBenchmarkWithInvoke):
    """The GEMM FLOPs performance benchmark class.

    One `cutlass_profiler` command is built per precision to run; the raw output of each
    command is then scanned for the result lines of the known kernels.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'cutlass_profiler'

        # TODO - To support more architectures, currently only support compute capability = 7.0 and 8.0
        # Maps compute capability -> {precision name -> kernel name pattern passed to '--kernels'}.
        self.__kernel_map = {
            7.0: {
                'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
                'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
                'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
                'FP16_TC': 'cutlass_tensorop_h884gemm_256x128_32x2_*',
            },
            8.0: {
                'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
                'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
                'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
                'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
                'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
                'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
                'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
                'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
                'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
            }
        }

        # Substrings identifying a result line in the raw output, e.g.
        # 'gemm,cutlass_simt_dgemm_128x128_8x2'. Derived from the kernel map (pattern with the
        # trailing '_*' wildcard removed) so the parser cannot drift from the kernels launched.
        self.__result_markers = tuple(
            sorted(
                {
                    'gemm,' + kernel.rstrip('_*')
                    for kernels in self.__kernel_map.values() for kernel in kernels.values()
                }
            )
        )

        # Precisions that will actually be run; filled in by _preprocess(), one command each.
        self.__precision_need_to_run = []

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=5,
            required=False,
            help='The number of warmup step.',
        )
        self._parser.add_argument(
            '--n',
            type=int,
            default=16384,
            required=False,
            help='The N dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--k',
            type=int,
            default=16384,
            required=False,
            help='The K dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--m',
            type=int,
            default=16384,
            required=False,
            help='The M dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--precision',
            type=str,
            nargs='+',
            default=list(),
            help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map[8.0].keys()))),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Selects the kernel set for the device's compute capability, resolves which precisions
        to run (all supported ones when '--precision' is empty), and appends one
        `cutlass_profiler` command per precision to `self._commands`.

        Return:
            True if _preprocess() succeed. False if the parent preprocess fails, the compute
            capability is unsupported, or no requested precision is supported.
        """
        if not super()._preprocess():
            return False

        # Reset kernels according to compute capability.
        capability = nv_helper.get_device_compute_capability()
        if capability not in self.__kernel_map:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
            logger.error(
                'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
                    self._name, capability
                )
            )
            return False

        supported_kernels = self.__kernel_map[capability]

        self.__precision_need_to_run = list()
        if not self._args.precision:
            self.__precision_need_to_run = list(supported_kernels.keys())
        else:
            self._args.precision = [p.upper() for p in self._args.precision]
            for p in self._args.precision:
                # Validate against the precisions of THIS capability. The outer keys of
                # __kernel_map are compute capabilities, so checking against them would
                # reject every precision name.
                if p not in supported_kernels:
                    self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
                    logger.warning(
                        'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
                            self._name, p, list(supported_kernels.keys())
                        )
                    )
                else:
                    self.__precision_need_to_run.append(p)

        if not self.__precision_need_to_run:
            self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
            return False

        # Command order matches __precision_need_to_run so that _process_raw_result() can map
        # cmd_idx back to a precision.
        for p in self.__precision_need_to_run:
            command = os.path.join(self._args.bin_dir, self._bin_name)
            command += (' --warmup-iterations=' + str(self._args.num_warmup))
            command += (' --operation=gemm')
            command += (' --n=' + str(self._args.n))
            command += (' --k=' + str(self._args.k))
            command += (' --m=' + str(self._args.m))
            command += (' --kernels=' + supported_kernels[p])
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Every output line containing one of the known 'gemm,<kernel>' markers has its last
        comma-separated field parsed as a float; the maximum of those values is stored as the
        result for the precision of this command.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        precision = self.__precision_need_to_run[cmd_idx]
        self._result.add_raw_data('raw_output_' + precision, raw_output)

        flops = []
        try:
            for line in raw_output.splitlines():
                if any(marker in line for marker in self.__result_markers):
                    flops.append(float(line.split(',')[-1]))
        except (TypeError, ValueError):
            # Malformed result line: treat the whole output as invalid. Only parse errors are
            # caught so that KeyboardInterrupt / SystemExit still propagate.
            flops = []

        if not flops:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        self._result.add_result(precision, max(flops))

        return True


BenchmarkRegistry.register_benchmark('gemm-flops', GemmFlopsCuda, platform=Platform.CUDA)