Unverified Commit b0df66f7 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Code Revision - Extract base class for gemm flops microbenchmark (#165)

**Description**
Extract base class for gemm flops microbenchmark.

**Major Revision**
- extract base class for gemm flops microbenchmark and add related test.
- revise gemm_flops_performance for cuda.
parent 35114bae
......@@ -9,7 +9,8 @@
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance_base import GemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.memory_bw_performance_base import MemBwBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
......@@ -19,6 +20,6 @@
__all__ = [
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'MemBwBenchmark', 'CudaMemBwBenchmark', 'DiskBenchmark',
'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark'
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark'
]
......@@ -8,10 +8,10 @@
from superbench.common.utils import logger
from superbench.common.utils import nv_helper
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
from superbench.benchmarks.micro_benchmarks import GemmFlopsBenchmark
class GemmFlopsCuda(MicroBenchmarkWithInvoke):
class CudaGemmFlopsBenchmark(GemmFlopsBenchmark):
"""The GEMM FLOPs performance benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -44,59 +44,30 @@ def __init__(self, name, parameters=''):
'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
}
}
self.__parse_logline = [
'gemm,cutlass_simt_dgemm_128x128_8x2', 'gemm,cutlass_simt_sgemm_128x128_8x2',
'gemm,cutlass_simt_hgemm_256x128_8x2', 'gemm,cutlass_tensorop_d884gemm_128x128_16x3',
'gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3',
'gemm,cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3', 'gemm,cutlass_tensorop_h16816gemm_256x128_32x3',
'gemm,cutlass_tensorop_h884gemm_256x128_32x2', 'gemm,cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3',
'gemm,cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3'
]
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
self._parser.add_argument(
'--num_warmup',
type=int,
default=5,
required=False,
help='The number of warmup step.',
)
self._parser.add_argument(
'--n',
type=int,
default=16384,
required=False,
help='The N dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--k',
type=int,
default=16384,
required=False,
help='The K dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--m',
type=int,
default=16384,
required=False,
help='The M dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--precision',
type=str,
nargs='+',
default=list(),
help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map[8.0].keys()))),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
# Reset kernels according to compute capability.
capability = nv_helper.get_device_compute_capability()
if capability not in self.__kernel_map:
# After preprocess() self._result.return_code can be generated
super()._preprocess()
self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
logger.error(
'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
......@@ -105,27 +76,13 @@ def _preprocess(self):
)
return False
self.__precision_need_to_run = list()
if len(self._args.precision) == 0:
self.__precision_need_to_run = list(self.__kernel_map[capability].keys())
else:
self._args.precision = [p.upper() for p in self._args.precision]
for p in self._args.precision:
if p not in self.__kernel_map[capability]:
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.warning(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, list(self.__kernel_map[capability].keys())
)
)
else:
self.__precision_need_to_run.append(p)
if len(self.__precision_need_to_run) == 0:
self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
self._support_precisions = list(self.__kernel_map[capability].keys())
# 'support_precisions' are precise only after getting capability,
# and then using super.preprocess() to check if the precision in arguments are supported to run
if not super()._preprocess():
return False
for p in self.__precision_need_to_run:
for p in self._precision_need_to_run:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --warmup-iterations=' + str(self._args.num_warmup))
command += (' --operation=gemm')
......@@ -149,7 +106,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return:
True if the raw output string is valid and result can be extracted.
"""
precision = self.__precision_need_to_run[cmd_idx]
precision = self._precision_need_to_run[cmd_idx]
self._result.add_raw_data('raw_output_' + precision, raw_output)
valid = True
......@@ -157,17 +114,9 @@ def _process_raw_result(self, cmd_idx, raw_output):
content = raw_output.splitlines()
try:
for line in content:
if 'gemm,cutlass_simt_dgemm_128x128_8x2' in line or \
'gemm,cutlass_simt_sgemm_128x128_8x2' in line or \
'gemm,cutlass_simt_hgemm_256x128_8x2' in line or \
'gemm,cutlass_tensorop_d884gemm_128x128_16x3' in line or \
'gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3' in line or \
'gemm,cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3' in line or \
'gemm,cutlass_tensorop_h16816gemm_256x128_32x3' in line or \
'gemm,cutlass_tensorop_h884gemm_256x128_32x2' in line or \
'gemm,cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3' in line or \
'gemm,cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3' in line:
flops.append(float(line.split(',')[-1]))
for item in self.__parse_logline:
if item in line:
flops.append(float(line.split(',')[-1]))
except BaseException:
valid = False
finally:
......@@ -184,4 +133,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True
BenchmarkRegistry.register_benchmark('gemm-flops', GemmFlopsCuda, platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('gemm-flops', CudaGemmFlopsBenchmark, platform=Platform.CUDA)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the FLOPs performance benchmark base class."""
from superbench.common.utils import logger
from superbench.benchmarks import ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GemmFlopsBenchmark(MicroBenchmarkWithInvoke):
    """The GEMM FLOPs performance benchmark base class.

    Subclasses are expected to populate self._commands (one command per entry of
    self._precision_need_to_run) and to parse the tool's raw output. They may
    also narrow self._support_precisions (e.g. based on detected hardware)
    before calling super()._preprocess().
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # All precisions supported by default; subclasses may replace this.
        self._support_precisions = [
            'FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC'
        ]
        # Precisions that will actually be benchmarked, resolved in _preprocess().
        self._precision_need_to_run = list()

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=5,
            required=False,
            help='The number of warmup steps.',
        )
        self._parser.add_argument(
            '--n',
            type=int,
            default=16384,
            required=False,
            help='The N dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--k',
            type=int,
            default=16384,
            required=False,
            help='The K dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--m',
            type=int,
            default=16384,
            required=False,
            help='The M dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--precision',
            type=str,
            nargs='+',
            default=list(),
            help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Resolves self._precision_need_to_run from the --precision argument,
        warning on (and skipping) unsupported precisions.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if len(self._args.precision) == 0:
            # Copy instead of aliasing - otherwise mutating one list would
            # silently mutate the other.
            self._precision_need_to_run = list(self._support_precisions)
        else:
            self._args.precision = [p.upper() for p in self._args.precision]
            for p in self._args.precision:
                if p not in self._support_precisions:
                    logger.warning(
                        'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
                            self._name, p, self._support_precisions
                        )
                    )
                else:
                    self._precision_need_to_run.append(p)

        if len(self._precision_need_to_run) == 0:
            self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
            return False

        return True
......@@ -12,8 +12,8 @@
from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform, BenchmarkType
class GemmFlopsCudaTest(unittest.TestCase):
"""Tests for GemmFlopsCuda benchmark."""
class CudaGemmFlopsBenchmarkTest(unittest.TestCase):
"""Tests for CudaGemmFlopsBenchmark benchmark."""
def setUp(self):
"""Method called to prepare the test fixture."""
# Create fake binary file just for testing.
......@@ -60,7 +60,7 @@ def test_flops_performance_cuda(self):
assert (benchmark._args.k == 512)
assert (benchmark._args.m == 2048)
assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
benchmark._GemmFlopsCuda__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
# Check results and metrics.
raw_output_FP32 = """
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for GemmFlopsBenchmark modules."""
import os
from superbench.benchmarks import BenchmarkType, ReturnCode
from superbench.benchmarks.micro_benchmarks import GemmFlopsBenchmark
class FakeGemmFlopsBenchmark(GemmFlopsBenchmark):
    """Fake benchmark inherit from GemmFlopsBenchmark."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name: benchmark name.
            parameters: benchmark parameters.
        """
        super().__init__(name, parameters)
        # Use 'echo' so each command simply prints its own arguments back.
        self._bin_name = 'echo'

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Build one echo command per precision; the echoed arguments are
        # verified later in _process_raw_result().
        binary = os.path.join(self._args.bin_dir, self._bin_name)
        for precision in self._precision_need_to_run:
            arg_text = ' --m {} --n {} --k {} --num_warmup {}'.format(
                self._args.m, self._args.n, self._args.k, self._args.num_warmup
            )
            self._commands.append(binary + ' "--precision ' + precision + arg_text + '"')

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to process raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)

        expected_precision = self._precision_need_to_run[cmd_idx]
        try:
            # Each echoed '--key value' pair is checked; the precision echoed
            # back must match the precision this command was built for.
            for segment in raw_output.strip('\n').split('--')[1:]:
                tokens = segment.split()
                if tokens[0] == 'precision' and tokens[1] != expected_precision:
                    return False
            metric = expected_precision
        except BaseException:
            return False

        self._result.add_result(metric, 0)
        return True
def test_gemm_flops_performance_base():
    """Test GemmFlopsBenchmark."""
    # Positive case - all supported precisions run by default.
    benchmark = FakeGemmFlopsBenchmark('fake')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run())
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Check command list.
    expected_command = [
        'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP64_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision TF32_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision BF16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP16_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision INT8_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision INT4_TC --m 16384 --n 16384 --k 16384 --num_warmup 5"'
    ]
    for i in range(len(expected_command)):
        # Strip the bin_dir prefix so the comparison is path-independent.
        command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
        assert (command == expected_command[i])

    for metric in ['FP64', 'FP32', 'FP16', 'FP64_TC', 'TF32_TC', 'BF16_TC', 'FP16_TC', 'INT8_TC', 'INT4_TC']:
        assert (metric in benchmark.result)
        assert (len(benchmark.result[metric]) == 1)

    # Positive case - only the precisions given in arguments run.
    benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 FP32 FP16')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run())
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Check command list.
    expected_command = [
        'echo "--precision FP64 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP32 --m 16384 --n 16384 --k 16384 --num_warmup 5"',
        'echo "--precision FP16 --m 16384 --n 16384 --k 16384 --num_warmup 5"'
    ]
    for i in range(len(expected_command)):
        command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
        assert (command == expected_command[i])

    for metric in ['FP64', 'FP32', 'FP16']:
        assert (metric in benchmark.result)
        assert (len(benchmark.result[metric]) == 1)

    # Positive case - an unsupported precision is skipped with a warning
    # as long as at least one supported precision remains.
    benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision FP64 BF64')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run() is True)

    # Negative case - NO_SUPPORTED_PRECISION when every requested precision is unsupported.
    benchmark = FakeGemmFlopsBenchmark('fake', parameters='--precision BF64')
    assert (benchmark._benchmark_type == BenchmarkType.MICRO)
    assert (benchmark.run() is False)
    assert (benchmark.return_code == ReturnCode.NO_SUPPORTED_PRECISION)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment