Unverified Commit 8ffaddfa authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Fix Bug - Fix gemm kernel bug for nvidia v100. (#105)

* fix bug for nvidia v100
* hard code the supported dict for different arch.
parent f22bb3f2
...@@ -24,16 +24,25 @@ def __init__(self, name, parameters=''): ...@@ -24,16 +24,25 @@ def __init__(self, name, parameters=''):
self._bin_name = 'cutlass_profiler' self._bin_name = 'cutlass_profiler'
            # TODO - To support more architectures, currently only support compute capability = 7.0 and 8.0
self.__kernel_map = { self.__kernel_map = {
'FP64': 'cutlass_simt_dgemm_128x128_8x2_*', 7.0: {
'FP32': 'cutlass_simt_sgemm_128x128_8x2_*', 'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
'FP16': 'cutlass_simt_hgemm_256x128_8x2_*', 'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*', 'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*', 'FP16_TC': 'cutlass_tensorop_h884gemm_256x128_32x2_*',
'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*', },
'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*', 8.0: {
'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*', 'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*', 'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
}
} }
def add_parser_arguments(self): def add_parser_arguments(self):
...@@ -72,8 +81,8 @@ def add_parser_arguments(self): ...@@ -72,8 +81,8 @@ def add_parser_arguments(self):
'--precision', '--precision',
type=str, type=str,
nargs='+', nargs='+',
default=list(self.__kernel_map.keys()), default=list(),
help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map.keys()))), help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map[8.0].keys()))),
) )
def _preprocess(self): def _preprocess(self):
...@@ -85,32 +94,9 @@ def _preprocess(self): ...@@ -85,32 +94,9 @@ def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
self._args.precision = [p.upper() for p in self._args.precision] # Reset kernels according to compute capability.
for p in self._args.precision:
if p not in list(self.__kernel_map.keys()):
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.error(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, list(self.__kernel_map.keys())
)
)
return False
else:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --warmup-iterations=' + str(self._args.num_warmup))
command += (' --operation=gemm')
command += (' --n=' + str(self._args.n))
command += (' --k=' + str(self._args.k))
command += (' --m=' + str(self._args.m))
command += (' --kernels=' + self.__kernel_map[p])
self._commands.append(command)
        # TODO - To support more architectures, currently only support compute capability = 7.0 or 8.0
capability = nv_helper.get_device_compute_capability() capability = nv_helper.get_device_compute_capability()
if capability == 7.0: if capability not in self.__kernel_map:
self.__kernel_map['FP16_TC'] = 'cutlass_tensorop_h884gemm_256x128_32x2_*'
if capability not in [7.0, 8.0]:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE) self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
logger.error( logger.error(
'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format( 'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
...@@ -119,6 +105,36 @@ def _preprocess(self): ...@@ -119,6 +105,36 @@ def _preprocess(self):
) )
return False return False
self.__precision_need_to_run = list()
if len(self._args.precision) == 0:
self.__precision_need_to_run = list(self.__kernel_map[capability].keys())
else:
self._args.precision = [p.upper() for p in self._args.precision]
for p in self._args.precision:
if p not in list(self.__kernel_map.keys()):
self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
logger.warning(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, list(self.__kernel_map[capability].keys())
)
)
else:
self.__precision_need_to_run.append(p)
if len(self.__precision_need_to_run) == 0:
self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
return False
for p in self.__precision_need_to_run:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --warmup-iterations=' + str(self._args.num_warmup))
command += (' --operation=gemm')
command += (' --n=' + str(self._args.n))
command += (' --k=' + str(self._args.k))
command += (' --m=' + str(self._args.m))
command += (' --kernels=' + self.__kernel_map[capability][p])
self._commands.append(command)
return True return True
def _process_raw_result(self, cmd_idx, raw_output): def _process_raw_result(self, cmd_idx, raw_output):
...@@ -133,7 +149,7 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -133,7 +149,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
Return: Return:
True if the raw output string is valid and result can be extracted. True if the raw output string is valid and result can be extracted.
""" """
precision = self._args.precision[cmd_idx] precision = self.__precision_need_to_run[cmd_idx]
self._result.add_raw_data('raw_output_' + precision, raw_output) self._result.add_raw_data('raw_output_' + precision, raw_output)
valid = True valid = True
......
...@@ -60,15 +60,7 @@ def test_flops_performance_cuda(self): ...@@ -60,15 +60,7 @@ def test_flops_performance_cuda(self):
assert (benchmark._args.k == 512) assert (benchmark._args.k == 512)
assert (benchmark._args.m == 2048) assert (benchmark._args.m == 2048)
assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']) assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
benchmark._GemmFlopsCuda__precision_need_to_run = ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC']
# Check the command list.
for i in range(len(benchmark._args.precision)):
command = '{} --warmup-iterations={} --operation=gemm --n={} --k={} --m={} --kernels={}'.format(
benchmark._bin_name, benchmark._args.num_warmup, benchmark._args.n, benchmark._args.k,
benchmark._args.m, benchmark._GemmFlopsCuda__kernel_map[benchmark._args.precision[i]]
)
expected_cmd = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
assert (command == expected_cmd)
# Check results and metrics. # Check results and metrics.
raw_output_FP32 = """ raw_output_FP32 = """
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment