Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
6c6f5269
Unverified
Commit
6c6f5269
authored
Jun 02, 2021
by
guoshzhao
Committed by
GitHub
Jun 02, 2021
Browse files
Benchmarks: Add Benchmark - Add FLOPs performance benchmark for cuda. (#87)
* add cuda flops performance benchmark.
parent
331c740a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
307 additions
and
1 deletion
+307
-1
examples/benchmarks/gemm_flops_cuda_performance.py
examples/benchmarks/gemm_flops_cuda_performance.py
+23
-0
superbench/benchmarks/micro_benchmarks/__init__.py
superbench/benchmarks/micro_benchmarks/__init__.py
+2
-1
superbench/benchmarks/micro_benchmarks/gemm_flops_performance.py
...nch/benchmarks/micro_benchmarks/gemm_flops_performance.py
+171
-0
superbench/benchmarks/return_code.py
superbench/benchmarks/return_code.py
+1
-0
tests/benchmarks/micro_benchmarks/test_gemm_flops_performance.py
...enchmarks/micro_benchmarks/test_gemm_flops_performance.py
+110
-0
No files found.
examples/benchmarks/gemm_flops_cuda_performance.py
0 → 100644
View file @
6c6f5269
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for Cutlass GEMM FLOPs performance.

Commands to run:
  python3 examples/benchmarks/gemm_flops_cuda_performance.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    # Describe the run: the 'gemm-flops' benchmark on the CUDA platform with explicit n/k/m dims.
    context = BenchmarkRegistry.create_benchmark_context(
        'gemm-flops',
        platform=Platform.CUDA,
        parameters='--n 16384 --k 16384 --m 16384',
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Only report when launch_benchmark() handed back a truthy benchmark object.
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)
superbench/benchmarks/micro_benchmarks/__init__.py
View file @
6c6f5269
...
...
@@ -9,8 +9,9 @@
from
superbench.benchmarks.micro_benchmarks.kernel_launch_overhead
import
KernelLaunch
from
superbench.benchmarks.micro_benchmarks.cublas_function
import
CublasBenchmark
from
superbench.benchmarks.micro_benchmarks.cudnn_function
import
CudnnBenchmark
from
superbench.benchmarks.micro_benchmarks.gemm_flops_performance
import
GemmFlopsCuda
# Names exported by `from superbench.benchmarks.micro_benchmarks import *`.
# Each entry is its own comma-terminated element: two adjacent string literals
# without a comma are silently concatenated by Python into one bogus name.
__all__ = [
    'MicroBenchmark',
    'MicroBenchmarkWithInvoke',
    'ShardingMatmul',
    'ComputationCommunicationOverlap',
    'KernelLaunch',
    'CublasBenchmark',
    'CudnnBenchmark',
    'GemmFlopsCuda',
]
superbench/benchmarks/micro_benchmarks/gemm_flops_performance.py
0 → 100644
View file @
6c6f5269
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the FLOPs performance benchmarks."""
import
os
from
superbench.common.utils
import
logger
from
superbench.common.utils
import
nv_helper
from
superbench.benchmarks
import
BenchmarkRegistry
,
Platform
,
ReturnCode
from
superbench.benchmarks.micro_benchmarks
import
MicroBenchmarkWithInvoke
class GemmFlopsCuda(MicroBenchmarkWithInvoke):
    """The GEMM FLOPs performance benchmark class.

    One `cutlass_profiler` command is generated per requested precision. For every command,
    the largest value found in the last comma-separated field of the matching kernel rows
    is stored as the result of that precision.
    """

    # Substrings identifying the result rows of the kernels this benchmark can launch
    # (the entries of the kernel map plus the FP16_TC kernel used for compute capability 7.0).
    __RESULT_MARKERS = (
        'gemm,cutlass_simt_dgemm_128x128_8x2',
        'gemm,cutlass_simt_sgemm_128x128_8x2',
        'gemm,cutlass_simt_hgemm_256x128_8x2',
        'gemm,cutlass_tensorop_d884gemm_128x128_16x3',
        'gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3',
        'gemm,cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3',
        'gemm,cutlass_tensorop_h16816gemm_256x128_32x3',
        'gemm,cutlass_tensorop_h884gemm_256x128_32x2',
        'gemm,cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3',
        'gemm,cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3',
    )

    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'cutlass_profiler'

        # Precision name -> kernel name pattern passed to the profiler through --kernels.
        self.__kernel_map = {
            'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
            'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
            'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
            'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
            'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
            'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
            'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
            'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
            'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
        }

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        supported_precisions = list(self.__kernel_map)

        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=5,
            required=False,
            help='The number of warmup step.',
        )
        self._parser.add_argument(
            '--n',
            type=int,
            default=16384,
            required=False,
            help='The N dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--k',
            type=int,
            default=16384,
            required=False,
            help='The K dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--m',
            type=int,
            default=16384,
            required=False,
            help='The M dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--precision',
            type=str,
            nargs='+',
            default=supported_precisions,
            help='Precision for benchmarking. E.g. {}.'.format(' '.join(supported_precisions)),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Validates the requested precisions, selects the device-specific FP16_TC kernel and
        builds one profiler command per precision (in the order of self._args.precision).

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Precisions are matched case-insensitively; reject the run on the first unknown one.
        self._args.precision = [p.upper() for p in self._args.precision]
        for p in self._args.precision:
            if p not in self.__kernel_map:
                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
                logger.error(
                    'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
                        self._name, p, list(self.__kernel_map)
                    )
                )
                return False

        # TODO - To support more architectures, currently only support compute capability = 7.0 or 8.0
        capability = nv_helper.get_device_compute_capability()
        if capability == 7.0:
            # The kernel override must happen BEFORE the commands are generated, otherwise the
            # FP16_TC command would still name the default kernel on compute capability 7.0.
            self.__kernel_map['FP16_TC'] = 'cutlass_tensorop_h884gemm_256x128_32x2_*'

        binary = os.path.join(self._args.bin_dir, self._bin_name)
        for p in self._args.precision:
            self._commands.append(
                '{} --warmup-iterations={} --operation=gemm --n={} --k={} --m={} --kernels={}'.format(
                    binary, self._args.num_warmup, self._args.n, self._args.k, self._args.m, self.__kernel_map[p]
                )
            )

        # Commands are generated even for an unsupported architecture (as before); the failure
        # is reported through the return code and the False return value.
        if capability not in [7.0, 8.0]:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
            logger.error(
                'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
                    self._name, capability
                )
            )
            return False

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        precision = self._args.precision[cmd_idx]
        self._result.add_raw_data('raw_output_' + precision, raw_output)

        flops = []
        try:
            for line in raw_output.splitlines():
                if any(marker in line for marker in self.__RESULT_MARKERS):
                    # The value of interest is the last comma-separated field of a result row.
                    flops.append(float(line.split(',')[-1]))
        except ValueError:
            # A matching row whose last field is not numeric invalidates the whole output.
            # Only parsing errors are handled here so that KeyboardInterrupt/SystemExit propagate.
            flops = []

        if not flops:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        # Several kernel variants may match one precision; keep the best one.
        self._result.add_result(precision, max(flops))

        return True
# Make GemmFlopsCuda selectable through the registry as 'gemm-flops' on the CUDA platform.
BenchmarkRegistry.register_benchmark(
    'gemm-flops',
    GemmFlopsCuda,
    platform=Platform.CUDA,
)
superbench/benchmarks/return_code.py
View file @
6c6f5269
...
...
@@ -28,3 +28,4 @@ class ReturnCode(Enum):
MICROBENCHMARK_BINARY_NOT_EXIST
=
31
MICROBENCHMARK_EXECUTION_FAILURE
=
32
MICROBENCHMARK_RESULT_PARSING_FAILURE
=
33
MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE
=
34
tests/benchmarks/micro_benchmarks/test_gemm_flops_performance.py
0 → 100644
View file @
6c6f5269
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for gemm-flops benchmark."""
import
os
import
unittest
from
pathlib
import
Path
from
tests.helper
import
decorator
from
superbench.common.utils
import
nv_helper
from
superbench.benchmarks
import
BenchmarkRegistry
,
ReturnCode
,
Platform
,
BenchmarkType
class GemmFlopsCudaTest(unittest.TestCase):
    """Tests for GemmFlopsCuda benchmark."""
    def setUp(self):
        """Create a placeholder profiler binary so the benchmark can locate it."""
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
        bin_dir = Path(os.getenv('SB_MICRO_PATH')) / 'bin'
        bin_dir.mkdir(parents=True, exist_ok=True)
        # The file is empty; the test never executes it.
        self.__binary_file = bin_dir / 'cutlass_profiler'
        self.__binary_file.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Remove the placeholder binary created in setUp()."""
        self.__binary_file.unlink()

    @decorator.cuda_test
    def test_flops_performance_cuda(self):
        """Test gemm-flops benchmark."""
        benchmark_name = 'gemm-flops'
        selected = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
        (benchmark_class, predefine_params) = selected
        assert (benchmark_class)

        # Negative case - MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE.
        benchmark = benchmark_class(
            benchmark_name,
            parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision FP32 TF32_TC FP16_TC INT8_TC'
        )

        ret = benchmark._preprocess()
        if nv_helper.get_device_compute_capability() not in [7.0, 8.0]:
            assert (ret is False)
            assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
        else:
            assert (ret is True)
            assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark.name == 'gemm-flops')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'cutlass_profiler')

        # Check parameters specified in BenchmarkContext.
        args = benchmark._args
        assert (args.num_warmup == 200)
        assert (args.n == 1024)
        assert (args.k == 512)
        assert (args.m == 2048)
        assert (args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])

        # Check the command list.
        command_template = '{} --warmup-iterations={} --operation=gemm --n={} --k={} --m={} --kernels={}'
        for i, precision in enumerate(args.precision):
            command = command_template.format(
                benchmark._bin_name, args.num_warmup, args.n, args.k, args.m,
                benchmark._GemmFlopsCuda__kernel_map[precision]
            )
            # Drop the directory prefix of the generated command so only binary name + options remain.
            options = benchmark._commands[i].split(benchmark._bin_name)[1]
            expected_cmd = benchmark._bin_name + options
            assert (command == expected_cmd)

        # Check results and metrics.
        raw_output_FP32 = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nn_align1,passed,success,universal,16384,16384,16384,f32:column,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.022,6.23672,18287.4
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nt_align1,passed,success,universal,16384,16384,16384,f32:column,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,478.866,6.2648,18369.7
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tn_align1,passed,success,universal,16384,16384,16384,f32:row,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,482.034,6.22363,18249
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tt_align1,passed,success,universal,16384,16384,16384,f32:row,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.838,6.22616,18256.4
"""
        raw_output_TF32_TC = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nn_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,88.5764,33.8691,99311.2
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nt_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,70.3503,42.6438,125040
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tn_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,86.5167,34.6754,101676
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tt_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,68.3621,43.884,128677
"""
        raw_output_FP16_TC = """
CSV Results:
Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nn_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.1575,43.9142,257531
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nt_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.6153,43.3334,254126
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tn_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,39.0413,38.4209,225316
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tt_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,31.2994,47.9243,281048
"""
        # Feed the captured outputs in command order (index matches the --precision order above).
        captured_outputs = [raw_output_FP32, raw_output_TF32_TC, raw_output_FP16_TC]
        for cmd_idx, raw_output in enumerate(captured_outputs):
            assert (benchmark._process_raw_result(cmd_idx, raw_output))

        # Each metric must hold the largest value of the last column among its four rows.
        expected_results = [('FP32', 18369.7), ('TF32_TC', 128677), ('FP16_TC', 281048)]
        for metric, expected_value in expected_results:
            assert (benchmark.result[metric][0] == expected_value)

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(3, 'Invalid raw output') is False)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment