Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
b808135c
Unverified
Commit
b808135c
authored
Mar 20, 2023
by
Yifan Xiong
Committed by
GitHub
Mar 20, 2023
Browse files
Benchmarks - Support tensor core precisions in cublaslt gemm (#492)
Support FP64/TF32/FP16/BF16 in cublaslt (batch) GEMM.
parent
139d4df5
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
36 additions
and
20 deletions
+36
-20
superbench/benchmarks/micro_benchmarks/cublaslt_function.py
superbench/benchmarks/micro_benchmarks/cublaslt_function.py
+2
-2
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/CMakeLists.txt
.../benchmarks/micro_benchmarks/cublaslt_gemm/CMakeLists.txt
+5
-5
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
...enchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
+23
-8
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
...nchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
+6
-5
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h
...enchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h
+0
-0
No files found.
superbench/benchmarks/micro_benchmarks/cublaslt_function.py
View file @
b808135c
...
...
@@ -21,8 +21,8 @@ class CublasLtBenchmark(MicroBenchmarkWithInvoke):
"""
super
().
__init__
(
name
,
parameters
)
self
.
_bin_name
=
'cublaslt_
fp8_
gemm'
self
.
_in_types
=
[
'fp16'
,
'fp8e4m3'
,
'fp8e5m2'
]
self
.
_bin_name
=
'cublaslt_gemm'
self
.
_in_types
=
[
'fp
64'
,
'fp32'
,
'fp16'
,
'bf
16'
,
'fp8e4m3'
,
'fp8e5m2'
]
def
add_parser_arguments
(
self
):
"""Add the specified arguments."""
...
...
superbench/benchmarks/micro_benchmarks/cublaslt_
fp8_
gemm/CMakeLists.txt
→
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/CMakeLists.txt
View file @
b808135c
...
...
@@ -2,7 +2,7 @@
# Licensed under the MIT License.
cmake_minimum_required
(
VERSION 3.18
)
project
(
cublaslt_
fp8_
gemm LANGUAGES CXX
)
project
(
cublaslt_gemm LANGUAGES CXX
)
find_package
(
CUDAToolkit QUIET
)
...
...
@@ -15,8 +15,8 @@ if(CUDAToolkit_FOUND AND NOT CUDAToolkit_VERSION VERSION_LESS 11.8)
set_target_properties
(
cublaslt_utils PROPERTIES LINK_FLAGS_RELEASE -s
)
install
(
TARGETS cublaslt_utils LIBRARY DESTINATION lib
)
add_executable
(
cublaslt_
fp8_
gemm cublaslt_
fp8_
gemm.cu
)
target_link_libraries
(
cublaslt_
fp8_
gemm cublaslt_utils
)
set_target_properties
(
cublaslt_
fp8_
gemm PROPERTIES CUDA_ARCHITECTURES
"80;86;90"
)
install
(
TARGETS cublaslt_
fp8_
gemm RUNTIME DESTINATION bin
)
add_executable
(
cublaslt_gemm cublaslt_gemm.cu
)
target_link_libraries
(
cublaslt_gemm cublaslt_utils
)
set_target_properties
(
cublaslt_gemm PROPERTIES CUDA_ARCHITECTURES
"80;86;90"
)
install
(
TARGETS cublaslt_gemm RUNTIME DESTINATION bin
)
endif
()
superbench/benchmarks/micro_benchmarks/cublaslt_
fp8_
gemm/cublaslt_
fp8_
gemm.cu
→
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_gemm.cu
View file @
b808135c
...
...
@@ -10,7 +10,10 @@
#include "cublaslt_utils.h"
using
fp16
=
half
;
// nv_bfloat16
using
fp64
=
double
;
using
fp32
=
float
;
using
fp16
=
half
;
using
bf16
=
nv_bfloat16
;
using
fp8e4m3
=
__nv_fp8_e4m3
;
using
fp8e5m2
=
__nv_fp8_e5m2
;
...
...
@@ -61,7 +64,7 @@ void process_args(int argc, char **argv, Args *args) {
}
}
template
<
typename
T
>
__global__
void
init_matrix
(
T
*
matrix
,
const
fp
16
val
,
const
size_t
N
)
{
template
<
typename
T
>
__global__
void
init_matrix
(
T
*
matrix
,
const
fp
32
val
,
const
size_t
N
)
{
size_t
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(
size_t
i
=
tid
;
i
<
N
;
i
+=
gridDim
.
x
*
blockDim
.
x
)
{
matrix
[
i
]
=
T
(
val
);
...
...
@@ -69,8 +72,14 @@ template <typename T> __global__ void init_matrix(T *matrix, const fp16 val, con
}
template
<
typename
T
>
cudaDataType_t
get_datatype
()
{
if
(
std
::
is_same
<
T
,
fp64
>::
value
)
return
CUDA_R_64F
;
if
(
std
::
is_same
<
T
,
fp32
>::
value
)
return
CUDA_R_32F
;
if
(
std
::
is_same
<
T
,
fp16
>::
value
)
return
CUDA_R_16F
;
if
(
std
::
is_same
<
T
,
bf16
>::
value
)
return
CUDA_R_16BF
;
if
(
std
::
is_same
<
T
,
fp8e4m3
>::
value
)
return
CUDA_R_8F_E4M3
;
if
(
std
::
is_same
<
T
,
fp8e5m2
>::
value
)
...
...
@@ -88,8 +97,8 @@ float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
cudaMalloc
(
&
matrix_b
,
k
*
n
*
std
::
max
(
batch
,
1
)
*
sizeof
(
Tb
));
cudaMalloc
(
&
matrix_out
,
m
*
n
*
std
::
max
(
batch
,
1
)
*
sizeof
(
Tout
));
init_matrix
<
Ta
><<<
216
,
1024
>>>
(
matrix_a
,
static_cast
<
fp16
>
(
1.
f
)
,
m
*
k
*
std
::
max
(
batch
,
1
));
init_matrix
<
Tb
><<<
216
,
1024
>>>
(
matrix_b
,
static_cast
<
fp16
>
(
2.
f
)
,
k
*
n
*
std
::
max
(
batch
,
1
));
init_matrix
<
Ta
><<<
216
,
1024
>>>
(
matrix_a
,
1.
f
,
m
*
k
*
std
::
max
(
batch
,
1
));
init_matrix
<
Tb
><<<
216
,
1024
>>>
(
matrix_b
,
2.
f
,
k
*
n
*
std
::
max
(
batch
,
1
));
// init gemm
int
lda
=
k
,
ldb
=
k
,
ldd
=
m
;
...
...
@@ -129,7 +138,7 @@ float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
return
(
time
*
1e3
/
iter
);
}
template
<
typename
Ta
,
typename
Tb
=
Ta
,
typename
Tout
=
fp16
>
void
run
(
Args
*
args
)
{
template
<
typename
Ta
,
typename
Tb
=
Ta
,
typename
Tout
=
Ta
>
void
run
(
Args
*
args
)
{
float
time_us
=
timing_matmul_tn
<
Ta
,
Tb
,
Tout
>
(
args
->
m
,
args
->
n
,
args
->
k
,
args
->
batch
,
args
->
warmup
,
args
->
iter
);
// m n k batch time_us tflops
printf
(
"%d
\t
%d
\t
%d
\t
%d
\t
%f
\t
%f
\n
"
,
args
->
m
,
args
->
n
,
args
->
k
,
args
->
batch
,
time_us
,
...
...
@@ -140,12 +149,18 @@ int main(int argc, char **argv) {
Args
args
;
process_args
(
argc
,
argv
,
&
args
);
if
(
args
.
in_type
==
"fp16"
)
if
(
args
.
in_type
==
"fp64"
)
run
<
fp64
>
(
&
args
);
else
if
(
args
.
in_type
==
"fp32"
)
run
<
fp32
>
(
&
args
);
else
if
(
args
.
in_type
==
"fp16"
)
run
<
fp16
>
(
&
args
);
else
if
(
args
.
in_type
==
"bf16"
)
run
<
bf16
>
(
&
args
);
else
if
(
args
.
in_type
==
"fp8e4m3"
)
run
<
fp8e4m3
>
(
&
args
);
run
<
fp8e4m3
,
fp8e4m3
,
fp16
>
(
&
args
);
else
if
(
args
.
in_type
==
"fp8e5m2"
)
run
<
fp8e5m2
,
fp8e4m3
>
(
&
args
);
run
<
fp8e5m2
,
fp8e4m3
,
fp16
>
(
&
args
);
else
throw
std
::
invalid_argument
(
"Unknown type "
+
args
.
in_type
);
...
...
superbench/benchmarks/micro_benchmarks/cublaslt_
fp8_
gemm/cublaslt_utils.cc
→
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.cc
View file @
b808135c
...
...
@@ -22,7 +22,7 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
)
{
cublasLtMatrixLayout_t
a_desc
=
nullptr
,
b_desc
=
nullptr
,
c_desc
=
nullptr
,
d_desc
=
nullptr
;
// force c_type
cudaDataType_t
c_type
=
CUDA_R_16F
;
cudaDataType_t
c_type
=
d_type
;
// Create matrix descriptors.
checkCublasStatus
(
cublasLtMatrixLayoutCreate
(
&
a_desc
,
a_type
,
transa
==
CUBLAS_OP_N
?
m
:
k
,
transa
==
CUBLAS_OP_N
?
k
:
m
,
lda
));
...
...
@@ -57,10 +57,11 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
d_desc_
.
reset
(
d_desc
);
// default to tf32 except for e5m2 inputs where the config is not supported
cublasComputeType_t
gemm_compute_type
=
(
a_type
==
CUDA_R_8F_E5M2
||
b_type
==
CUDA_R_8F_E5M2
||
a_type
==
CUDA_R_8F_E4M3
||
b_type
==
CUDA_R_8F_E4M3
)
?
CUBLAS_COMPUTE_32F
:
CUBLAS_COMPUTE_32F_FAST_TF32
;
cublasComputeType_t
gemm_compute_type
=
CUBLAS_COMPUTE_32F_FAST_TF32
;
if
(
a_type
==
CUDA_R_8F_E5M2
||
b_type
==
CUDA_R_8F_E5M2
||
a_type
==
CUDA_R_8F_E4M3
||
b_type
==
CUDA_R_8F_E4M3
)
gemm_compute_type
=
CUBLAS_COMPUTE_32F
;
if
(
a_type
==
CUDA_R_64F
||
b_type
==
CUDA_R_64F
)
gemm_compute_type
=
CUBLAS_COMPUTE_64F
;
cublasLtMatmulDesc_t
op_desc
=
nullptr
;
checkCublasStatus
(
cublasLtMatmulDescCreate
(
&
op_desc
,
gemm_compute_type
,
CUDA_R_32F
));
...
...
superbench/benchmarks/micro_benchmarks/cublaslt_
fp8_
gemm/cublaslt_utils.h
→
superbench/benchmarks/micro_benchmarks/cublaslt_gemm/cublaslt_utils.h
View file @
b808135c
File moved
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment