Unverified Commit b808135c authored by Yifan Xiong's avatar Yifan Xiong Committed by GitHub
Browse files

Benchmarks - Support tensor core precisions in cublaslt gemm (#492)

Support FP64/TF32/FP16/BF16 in cublaslt (batch) GEMM.
parent 139d4df5
......@@ -21,8 +21,8 @@ def __init__(self, name, parameters=''):
"""
super().__init__(name, parameters)
self._bin_name = 'cublaslt_fp8_gemm'
self._in_types = ['fp16', 'fp8e4m3', 'fp8e5m2']
self._bin_name = 'cublaslt_gemm'
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2']
def add_parser_arguments(self):
"""Add the specified arguments."""
......
......@@ -2,7 +2,7 @@
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
project(cublaslt_fp8_gemm LANGUAGES CXX)
project(cublaslt_gemm LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
......@@ -15,8 +15,8 @@ if(CUDAToolkit_FOUND AND NOT CUDAToolkit_VERSION VERSION_LESS 11.8)
set_target_properties(cublaslt_utils PROPERTIES LINK_FLAGS_RELEASE -s)
install(TARGETS cublaslt_utils LIBRARY DESTINATION lib)
add_executable(cublaslt_fp8_gemm cublaslt_fp8_gemm.cu)
target_link_libraries(cublaslt_fp8_gemm cublaslt_utils)
set_target_properties(cublaslt_fp8_gemm PROPERTIES CUDA_ARCHITECTURES "80;86;90")
install(TARGETS cublaslt_fp8_gemm RUNTIME DESTINATION bin)
add_executable(cublaslt_gemm cublaslt_gemm.cu)
target_link_libraries(cublaslt_gemm cublaslt_utils)
set_target_properties(cublaslt_gemm PROPERTIES CUDA_ARCHITECTURES "80;86;90")
install(TARGETS cublaslt_gemm RUNTIME DESTINATION bin)
endif()
......@@ -10,7 +10,10 @@
#include "cublaslt_utils.h"
using fp16 = half; // nv_bfloat16
using fp64 = double;
using fp32 = float;
using fp16 = half;
using bf16 = nv_bfloat16;
using fp8e4m3 = __nv_fp8_e4m3;
using fp8e5m2 = __nv_fp8_e5m2;
......@@ -61,7 +64,7 @@ void process_args(int argc, char **argv, Args *args) {
}
}
template <typename T> __global__ void init_matrix(T *matrix, const fp16 val, const size_t N) {
template <typename T> __global__ void init_matrix(T *matrix, const fp32 val, const size_t N) {
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
matrix[i] = T(val);
......@@ -69,8 +72,14 @@ template <typename T> __global__ void init_matrix(T *matrix, const fp16 val, con
}
template <typename T> cudaDataType_t get_datatype() {
if (std::is_same<T, fp64>::value)
return CUDA_R_64F;
if (std::is_same<T, fp32>::value)
return CUDA_R_32F;
if (std::is_same<T, fp16>::value)
return CUDA_R_16F;
if (std::is_same<T, bf16>::value)
return CUDA_R_16BF;
if (std::is_same<T, fp8e4m3>::value)
return CUDA_R_8F_E4M3;
if (std::is_same<T, fp8e5m2>::value)
......@@ -88,8 +97,8 @@ float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
cudaMalloc(&matrix_b, k * n * std::max(batch, 1) * sizeof(Tb));
cudaMalloc(&matrix_out, m * n * std::max(batch, 1) * sizeof(Tout));
init_matrix<Ta><<<216, 1024>>>(matrix_a, static_cast<fp16>(1.f), m * k * std::max(batch, 1));
init_matrix<Tb><<<216, 1024>>>(matrix_b, static_cast<fp16>(2.f), k * n * std::max(batch, 1));
init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * std::max(batch, 1));
init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * std::max(batch, 1));
// init gemm
int lda = k, ldb = k, ldd = m;
......@@ -129,7 +138,7 @@ float timing_matmul_tn(int m, int n, int k, int batch, int warmup, int iter) {
return (time * 1e3 / iter);
}
template <typename Ta, typename Tb = Ta, typename Tout = fp16> void run(Args *args) {
template <typename Ta, typename Tb = Ta, typename Tout = Ta> void run(Args *args) {
float time_us = timing_matmul_tn<Ta, Tb, Tout>(args->m, args->n, args->k, args->batch, args->warmup, args->iter);
// m n k batch time_us tflops
printf("%d\t%d\t%d\t%d\t%f\t%f\n", args->m, args->n, args->k, args->batch, time_us,
......@@ -140,12 +149,18 @@ int main(int argc, char **argv) {
Args args;
process_args(argc, argv, &args);
if (args.in_type == "fp16")
if (args.in_type == "fp64")
run<fp64>(&args);
else if (args.in_type == "fp32")
run<fp32>(&args);
else if (args.in_type == "fp16")
run<fp16>(&args);
else if (args.in_type == "bf16")
run<bf16>(&args);
else if (args.in_type == "fp8e4m3")
run<fp8e4m3>(&args);
run<fp8e4m3, fp8e4m3, fp16>(&args);
else if (args.in_type == "fp8e5m2")
run<fp8e5m2, fp8e4m3>(&args);
run<fp8e5m2, fp8e4m3, fp16>(&args);
else
throw std::invalid_argument("Unknown type " + args.in_type);
......
......@@ -22,7 +22,7 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
) {
cublasLtMatrixLayout_t a_desc = nullptr, b_desc = nullptr, c_desc = nullptr, d_desc = nullptr;
// c_type must match the output (d) type for these configs
cudaDataType_t c_type = CUDA_R_16F;
cudaDataType_t c_type = d_type;
// Create matrix descriptors.
checkCublasStatus(
cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda));
......@@ -57,10 +57,11 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
d_desc_.reset(d_desc);
// default to tf32 compute, except fp8 (e4m3/e5m2) inputs, which require fp32 compute, and fp64 inputs, which use fp64 compute
cublasComputeType_t gemm_compute_type =
(a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3)
? CUBLAS_COMPUTE_32F
: CUBLAS_COMPUTE_32F_FAST_TF32;
cublasComputeType_t gemm_compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3)
gemm_compute_type = CUBLAS_COMPUTE_32F;
if (a_type == CUDA_R_64F || b_type == CUDA_R_64F)
gemm_compute_type = CUBLAS_COMPUTE_64F;
cublasLtMatmulDesc_t op_desc = nullptr;
checkCublasStatus(cublasLtMatmulDescCreate(&op_desc, gemm_compute_type, CUDA_R_32F));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment