Unverified Commit b795477e authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks - Add FP4 GEMM FLOPS support for cublaslt_gemm benchmark (#711)



**Description**
Add FP4 precision support for cublaslt_gemm benchmark.

**Major Revision**
- Add a new input type `fp4e2m1`, defined as an alias for `__nv_fp4_e2m1`.
- For FP4 matmul, the precision of matrix C (the add/accumulate input) must be FP16 and the precision
of matrix D (the output) must be FP4; otherwise it will not work.
- Guard the FP4 code paths with the `CUDA_VERSION` macro (requiring >= 12.8) to resolve
compatibility issues across different CUDA versions.

---------
Co-authored-by: default avatarUbuntu <aiperf@aiperf000000.hp5z1gqeinfufbj2u3jcty5fme.cdmx.internal.cloudapp.net>
Co-authored-by: default avatarAVA <39534996+avazr@users.noreply.github.com>
Co-authored-by: default avatarGuoshuai Zhao <microsoft@microsoft.com>
parent 60b13256
......@@ -22,7 +22,7 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters)
self._bin_name = 'cublaslt_gemm'
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'int8']
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'fp4e2m1', 'int8']
def add_parser_arguments(self):
"""Add the specified arguments."""
......
......@@ -16,7 +16,7 @@ if(CUDAToolkit_FOUND AND NOT CUDAToolkit_VERSION VERSION_LESS 11.8)
endif()
add_library(cublaslt_utils SHARED cublaslt_utils.cc)
target_link_libraries(cublaslt_utils CUDA::cublas CUDA::cublasLt)
target_link_libraries(cublaslt_utils CUDA::cublas CUDA::cublasLt CUDA::cudart)
set_target_properties(cublaslt_utils PROPERTIES LINK_FLAGS_RELEASE -s)
install(TARGETS cublaslt_utils LIBRARY DESTINATION lib)
......
......@@ -5,9 +5,15 @@
#include <memory>
#include <stdio.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_fp8.h>
#if CUDA_VERSION >= 12080
#include <cuda_fp4.h>
using fp4e2m1 = __nv_fp4_e2m1;
#endif
#include "cublaslt_utils.h"
using fp64 = double;
......@@ -102,32 +108,39 @@ template <typename T> cudaDataType_t get_datatype() {
return CUDA_R_8F_E4M3;
if (std::is_same<T, fp8e5m2>::value)
return CUDA_R_8F_E5M2;
#if CUDA_VERSION >= 12080
if (std::is_same<T, fp4e2m1>::value)
return CUDA_R_4F_E2M1;
#endif
if (std::is_same<T, int8>::value)
return CUDA_R_8I;
throw std::invalid_argument("Unknown type");
}
template <typename Ta, typename Tb, typename Tout>
template <typename Ta, typename Tb, typename Tout, typename Tc>
float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter, bool autotune,
int iter_autotune, int warmup_autotune) {
// init matrix
Ta *matrix_a = nullptr;
Tb *matrix_b = nullptr;
Tc *matrix_c = nullptr;
Tout *matrix_out = nullptr;
batch = std::max<size_t>(batch, 1);
cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta));
cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb));
cudaMalloc(&matrix_c, m * n * batch * sizeof(Tc));
cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout));
init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * batch);
init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * batch);
init_matrix<Tc><<<216, 1024>>>(matrix_c, 3.f, m * n * batch);
// init gemm
size_t lda = k, ldb = k, ldd = m;
size_t lda = k, ldb = k, ldc = m, ldd = m;
std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>();
gemm->Init();
gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(),
CUBLAS_OP_T, CUBLAS_OP_N, CUBLASLT_EPILOGUE_DEFAULT);
gemm->Setup(m, n, k, batch, lda, ldb, ldc, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tc>(),
get_datatype<Tout>(), CUBLAS_OP_T, CUBLAS_OP_N, CUBLASLT_EPILOGUE_DEFAULT);
void *workspace = nullptr;
size_t workspace_size;
......@@ -169,9 +182,9 @@ float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, i
return (time * 1e3 / iter);
}
template <typename Ta, typename Tb = Ta, typename Tout = Ta> void run(const Args *args) {
float time_us = timing_matmul_tn<Ta, Tb, Tout>(args->m, args->n, args->k, args->batch, args->warmup, args->iter,
args->autotune, args->iter_autotune, args->warmup_autotune);
template <typename Ta, typename Tb = Ta, typename Tout = Ta, typename Tc = Tout> void run(const Args *args) {
float time_us = timing_matmul_tn<Ta, Tb, Tout, Tc>(args->m, args->n, args->k, args->batch, args->warmup, args->iter,
args->autotune, args->iter_autotune, args->warmup_autotune);
// m n k batch time_us tflops
printf("%d\t%d\t%d\t%d\t%f\t%f\n", args->m, args->n, args->k, args->batch, time_us,
float(args->m) * float(args->n) * float(2 * args->k - 1) / 1e6 / time_us * std::max(args->batch, 1));
......@@ -193,6 +206,10 @@ int main(int argc, char **argv) {
run<fp8e4m3, fp8e4m3, fp16>(&args);
else if (args.in_type == "fp8e5m2")
run<fp8e5m2, fp8e4m3, fp16>(&args);
#if CUDA_VERSION >= 12080
else if (args.in_type == "fp4e2m1")
run<fp4e2m1, fp4e2m1, fp4e2m1, fp16>(&args);
#endif
else if (args.in_type == "int8")
run<int8>(&args);
else
......
......@@ -4,6 +4,32 @@
#include "cublaslt_utils.h"
#include <algorithm> // for std::sort
#include <cassert> // for assert
#include <cuda.h>
#include <cuda_fp8.h>
#if CUDA_VERSION >= 12080
// Returns the number of scale elements required for a matrix of shape
// (inner x outer) under the requested cuBLASLt matrix-scale mode:
//   - SCALAR_32F: one scalar scale covers the whole matrix.
//   - VEC32_UE8M0 / VEC16_UE4M3: one scale per 32 (resp. 16) elements along
//     the inner dimension, laid out in hardware blocks whose dimensions are
//     rounded up to the block granularity (see cuBLASLt block-scaling docs).
//   - any other mode: no scale tensor is needed (returns 0).
int GetScaleTensorSize(int inner, int outer, cublasLtMatmulMatrixScale_t scale_mode) {
    if (scale_mode == CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F)
        return 1;

    const bool is_vec32 = (scale_mode == CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
    const bool is_vec16 = (scale_mode == CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
    if (!is_vec32 && !is_vec16)
        return 0;

    // One scale value per group of `vec_len` elements along the inner dimension.
    const int vec_len = is_vec32 ? 32 : 16;
    // Block granularities: 4 inner groups per block row, 32x4 block columns.
    const int row_granularity = 4 * vec_len;
    const int col_granularity = 32 * 4;
    // Round `value` up to the nearest multiple of `multiple`.
    auto ceil_to = [](int value, int multiple) { return multiple * ((value + multiple - 1) / multiple); };

    const int scale_rows = ceil_to(inner, row_granularity) / vec_len;
    const int scale_cols = ceil_to(outer, col_granularity);
    return scale_rows * scale_cols;
}
#endif
void cublasLtGemm::Init() {
cublasLtHandle_t handle;
......@@ -16,8 +42,8 @@ void cublasLtGemm::Init() {
preference_.reset(preference);
}
void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int ldd, cudaDataType_t a_type,
cudaDataType_t b_type, cudaDataType_t d_type, cublasOperation_t transa,
void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int ldc, int ldd, cudaDataType_t a_type,
cudaDataType_t b_type, cudaDataType_t c_type, cudaDataType_t d_type, cublasOperation_t transa,
cublasOperation_t transb, cublasLtEpilogue_t epilogue,
void *a_scale_inverse, /* only need to be set for fp8 */
void *b_scale_inverse /* only need to be set for fp8 */
......@@ -28,14 +54,12 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
k_ = k;
cublasLtMatrixLayout_t a_desc = nullptr, b_desc = nullptr, c_desc = nullptr, d_desc = nullptr;
// force c_type
cudaDataType_t c_type = d_type;
// Create matrix descriptors.
CUBLAS_CHECK(
cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda));
CUBLAS_CHECK(
cublasLtMatrixLayoutCreate(&b_desc, b_type, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb));
CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd));
CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldc));
CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd));
// strided batch gemm
......@@ -67,13 +91,17 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
// Set compute type and scale type based on input types
cublasComputeType_t gemm_compute_type;
cudaDataType_t scale_type;
if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3) {
gemm_compute_type = CUBLAS_COMPUTE_32F;
scale_type = CUDA_R_32F;
} else if (a_type == CUDA_R_16F || b_type == CUDA_R_16F || a_type == CUDA_R_16BF || b_type == CUDA_R_16BF) {
gemm_compute_type = CUBLAS_COMPUTE_32F;
scale_type = CUDA_R_32F;
#if CUDA_VERSION >= 12080
} else if (a_type == CUDA_R_4F_E2M1 || b_type == CUDA_R_4F_E2M1) {
gemm_compute_type = CUBLAS_COMPUTE_32F;
scale_type = CUDA_R_32F;
#endif
} else if (a_type == CUDA_R_64F || b_type == CUDA_R_64F) {
gemm_compute_type = CUBLAS_COMPUTE_64F;
scale_type = CUDA_R_64F;
......@@ -108,6 +136,76 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
}
CUBLAS_CHECK(
cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
#if CUDA_VERSION >= 12080
if (a_type == CUDA_R_4F_E2M1 || b_type == CUDA_R_4F_E2M1) {
// Allocate and copy device scale values
const auto a_scale = __nv_fp8_e4m3{1.f}, b_scale = __nv_fp8_e4m3{1.f}, d_out_scale = __nv_fp8_e4m3{1.f};
const auto d_scale = 1.f;
void *AscaleDev, *BscaleDev, *DscaleDev, *DOutscaleDev;
// Set scale modes
cublasLtMatmulMatrixScale_t AScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
cublasLtMatmulMatrixScale_t BScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
cublasLtMatmulMatrixScale_t DScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F;
cublasLtMatmulMatrixScale_t DOutScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_MODE, &AScaleMode,
sizeof(AScaleMode)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_MODE, &BScaleMode,
sizeof(BScaleMode)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_SCALE_MODE, &DScaleMode,
sizeof(DScaleMode)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE,
&DOutScaleMode, sizeof(DOutScaleMode)));
const auto a_scale_size =
GetScaleTensorSize(transa != CUBLAS_OP_N ? k : m, transa != CUBLAS_OP_N ? m : k, AScaleMode);
const auto b_scale_size =
GetScaleTensorSize(transb != CUBLAS_OP_N ? n : k, transb != CUBLAS_OP_N ? k : n, BScaleMode);
const auto d_scale_size = GetScaleTensorSize(m, n, DScaleMode);
const auto d_out_scale_size = GetScaleTensorSize(m, n, DOutScaleMode);
if (a_scale_size > 0) {
__nv_fp8_e4m3 *a_scale_host = new __nv_fp8_e4m3[a_scale_size];
std::fill_n(a_scale_host, a_scale_size, a_scale);
cudaMalloc(&AscaleDev, a_scale_size * sizeof(__nv_fp8_e4m3));
cudaMemcpy(AscaleDev, a_scale_host, a_scale_size * sizeof(__nv_fp8_e4m3), cudaMemcpyHostToDevice);
delete[] a_scale_host;
}
if (b_scale_size > 0) {
__nv_fp8_e4m3 *b_scale_host = new __nv_fp8_e4m3[b_scale_size];
std::fill_n(b_scale_host, b_scale_size, b_scale);
cudaMalloc(&BscaleDev, b_scale_size * sizeof(__nv_fp8_e4m3));
cudaMemcpy(BscaleDev, b_scale_host, b_scale_size * sizeof(__nv_fp8_e4m3), cudaMemcpyHostToDevice);
delete[] b_scale_host;
}
if (d_scale_size > 0) {
float *d_scale_host = new float[d_scale_size];
std::fill_n(d_scale_host, d_scale_size, d_scale);
cudaMalloc(&DscaleDev, d_scale_size * sizeof(float));
cudaMemcpy(DscaleDev, d_scale_host, d_scale_size * sizeof(float), cudaMemcpyHostToDevice);
delete[] d_scale_host;
}
if (d_out_scale_size > 0) {
__nv_fp8_e4m3 *d_out_scale_host = new __nv_fp8_e4m3[d_out_scale_size];
std::fill_n(d_out_scale_host, d_out_scale_size, d_out_scale);
cudaMalloc(&DOutscaleDev, d_out_scale_size * sizeof(__nv_fp8_e4m3));
cudaMemcpy(DOutscaleDev, d_out_scale_host, d_out_scale_size * sizeof(__nv_fp8_e4m3),
cudaMemcpyHostToDevice);
delete[] d_out_scale_host;
}
// Use device scale pointer attributes
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &AscaleDev,
sizeof(void *)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &BscaleDev,
sizeof(void *)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &DscaleDev,
sizeof(void *)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER,
&DOutscaleDev, sizeof(void *)));
}
#endif
}
size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_size) {
......
......@@ -45,9 +45,10 @@ class cublasLtGemm {
void Init();
void Setup(int m, int n, int k, int batch, int lda, int ldb, int ldd, cudaDataType_t a_type, cudaDataType_t b_type,
cudaDataType_t d_type, cublasOperation_t transa, cublasOperation_t transb, cublasLtEpilogue_t epilogue,
void *a_scale_inverse = nullptr, void *b_scale_inverse = nullptr);
void Setup(int m, int n, int k, int batch, int lda, int ldb, int ldc, int ldd, cudaDataType_t a_type,
cudaDataType_t b_type, cudaDataType_t c_type, cudaDataType_t d_type, cublasOperation_t transa,
cublasOperation_t transb, cublasLtEpilogue_t epilogue, void *a_scale_inverse = nullptr,
void *b_scale_inverse = nullptr);
size_t GetAlgorithm(int max_algorithm_count, size_t max_workspace_size);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment