Unverified Commit b795477e authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks - Add FP4 GEMM FLOPS support for cublaslt_gemm benchmark (#711)



**Description**
Add FP4 precision support for cublaslt_gemm benchmark.

**Major Revision**
- Add a new input type `fp4e2m1`, defined as an alias for `__nv_fp4_e2m1`.
- For FP4 matmul, the precision of matrix C (the add/accumulate input) must be FP16 and the precision
of matrix D (the output) must be FP4; otherwise it will not work.
- Guard the FP4 code paths with the `CUDA_VERSION` macro (requiring >= 12.8) to resolve
compatibility issues across different CUDA versions.

---------
Co-authored-by: default avatarUbuntu <aiperf@aiperf000000.hp5z1gqeinfufbj2u3jcty5fme.cdmx.internal.cloudapp.net>
Co-authored-by: default avatarAVA <39534996+avazr@users.noreply.github.com>
Co-authored-by: default avatarGuoshuai Zhao <microsoft@microsoft.com>
parent 60b13256
......@@ -22,7 +22,7 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters)
self._bin_name = 'cublaslt_gemm'
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'int8']
self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'fp4e2m1', 'int8']
def add_parser_arguments(self):
"""Add the specified arguments."""
......
......@@ -16,7 +16,7 @@ if(CUDAToolkit_FOUND AND NOT CUDAToolkit_VERSION VERSION_LESS 11.8)
endif()
add_library(cublaslt_utils SHARED cublaslt_utils.cc)
target_link_libraries(cublaslt_utils CUDA::cublas CUDA::cublasLt)
target_link_libraries(cublaslt_utils CUDA::cublas CUDA::cublasLt CUDA::cudart)
set_target_properties(cublaslt_utils PROPERTIES LINK_FLAGS_RELEASE -s)
install(TARGETS cublaslt_utils LIBRARY DESTINATION lib)
......
......@@ -5,9 +5,15 @@
#include <memory>
#include <stdio.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_fp8.h>
#if CUDA_VERSION >= 12080
#include <cuda_fp4.h>
using fp4e2m1 = __nv_fp4_e2m1;
#endif
#include "cublaslt_utils.h"
using fp64 = double;
......@@ -102,32 +108,39 @@ template <typename T> cudaDataType_t get_datatype() {
return CUDA_R_8F_E4M3;
if (std::is_same<T, fp8e5m2>::value)
return CUDA_R_8F_E5M2;
#if CUDA_VERSION >= 12080
if (std::is_same<T, fp4e2m1>::value)
return CUDA_R_4F_E2M1;
#endif
if (std::is_same<T, int8>::value)
return CUDA_R_8I;
throw std::invalid_argument("Unknown type");
}
template <typename Ta, typename Tb, typename Tout>
template <typename Ta, typename Tb, typename Tout, typename Tc>
float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, int iter, bool autotune,
int iter_autotune, int warmup_autotune) {
// init matrix
Ta *matrix_a = nullptr;
Tb *matrix_b = nullptr;
Tc *matrix_c = nullptr;
Tout *matrix_out = nullptr;
batch = std::max<size_t>(batch, 1);
cudaMalloc(&matrix_a, m * k * batch * sizeof(Ta));
cudaMalloc(&matrix_b, k * n * batch * sizeof(Tb));
cudaMalloc(&matrix_c, m * n * batch * sizeof(Tc));
cudaMalloc(&matrix_out, m * n * batch * sizeof(Tout));
init_matrix<Ta><<<216, 1024>>>(matrix_a, 1.f, m * k * batch);
init_matrix<Tb><<<216, 1024>>>(matrix_b, 2.f, k * n * batch);
init_matrix<Tc><<<216, 1024>>>(matrix_c, 3.f, m * n * batch);
// init gemm
size_t lda = k, ldb = k, ldd = m;
size_t lda = k, ldb = k, ldc = m, ldd = m;
std::unique_ptr<cublasLtGemm> gemm = std::make_unique<cublasLtGemm>();
gemm->Init();
gemm->Setup(m, n, k, batch, lda, ldb, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tout>(),
CUBLAS_OP_T, CUBLAS_OP_N, CUBLASLT_EPILOGUE_DEFAULT);
gemm->Setup(m, n, k, batch, lda, ldb, ldc, ldd, get_datatype<Ta>(), get_datatype<Tb>(), get_datatype<Tc>(),
get_datatype<Tout>(), CUBLAS_OP_T, CUBLAS_OP_N, CUBLASLT_EPILOGUE_DEFAULT);
void *workspace = nullptr;
size_t workspace_size;
......@@ -169,9 +182,9 @@ float timing_matmul_tn(size_t m, size_t n, size_t k, size_t batch, int warmup, i
return (time * 1e3 / iter);
}
template <typename Ta, typename Tb = Ta, typename Tout = Ta> void run(const Args *args) {
float time_us = timing_matmul_tn<Ta, Tb, Tout>(args->m, args->n, args->k, args->batch, args->warmup, args->iter,
args->autotune, args->iter_autotune, args->warmup_autotune);
template <typename Ta, typename Tb = Ta, typename Tout = Ta, typename Tc = Tout> void run(const Args *args) {
float time_us = timing_matmul_tn<Ta, Tb, Tout, Tc>(args->m, args->n, args->k, args->batch, args->warmup, args->iter,
args->autotune, args->iter_autotune, args->warmup_autotune);
// m n k batch time_us tflops
printf("%d\t%d\t%d\t%d\t%f\t%f\n", args->m, args->n, args->k, args->batch, time_us,
float(args->m) * float(args->n) * float(2 * args->k - 1) / 1e6 / time_us * std::max(args->batch, 1));
......@@ -193,6 +206,10 @@ int main(int argc, char **argv) {
run<fp8e4m3, fp8e4m3, fp16>(&args);
else if (args.in_type == "fp8e5m2")
run<fp8e5m2, fp8e4m3, fp16>(&args);
#if CUDA_VERSION >= 12080
else if (args.in_type == "fp4e2m1")
run<fp4e2m1, fp4e2m1, fp4e2m1, fp16>(&args);
#endif
else if (args.in_type == "int8")
run<int8>(&args);
else
......
......@@ -4,6 +4,32 @@
#include "cublaslt_utils.h"
#include <algorithm> // for std::sort
#include <cassert> // for assert
#include <cuda.h>
#include <cuda_fp8.h>
#if CUDA_VERSION >= 12080
// Returns the number of scale elements required for a matrix of shape
// (inner x outer) under the requested cuBLASLt matrix-scale mode:
//   - SCALAR_32F: one scalar scale covers the whole matrix.
//   - VEC32_UE8M0 / VEC16_UE4M3: one scale per 32 (resp. 16) elements along
//     the inner dimension, laid out in hardware blocks whose dimensions are
//     rounded up to the block granularity (see cuBLASLt block-scaling docs).
//   - any other mode: no scale tensor is needed (returns 0).
int GetScaleTensorSize(int inner, int outer, cublasLtMatmulMatrixScale_t scale_mode) {
    if (scale_mode == CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F)
        return 1;

    const bool is_vec32 = (scale_mode == CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
    const bool is_vec16 = (scale_mode == CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3);
    if (!is_vec32 && !is_vec16)
        return 0;

    // One scale value per group of `vec_len` elements along the inner dimension.
    const int vec_len = is_vec32 ? 32 : 16;
    // Block granularities: 4 inner groups per block row, 32x4 block columns.
    const int row_granularity = 4 * vec_len;
    const int col_granularity = 32 * 4;
    // Round `value` up to the nearest multiple of `multiple`.
    auto ceil_to = [](int value, int multiple) { return multiple * ((value + multiple - 1) / multiple); };

    const int scale_rows = ceil_to(inner, row_granularity) / vec_len;
    const int scale_cols = ceil_to(outer, col_granularity);
    return scale_rows * scale_cols;
}
#endif
void cublasLtGemm::Init() {
cublasLtHandle_t handle;
......@@ -16,8 +42,8 @@ void cublasLtGemm::Init() {
preference_.reset(preference);
}
void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int ldd, cudaDataType_t a_type,
cudaDataType_t b_type, cudaDataType_t d_type, cublasOperation_t transa,
void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int ldc, int ldd, cudaDataType_t a_type,
cudaDataType_t b_type, cudaDataType_t c_type, cudaDataType_t d_type, cublasOperation_t transa,
cublasOperation_t transb, cublasLtEpilogue_t epilogue,
void *a_scale_inverse, /* only need to be set for fp8 */
void *b_scale_inverse /* only need to be set for fp8 */
......@@ -28,14 +54,12 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
k_ = k;
cublasLtMatrixLayout_t a_desc = nullptr, b_desc = nullptr, c_desc = nullptr, d_desc = nullptr;
// force c_type
cudaDataType_t c_type = d_type;
// Create matrix descriptors.
CUBLAS_CHECK(
cublasLtMatrixLayoutCreate(&a_desc, a_type, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda));
CUBLAS_CHECK(
cublasLtMatrixLayoutCreate(&b_desc, b_type, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb));
CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldd));
CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&c_desc, c_type, m, n, ldc));
CUBLAS_CHECK(cublasLtMatrixLayoutCreate(&d_desc, d_type, m, n, ldd));
// strided batch gemm
......@@ -67,13 +91,17 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
// Set compute type and scale type based on input types
cublasComputeType_t gemm_compute_type;
cudaDataType_t scale_type;
if (a_type == CUDA_R_8F_E5M2 || b_type == CUDA_R_8F_E5M2 || a_type == CUDA_R_8F_E4M3 || b_type == CUDA_R_8F_E4M3) {
gemm_compute_type = CUBLAS_COMPUTE_32F;
scale_type = CUDA_R_32F;
} else if (a_type == CUDA_R_16F || b_type == CUDA_R_16F || a_type == CUDA_R_16BF || b_type == CUDA_R_16BF) {
gemm_compute_type = CUBLAS_COMPUTE_32F;
scale_type = CUDA_R_32F;
#if CUDA_VERSION >= 12080
} else if (a_type == CUDA_R_4F_E2M1 || b_type == CUDA_R_4F_E2M1) {
gemm_compute_type = CUBLAS_COMPUTE_32F;
scale_type = CUDA_R_32F;
#endif
} else if (a_type == CUDA_R_64F || b_type == CUDA_R_64F) {
gemm_compute_type = CUBLAS_COMPUTE_64F;
scale_type = CUDA_R_64F;
......@@ -108,6 +136,76 @@ void cublasLtGemm::Setup(int m, int n, int k, int batch, int lda, int ldb, int l
}
CUBLAS_CHECK(
cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)));
#if CUDA_VERSION >= 12080
if (a_type == CUDA_R_4F_E2M1 || b_type == CUDA_R_4F_E2M1) {
// Allocate and copy device scale values
const auto a_scale = __nv_fp8_e4m3{1.f}, b_scale = __nv_fp8_e4m3{1.f}, d_out_scale = __nv_fp8_e4m3{1.f};
const auto d_scale = 1.f;
void *AscaleDev, *BscaleDev, *DscaleDev, *DOutscaleDev;
// Set scale modes
cublasLtMatmulMatrixScale_t AScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
cublasLtMatmulMatrixScale_t BScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
cublasLtMatmulMatrixScale_t DScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F;
cublasLtMatmulMatrixScale_t DOutScaleMode = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_MODE, &AScaleMode,
sizeof(AScaleMode)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_MODE, &BScaleMode,
sizeof(BScaleMode)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_SCALE_MODE, &DScaleMode,
sizeof(DScaleMode)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE,
&DOutScaleMode, sizeof(DOutScaleMode)));
const auto a_scale_size =
GetScaleTensorSize(transa != CUBLAS_OP_N ? k : m, transa != CUBLAS_OP_N ? m : k, AScaleMode);
const auto b_scale_size =
GetScaleTensorSize(transb != CUBLAS_OP_N ? n : k, transb != CUBLAS_OP_N ? k : n, BScaleMode);
const auto d_scale_size = GetScaleTensorSize(m, n, DScaleMode);
const auto d_out_scale_size = GetScaleTensorSize(m, n, DOutScaleMode);
if (a_scale_size > 0) {
__nv_fp8_e4m3 *a_scale_host = new __nv_fp8_e4m3[a_scale_size];
std::fill_n(a_scale_host, a_scale_size, a_scale);
cudaMalloc(&AscaleDev, a_scale_size * sizeof(__nv_fp8_e4m3));
cudaMemcpy(AscaleDev, a_scale_host, a_scale_size * sizeof(__nv_fp8_e4m3), cudaMemcpyHostToDevice);
delete[] a_scale_host;
}
if (b_scale_size > 0) {
__nv_fp8_e4m3 *b_scale_host = new __nv_fp8_e4m3[b_scale_size];
std::fill_n(b_scale_host, b_scale_size, b_scale);
cudaMalloc(&BscaleDev, b_scale_size * sizeof(__nv_fp8_e4m3));
cudaMemcpy(BscaleDev, b_scale_host, b_scale_size * sizeof(__nv_fp8_e4m3), cudaMemcpyHostToDevice);
delete[] b_scale_host;
}
if (d_scale_size > 0) {
float *d_scale_host = new float[d_scale_size];
std::fill_n(d_scale_host, d_scale_size, d_scale);
cudaMalloc(&DscaleDev, d_scale_size * sizeof(float));
cudaMemcpy(DscaleDev, d_scale_host, d_scale_size * sizeof(float), cudaMemcpyHostToDevice);
delete[] d_scale_host;
}
if (d_out_scale_size > 0) {
__nv_fp8_e4m3 *d_out_scale_host = new __nv_fp8_e4m3[d_out_scale_size];
std::fill_n(d_out_scale_host, d_out_scale_size, d_out_scale);
cudaMalloc(&DOutscaleDev, d_out_scale_size * sizeof(__nv_fp8_e4m3));
cudaMemcpy(DOutscaleDev, d_out_scale_host, d_out_scale_size * sizeof(__nv_fp8_e4m3),
cudaMemcpyHostToDevice);
delete[] d_out_scale_host;
}
// Use device scale pointer attributes
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &AscaleDev,
sizeof(void *)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &BscaleDev,
sizeof(void *)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &DscaleDev,
sizeof(void *)));
CUBLAS_CHECK(cublasLtMatmulDescSetAttribute(op_desc_.get(), CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER,
&DOutscaleDev, sizeof(void *)));
}
#endif
}
size_t cublasLtGemm::GetAlgorithm(int max_algorithm_count, size_t max_workspace_size) {
......
......@@ -45,9 +45,10 @@ class cublasLtGemm {
void Init();
void Setup(int m, int n, int k, int batch, int lda, int ldb, int ldd, cudaDataType_t a_type, cudaDataType_t b_type,
cudaDataType_t d_type, cublasOperation_t transa, cublasOperation_t transb, cublasLtEpilogue_t epilogue,
void *a_scale_inverse = nullptr, void *b_scale_inverse = nullptr);
void Setup(int m, int n, int k, int batch, int lda, int ldb, int ldc, int ldd, cudaDataType_t a_type,
cudaDataType_t b_type, cudaDataType_t c_type, cudaDataType_t d_type, cublasOperation_t transa,
cublasOperation_t transb, cublasLtEpilogue_t epilogue, void *a_scale_inverse = nullptr,
void *b_scale_inverse = nullptr);
size_t GetAlgorithm(int max_algorithm_count, size_t max_workspace_size);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment