Unverified Commit c3769cb7 authored by Paweł Gadziński's avatar Paweł Gadziński Committed by GitHub
Browse files

Fix minimum version of cublas for grouped gemm (#2631)



* version change
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

---------
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>
parent f8cca8b9
...@@ -102,8 +102,8 @@ std::vector<std::tuple<size_t, size_t, size_t>> make_shapes(ShapeCase scase) { ...@@ -102,8 +102,8 @@ std::vector<std::tuple<size_t, size_t, size_t>> make_shapes(ShapeCase scase) {
} }
void run_grouped_gemm_case(const TestParams& params) { void run_grouped_gemm_case(const TestParams& params) {
#if CUBLAS_VERSION < 130100 #if CUBLAS_VERSION < 130200
GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.1+, but compile-time cuBLAS version is " GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is "
<< CUBLAS_VERSION << "."; << CUBLAS_VERSION << ".";
#else #else
if (getDeviceComputeCapability() < blackwellComputeCapability) { if (getDeviceComputeCapability() < blackwellComputeCapability) {
...@@ -267,7 +267,7 @@ void run_grouped_gemm_case(const TestParams& params) { ...@@ -267,7 +267,7 @@ void run_grouped_gemm_case(const TestParams& params) {
atol, atol,
rtol); rtol);
} }
#endif // CUBLAS_VERSION >= 130100 #endif // CUBLAS_VERSION >= 130200
} }
class GroupedGemmTest : public ::testing::TestWithParam<TestParams> {}; class GroupedGemmTest : public ::testing::TestWithParam<TestParams> {};
......
...@@ -494,9 +494,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -494,9 +494,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
#endif // CUBLAS_VERSION >= 120800 #endif // CUBLAS_VERSION >= 120800
} else if (mxfp8_gemm) { } else if (mxfp8_gemm) {
#if CUBLAS_VERSION >= 120800 #if CUBLAS_VERSION >= 120800
NVTE_CHECK(cuda::cublas_version() >= 120800, NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800,
"MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ", "MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ",
cuda::cublas_version()); transformer_engine::cuda::cublas_version());
// Check that scales are in expected format // Check that scales are in expected format
NVTE_CHECK(inputA->with_gemm_swizzled_scales, NVTE_CHECK(inputA->with_gemm_swizzled_scales,
...@@ -518,7 +518,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -518,7 +518,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
// Workaround for heuristic cache bug in cublasLt. This separates the MXFP8 cache key from non-block scaling. // Workaround for heuristic cache bug in cublasLt. This separates the MXFP8 cache key from non-block scaling.
// CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is unused for block scaling so it's safe to set. // CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is unused for block scaling so it's safe to set.
if (cuda::cublas_version() <= 120803) { if (transformer_engine::cuda::cublas_version() <= 120803) {
const int64_t dummy_a_vec_stride = 1; const int64_t dummy_a_vec_stride = 1;
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE, &dummy_a_vec_stride, operationDesc, CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE, &dummy_a_vec_stride,
...@@ -530,9 +530,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -530,9 +530,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
#endif // CUBLAS_VERSION >= 120800 #endif // CUBLAS_VERSION >= 120800
} else if (use_fp4) { // NVFP4 GEMM } else if (use_fp4) { // NVFP4 GEMM
#if CUBLAS_VERSION >= 120800 #if CUBLAS_VERSION >= 120800
NVTE_CHECK(cuda::cublas_version() >= 120800, NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800,
"FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ", "FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ",
cuda::cublas_version()); transformer_engine::cuda::cublas_version());
// Check that scales are in expected format // Check that scales are in expected format
NVTE_CHECK(inputA->with_gemm_swizzled_scales, NVTE_CHECK(inputA->with_gemm_swizzled_scales,
...@@ -567,9 +567,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -567,9 +567,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
(inputB->scaling_mode == NVTE_BLOCK_SCALING_1D || (inputB->scaling_mode == NVTE_BLOCK_SCALING_1D ||
inputB->scaling_mode == NVTE_BLOCK_SCALING_2D)) { inputB->scaling_mode == NVTE_BLOCK_SCALING_2D)) {
#if CUBLAS_VERSION >= 120900 #if CUBLAS_VERSION >= 120900
NVTE_CHECK(cuda::cublas_version() >= 120900, NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120900,
"FP8 block scaling requires cuBLAS 12.9+, but run-time cuBLAS version is ", "FP8 block scaling requires cuBLAS 12.9+, but run-time cuBLAS version is ",
cuda::cublas_version()); transformer_engine::cuda::cublas_version());
// Check that matrix formats are valid // Check that matrix formats are valid
NVTE_CHECK((!(inputA->scaling_mode == NVTE_BLOCK_SCALING_2D && NVTE_CHECK((!(inputA->scaling_mode == NVTE_BLOCK_SCALING_2D &&
...@@ -602,7 +602,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -602,7 +602,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
} }
#if CUBLAS_VERSION >= 120800 #if CUBLAS_VERSION >= 120800
if (cuda::cublas_version() >= 120800) { if (transformer_engine::cuda::cublas_version() >= 120800) {
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc,
CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_DESC_A_SCALE_MODE,
&scaling_mode_a, sizeof(scaling_mode_a))); &scaling_mode_a, sizeof(scaling_mode_a)));
...@@ -619,7 +619,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -619,7 +619,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, &D_amax, sizeof(D_amax))); operationDesc, CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, &D_amax, sizeof(D_amax)));
#if CUBLAS_VERSION >= 120800 #if CUBLAS_VERSION >= 120800
if (cuda::cublas_version() >= 120800) { if (transformer_engine::cuda::cublas_version() >= 120800) {
// NOTE: In all current cases where FP8 output is supported, the input is // NOTE: In all current cases where FP8 output is supported, the input is
// scaled identically to the output. // scaled identically to the output.
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc,
...@@ -703,12 +703,14 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD, ...@@ -703,12 +703,14 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
"Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but compile-time cuBLAS version is ", "Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but compile-time cuBLAS version is ",
CUBLAS_VERSION); CUBLAS_VERSION);
#else #else
NVTE_CHECK(cuda::cudart_version() >= 12020 && cuda::cudart_version() < 13000, NVTE_CHECK(transformer_engine::cuda::cudart_version() >= 12020 &&
transformer_engine::cuda::cudart_version() < 13000,
"Atomic GEMM requires CUDA >=12.2.0 and <13.0.0, but run-time CUDA version is ", "Atomic GEMM requires CUDA >=12.2.0 and <13.0.0, but run-time CUDA version is ",
cuda::cudart_version()); transformer_engine::cuda::cudart_version());
NVTE_CHECK(cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000, NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120205 &&
transformer_engine::cuda::cublas_version() < 130000,
"Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but run-time cuBLAS version is ", "Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but run-time cuBLAS version is ",
cuda::cublas_version()); transformer_engine::cuda::cublas_version());
if (m_split == 0) m_split = 1; if (m_split == 0) m_split = 1;
if (n_split == 0) n_split = 1; if (n_split == 0) n_split = 1;
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
...@@ -934,9 +936,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor ...@@ -934,9 +936,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
"Atomic GEMM requires CUDA version >=12.2.0 and <13.0.0, but run-time CUDA version is ", "Atomic GEMM requires CUDA version >=12.2.0 and <13.0.0, but run-time CUDA version is ",
transformer_engine::cuda::cudart_version()); transformer_engine::cuda::cudart_version());
NVTE_CHECK( NVTE_CHECK(
cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000, transformer_engine::cuda::cublas_version() >= 120205 &&
transformer_engine::cuda::cublas_version() < 130000,
"Atomic GEMM requires cuBLAS version >=12.2.5 and <13.0.0, but run-time cuBLAS version is ", "Atomic GEMM requires cuBLAS version >=12.2.5 and <13.0.0, but run-time cuBLAS version is ",
cuda::cublas_version()); transformer_engine::cuda::cublas_version());
const Tensor *inputA = convertNVTETensorCheck(A); const Tensor *inputA = convertNVTETensorCheck(A);
const Tensor *inputB = convertNVTETensorCheck(B); const Tensor *inputB = convertNVTETensorCheck(B);
......
...@@ -26,7 +26,7 @@ inline void CreateCublasHandle(cublasLtHandle_t *handle) { ...@@ -26,7 +26,7 @@ inline void CreateCublasHandle(cublasLtHandle_t *handle) {
} // namespace } // namespace
#if CUBLAS_VERSION >= 130100 #if CUBLAS_VERSION >= 130200
namespace { namespace {
...@@ -543,13 +543,13 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT ...@@ -543,13 +543,13 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT
NVTE_API_CALL(nvte_grouped_gemm); NVTE_API_CALL(nvte_grouped_gemm);
using namespace transformer_engine; using namespace transformer_engine;
// Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.1+ // Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+
const int current_device = cuda::current_device(); const int current_device = transformer_engine::cuda::current_device();
NVTE_CHECK(cuda::sm_arch(current_device) >= 100, NVTE_CHECK(transformer_engine::cuda::sm_arch(current_device) >= 100,
"nvte_grouped_gemm requires Blackwell (SM100) or newer architecture."); "nvte_grouped_gemm requires Blackwell (SM100) or newer architecture.");
NVTE_CHECK(cuda::cublas_version() >= 130100, NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 130200,
"nvte_grouped_gemm requires cuBLAS 13.1+, but run-time cuBLAS version is ", "nvte_grouped_gemm requires cuBLAS 13.2+, but run-time cuBLAS version is ",
cuda::cublas_version()); transformer_engine::cuda::cublas_version());
// Convert to internal types // Convert to internal types
const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A); const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A);
...@@ -631,15 +631,15 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT ...@@ -631,15 +631,15 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT
kGroupedGemmCublasWorkspaceSize, stream)); kGroupedGemmCublasWorkspaceSize, stream));
} }
#else // CUBLAS_VERSION < 130100 #else // CUBLAS_VERSION < 130200
void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb, void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb,
const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha, const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha,
const NVTETensor beta, NVTETensor workspace_setup, const NVTETensor beta, NVTETensor workspace_setup,
NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config, NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config,
cudaStream_t stream) { cudaStream_t stream) {
NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.1+, but compile-time cuBLAS version is ", NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ",
CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer."); CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer.");
} }
#endif // CUBLAS_VERSION >= 130100 #endif // CUBLAS_VERSION >= 130200
...@@ -299,7 +299,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor ...@@ -299,7 +299,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
/* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */ /* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */
/*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C /*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C
* *
* \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture. * \note Requires cuBLAS 13.2+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture.
* Will error at runtime if compiled with an older cuBLAS version or run on * Will error at runtime if compiled with an older cuBLAS version or run on
* a pre-Blackwell GPU. * a pre-Blackwell GPU.
* *
...@@ -322,7 +322,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor ...@@ -322,7 +322,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
* \param[in] stream CUDA stream for the operation. * \param[in] stream CUDA stream for the operation.
* *
* Requirements: * Requirements:
* - cuBLAS 13.1+ (CUDA 13.1+) * - cuBLAS 13.2+ (CUDA 13.1+)
* - Blackwell (SM100) or newer GPU architecture * - Blackwell (SM100) or newer GPU architecture
* - A, B, C (if provided), D must have the same num_tensors * - A, B, C (if provided), D must have the same num_tensors
* - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i] * - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment