Unverified Commit c3769cb7 authored by Paweł Gadziński's avatar Paweł Gadziński Committed by GitHub
Browse files

Fix minimum version of cublas for grouped gemm (#2631)



* version change
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

* fix
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>

---------
Signed-off-by: default avatarPawel Gadzinski <pgadzinski@nvidia.com>
parent f8cca8b9
......@@ -102,8 +102,8 @@ std::vector<std::tuple<size_t, size_t, size_t>> make_shapes(ShapeCase scase) {
}
void run_grouped_gemm_case(const TestParams& params) {
#if CUBLAS_VERSION < 130100
GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.1+, but compile-time cuBLAS version is "
#if CUBLAS_VERSION < 130200
GTEST_SKIP() << "Grouped GEMM requires cuBLAS 13.2+, but compile-time cuBLAS version is "
<< CUBLAS_VERSION << ".";
#else
if (getDeviceComputeCapability() < blackwellComputeCapability) {
......@@ -267,7 +267,7 @@ void run_grouped_gemm_case(const TestParams& params) {
atol,
rtol);
}
#endif // CUBLAS_VERSION >= 130100
#endif // CUBLAS_VERSION >= 130200
}
class GroupedGemmTest : public ::testing::TestWithParam<TestParams> {};
......
......@@ -494,9 +494,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
#endif // CUBLAS_VERSION >= 120800
} else if (mxfp8_gemm) {
#if CUBLAS_VERSION >= 120800
NVTE_CHECK(cuda::cublas_version() >= 120800,
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800,
"MXFP8 requires cuBLAS 12.8+, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());
// Check that scales are in expected format
NVTE_CHECK(inputA->with_gemm_swizzled_scales,
......@@ -518,7 +518,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
// Workaround for heuristic cache bug in cublasLt. This separates the MXFP8 cache key from non-block scaling.
// CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is unused for block scaling so it's safe to set.
if (cuda::cublas_version() <= 120803) {
if (transformer_engine::cuda::cublas_version() <= 120803) {
const int64_t dummy_a_vec_stride = 1;
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE, &dummy_a_vec_stride,
......@@ -530,9 +530,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
#endif // CUBLAS_VERSION >= 120800
} else if (use_fp4) { // NVFP4 GEMM
#if CUBLAS_VERSION >= 120800
NVTE_CHECK(cuda::cublas_version() >= 120800,
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120800,
"FP4 requires cuBLAS 12.8+, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());
// Check that scales are in expected format
NVTE_CHECK(inputA->with_gemm_swizzled_scales,
......@@ -567,9 +567,9 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
(inputB->scaling_mode == NVTE_BLOCK_SCALING_1D ||
inputB->scaling_mode == NVTE_BLOCK_SCALING_2D)) {
#if CUBLAS_VERSION >= 120900
NVTE_CHECK(cuda::cublas_version() >= 120900,
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120900,
"FP8 block scaling requires cuBLAS 12.9+, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());
// Check that matrix formats are valid
NVTE_CHECK((!(inputA->scaling_mode == NVTE_BLOCK_SCALING_2D &&
......@@ -602,7 +602,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
}
#if CUBLAS_VERSION >= 120800
if (cuda::cublas_version() >= 120800) {
if (transformer_engine::cuda::cublas_version() >= 120800) {
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc,
CUBLASLT_MATMUL_DESC_A_SCALE_MODE,
&scaling_mode_a, sizeof(scaling_mode_a)));
......@@ -619,7 +619,7 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, &D_amax, sizeof(D_amax)));
#if CUBLAS_VERSION >= 120800
if (cuda::cublas_version() >= 120800) {
if (transformer_engine::cuda::cublas_version() >= 120800) {
// NOTE: In all current cases where FP8 output is supported, the input is
// scaled identically to the output.
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc,
......@@ -703,12 +703,14 @@ void cublas_gemm(const Tensor *inputA, const Tensor *inputB, Tensor *outputD,
"Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but compile-time cuBLAS version is ",
CUBLAS_VERSION);
#else
NVTE_CHECK(cuda::cudart_version() >= 12020 && cuda::cudart_version() < 13000,
NVTE_CHECK(transformer_engine::cuda::cudart_version() >= 12020 &&
transformer_engine::cuda::cudart_version() < 13000,
"Atomic GEMM requires CUDA >=12.2.0 and <13.0.0, but run-time CUDA version is ",
cuda::cudart_version());
NVTE_CHECK(cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000,
transformer_engine::cuda::cudart_version());
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 120205 &&
transformer_engine::cuda::cublas_version() < 130000,
"Atomic GEMM requires cuBLAS >=12.2.5 and <13.0.0, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());
if (m_split == 0) m_split = 1;
if (n_split == 0) n_split = 1;
NVTE_CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
......@@ -934,9 +936,10 @@ void nvte_cublas_atomic_gemm(const NVTETensor A, const NVTETensor B, NVTETensor
"Atomic GEMM requires CUDA version >=12.2.0 and <13.0.0, but run-time CUDA version is ",
transformer_engine::cuda::cudart_version());
NVTE_CHECK(
cuda::cublas_version() >= 120205 && cuda::cublas_version() < 130000,
transformer_engine::cuda::cublas_version() >= 120205 &&
transformer_engine::cuda::cublas_version() < 130000,
"Atomic GEMM requires cuBLAS version >=12.2.5 and <13.0.0, but run-time cuBLAS version is ",
cuda::cublas_version());
transformer_engine::cuda::cublas_version());
const Tensor *inputA = convertNVTETensorCheck(A);
const Tensor *inputB = convertNVTETensorCheck(B);
......
......@@ -26,7 +26,7 @@ inline void CreateCublasHandle(cublasLtHandle_t *handle) {
} // namespace
#if CUBLAS_VERSION >= 130100
#if CUBLAS_VERSION >= 130200
namespace {
......@@ -543,13 +543,13 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT
NVTE_API_CALL(nvte_grouped_gemm);
using namespace transformer_engine;
// Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.1+
const int current_device = cuda::current_device();
NVTE_CHECK(cuda::sm_arch(current_device) >= 100,
// Grouped GEMM requires Blackwell (SM100) or newer and cuBLAS 13.2+
const int current_device = transformer_engine::cuda::current_device();
NVTE_CHECK(transformer_engine::cuda::sm_arch(current_device) >= 100,
"nvte_grouped_gemm requires Blackwell (SM100) or newer architecture.");
NVTE_CHECK(cuda::cublas_version() >= 130100,
"nvte_grouped_gemm requires cuBLAS 13.1+, but run-time cuBLAS version is ",
cuda::cublas_version());
NVTE_CHECK(transformer_engine::cuda::cublas_version() >= 130200,
"nvte_grouped_gemm requires cuBLAS 13.2+, but run-time cuBLAS version is ",
transformer_engine::cuda::cublas_version());
// Convert to internal types
const GroupedTensor *inputA = convertNVTEGroupedTensorCheck(A);
......@@ -631,15 +631,15 @@ void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedT
kGroupedGemmCublasWorkspaceSize, stream));
}
#else // CUBLAS_VERSION < 130100
#else // CUBLAS_VERSION < 130200
/*! \brief Fallback stub compiled when cuBLAS is older than 13.2.
 *
 *  Grouped GEMM needs cublasLt APIs introduced in cuBLAS 13.2 (CUDA 13.1),
 *  so when the compile-time CUBLAS_VERSION is too old this stub replaces the
 *  real implementation and fails fast with a descriptive error instead of
 *  silently doing nothing. The signature matches the real nvte_grouped_gemm
 *  exactly so callers link against either variant unchanged.
 */
void nvte_grouped_gemm(const NVTEGroupedTensor A, int transa, const NVTEGroupedTensor B, int transb,
                       const NVTEGroupedTensor C, NVTEGroupedTensor D, const NVTETensor alpha,
                       const NVTETensor beta, NVTETensor workspace_setup,
                       NVTETensor workspace_cublas, NVTEGroupedMatmulConfig config,
                       cudaStream_t stream) {
  // NOTE(review): the "13.2+" requirement matches the run-time check and the
  // public header docs in this commit; "CUDA 13.1" is the toolkit that ships
  // cuBLAS 13.2 per those docs.
  NVTE_ERROR("nvte_grouped_gemm requires cuBLAS 13.2+, but compile-time cuBLAS version is ",
             CUBLAS_VERSION, ". Please upgrade to CUDA 13.1 or newer.");
}
#endif // CUBLAS_VERSION >= 130100
#endif // CUBLAS_VERSION >= 130200
......@@ -299,7 +299,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
/* EXPERIMENTAL FEATURE AND SUBJECT TO CHANGE. */
/*! \brief Grouped matrix multiplication: D = alpha * op(A) @ op(B) + beta * C
*
* \note Requires cuBLAS 13.1+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture.
* \note Requires cuBLAS 13.2+ (CUDA 13.1+) and Blackwell (SM100) or newer GPU architecture.
* Will error at runtime if compiled with an older cuBLAS version or run on
* a pre-Blackwell GPU.
*
......@@ -322,7 +322,7 @@ void nvte_multi_tensor_gemm(const NVTETensor *A, const NVTETensor *B, NVTETensor
* \param[in] stream CUDA stream for the operation.
*
* Requirements:
* - cuBLAS 13.1+ (CUDA 13.1+)
* - cuBLAS 13.2+ (CUDA 13.1+)
* - Blackwell (SM100) or newer GPU architecture
* - A, B, C (if provided), D must have the same num_tensors
* - For each i: D[i] = alpha[i] * op(A[i]) @ op(B[i]) + beta[i] * C[i]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment