Commit ee33e2e7 authored by zhouxiang's avatar zhouxiang
Browse files

support dtk23.10

parent e432dbb0
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
__dcu_version__ = '0.0.13'
__dcu_version__ = '0.0.13+gite432dbb.abi0.dtk2310.torch1.13'
__version__ = '0.0.13'
short_version = __version__
......
......@@ -37,14 +37,14 @@ __forceinline__ __device__ float copysignf_pos(float a, float b)
// Fast device-side tanh approximation.
// NOTE(review): this text comes from a commit diff ("support dtk23.10") with
// the +/- markers stripped; the commit appears to comment out the NVIDIA-only
// PTX fast path (`tanh.approx.f32`, sm_75+/CUDA 11+) so the function always
// uses the portable formula below when built with the DCU/dtk toolchain —
// confirm against the actual post-commit file.
__inline__ __device__ float tanh_opt(float x)
{
#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000)
// Hardware tanh approximation via inline PTX (NVIDIA sm_75+ only).
float r;
asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
return r;
#else
// #if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000)
// float r;
// asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x));
// return r;
// #else
// Portable path: tanh(x) = sign(x) * (1 - e^(-2|x|)) / (1 + e^(-2|x|)),
// using the fast __expf intrinsic and copysignf_pos (defined earlier in
// this file) to restore the sign of x.
const float exp_val = -1.f * fabs(2 * x);
return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x);
#endif
// #endif
}
template<typename T>
......
......@@ -7,11 +7,11 @@
namespace turbomind {
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 4)
#define L2_CACHEHINT(size) ".L2::" #size "B"
#else
// #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 4)
// #define L2_CACHEHINT(size) ".L2::" #size "B"
// #else
#define L2_CACHEHINT(size)
#endif
// #endif
template<typename T>
__inline__ __device__ void cp_async_cg_A(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask)
......
......@@ -61,12 +61,12 @@ __inline__ __device__ uint transpose_m8n8_b16_movmatrix(uint a)
// Warp-cooperative 8x8 transpose of 16-bit elements packed in `a`.
// NOTE(review): this text comes from a commit diff with +/- markers stripped;
// the commit appears to comment out the `movmatrix`-based fast path
// (CUDA 11.8+ / NVIDIA-only) so that the DCU/dtk build always falls back to
// the warp-shuffle implementation — confirm against the post-commit file.
// `lane_id` is only consumed by the shuffle fallback; the movmatrix path
// derives lane mapping in hardware and explicitly discards it.
__inline__ __device__ uint transpose_m8n8_b16(uint a, int lane_id)
{
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8)
(void)lane_id;
return transpose_m8n8_b16_movmatrix(a);
#else
// #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 8)
// (void)lane_id;
// return transpose_m8n8_b16_movmatrix(a);
// #else
// Portable fallback built on warp shuffles (defined earlier in this file).
return transpose_m8n8_b16_warp_shuffle(a, lane_id);
#endif
// #endif
}
namespace ops {
......
......@@ -16,11 +16,11 @@
#pragma once
#include <array>
#include <assert.h>
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
#include <cooperative_groups/reduce.h>
#else
// #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
// #include <cooperative_groups/reduce.h>
// #else
#include <cooperative_groups.h>
#endif
// #endif
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include <cuda_fp16.h>
......@@ -244,15 +244,15 @@ __inline__ __device__ void cgBlockReduceSumElements(float* element_list, float*
const int tid = cta.thread_rank();
const int blockz = blockDim.x;
for (int i = 0; i < NUM; i++) {
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
cgBlockReduceSumElements_shm[i * blockz + tid] = cg::reduce(tile, element_list[i], cg::plus<float>());
#else
// #if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
// cgBlockReduceSumElements_shm[i * blockz + tid] = cg::reduce(tile, element_list[i], cg::plus<float>());
// #else
// TODO Add implementation here
if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("[ERROR] Not support cgBlockReduceSumElements when CUDA < 11 \n");
assert(false);
}
#endif
// #endif
}
cg::sync(cta);
if (tid == 0) {
......
......@@ -77,11 +77,11 @@ if (BUILD_MULTI_GPU)
target_link_libraries(nccl_utils PUBLIC ${NCCL_LIBRARIES} logger)
endif()
add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
# add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
#set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#target_link_libraries(cublasINT8MMWrapper PUBLIC cublasLt cudart curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
target_link_libraries(cublasINT8MMWrapper PUBLIC cudart curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
# target_link_libraries(cublasINT8MMWrapper PUBLIC cudart curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
if(ENABLE_FP8)
add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
......@@ -108,7 +108,7 @@ if (SPARSITY_SUPPORT)
target_link_libraries(gemm PUBLIC cusparse -lcusparseLt)
endif()
add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
# add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
#set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......
......@@ -44,9 +44,9 @@
#include "src/turbomind/utils/logger.h"
#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
// #if defined(CUDART_VERSION) && CUDART_VERSION < 11020
#define CUDA_MEMORY_POOL_DISABLED
#endif
// #endif
namespace turbomind {
......
......@@ -237,10 +237,10 @@ void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// #endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
......@@ -462,10 +462,10 @@ void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// #endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
......
......@@ -94,11 +94,11 @@ void cublasINT8MMWrapper::Gemm(int* res,
{
mu_->lock();
cublasOperation_t opTranspose = CUBLAS_OP_T;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
// #else
cudaDataType_t computeType = CUDA_R_32I;
#endif
// #endif
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t AtransformDesc = NULL;
cublasLtMatrixLayout_t BtransformDesc = NULL;
......@@ -106,16 +106,16 @@ void cublasINT8MMWrapper::Gemm(int* res,
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4_) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
// #if (CUDART_VERSION >= 11000)
// if (use_ORDER_COL32_2R_4R4_) {
// order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
// }
// else {
// order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// }
// #else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
// #endif
int ldaTransform = 32 * m;
int ldbTransform;
......@@ -128,11 +128,11 @@ void cublasINT8MMWrapper::Gemm(int* res,
int ldcTransform = 32 * m;
// create matmulDesc
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
#else
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
// #else
cublasLtMatmulDescCreate(&matmulDesc, computeType);
#endif
// #endif
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
......@@ -187,10 +187,10 @@ void cublasINT8MMWrapper::Gemm(int* res,
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
// #endif
}
else {
findAlgo = 1;
......@@ -215,16 +215,16 @@ void cublasINT8MMWrapper::Gemm(int* res,
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
int stages;
if (use_ORDER_COL32_2R_4R4_) {
stages = 15;
}
else {
stages = 13;
}
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
// #if (CUDART_VERSION >= 11000)
// int stages;
// if (use_ORDER_COL32_2R_4R4_) {
// stages = 15;
// }
// else {
// stages = 13;
// }
// cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
// #endif
}
cublasLtMatmul(cublaslt_handle_,
......@@ -273,11 +273,11 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
// int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
// cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
cudaDataType_t scaleType = CUDA_R_32F;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
// #else
cudaDataType_t computeType = CUDA_R_32I;
#endif
// #endif
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t AtransformDesc = NULL;
cublasLtMatrixLayout_t BtransformDesc = NULL;
......@@ -285,16 +285,16 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4_) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
// #if (CUDART_VERSION >= 11000)
// if (use_ORDER_COL32_2R_4R4_) {
// order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
// }
// else {
// order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// }
// #else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
// #endif
int ldaTransform = 32 * m;
......@@ -309,11 +309,11 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
int ldcTransform = 32 * m;
// create matmulDesc
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
#else
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
// #else
cublasLtMatmulDescCreate(&matmulDesc, computeType);
#endif
// #endif
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scaleType, sizeof(scaleType));
// cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
......@@ -367,10 +367,10 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
// #endif
}
else {
findAlgo = 1;
......@@ -395,16 +395,16 @@ void cublasINT8MMWrapper::Gemm(int8_t* res,
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
int stages;
if (use_ORDER_COL32_2R_4R4_) {
stages = 15;
}
else {
stages = 13;
}
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
// #if (CUDART_VERSION >= 11000)
// int stages;
// if (use_ORDER_COL32_2R_4R4_) {
// stages = 15;
// }
// else {
// stages = 13;
// }
// cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
// #endif
}
float beta = 0.0f;
......
......@@ -192,118 +192,119 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
}
}
// if (using_cublasLt) {
if (0) {
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaDataType_t scaleType;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (is_fp16_computeType) {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
scaleType = CUDA_R_16F;
}
else {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
scaleType = CUDA_R_32F;
}
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workSpace = cublas_workspace_;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&(info.reductionScheme),
sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
&(info.cluster_shapeId),
sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
// cublasLtMatmul(cublaslt_handle_,
// operationDesc,
// alpha,
// A,
// Adesc,
// B,
// Bdesc,
// beta,
// C,
// Cdesc,
// C,
// Cdesc,
// (findAlgo == 1 ? (&algo) : NULL),
// workSpace,
// workspaceSize,
// stream_);
cublasLtMatmulDescDestroy(operationDesc);
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
sync_check_cuda_error();
}
else {
// if (using_cublasLt) {
// if (0) {
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cudaDataType_t scaleType;
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t computeType;
// #else
// cudaDataType_t computeType;
// #endif
// if (is_fp16_computeType) {
// #if (CUDART_VERSION >= 11000)
// computeType = CUBLAS_COMPUTE_16F;
// #else
// computeType = CUDA_R_16F;
// #endif
// scaleType = CUDA_R_16F;
// }
// else {
// #if (CUDART_VERSION >= 11000)
// computeType = CUBLAS_COMPUTE_32F;
// #else
// computeType = CUDA_R_32F;
// #endif
// scaleType = CUDA_R_32F;
// }
// // --------------------------------------
// // Create descriptors for the original matrices
// cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
// cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
// cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// #else
// cublasLtMatmulDescCreate(&operationDesc, computeType);
// #endif
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
// cublasLtMatmulAlgo_t algo;
// void* workSpace = cublas_workspace_;
// int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
// if (findAlgo) {
// if (info.workspaceSize > workspaceSize) {
// findAlgo = 0;
// }
// else {
// cublasLtMatmulAlgoInit(
// cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
// &(info.reductionScheme),
// sizeof(info.reductionScheme));
// // #if (CUDART_VERSION >= 11000)
// // cublasLtMatmulAlgoConfigSetAttribute(
// // &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// // #endif
// #if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
// &(info.cluster_shapeId),
// sizeof(info.cluster_shapeId));
// #elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
// #endif
// }
// }
// // cublasLtMatmul(cublaslt_handle_,
// // operationDesc,
// // alpha,
// // A,
// // Adesc,
// // B,
// // Bdesc,
// // beta,
// // C,
// // Cdesc,
// // C,
// // Cdesc,
// // (findAlgo == 1 ? (&algo) : NULL),
// // workSpace,
// // workspaceSize,
// // stream_);
// cublasLtMatmulDescDestroy(operationDesc);
// cublasLtMatrixLayoutDestroy(Adesc);
// cublasLtMatrixLayoutDestroy(Bdesc);
// cublasLtMatrixLayoutDestroy(Cdesc);
// sync_check_cuda_error();
// }
// else {
int cublasAlgo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
......@@ -325,7 +326,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo)));
sync_check_cuda_error();
}
// }
mu_->unlock();
}
......@@ -382,81 +383,81 @@ CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type)
return FLOAT_DATATYPE;
}
#if (CUDART_VERSION >= 11000)
// input, weight, output are row-major
// only works for cublas 11.x
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t Atype, Btype, Ctype;
cublasComputeType_t computeType;
cudaDataType_t scaleType;
float alpha_float = 1.0f;
float beta_float = 0.0f;
half alpha_half = half(1.0f);
half beta_half = half(0.0f);
void * alpha, *beta;
// int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
if (Atype_ == CUDA_R_32F) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
Atype = CUDA_R_32F;
Btype = CUDA_R_32F;
Ctype = CUDA_R_32F;
scaleType = CUDA_R_32F;
alpha = &alpha_float;
beta = &beta_float;
}
else if (Atype_ == CUDA_R_16BF) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
Atype = CUDA_R_16BF;
Btype = CUDA_R_16BF;
Ctype = CUDA_R_16BF;
scaleType = CUDA_R_32F;
alpha = &alpha_float;
beta = &beta_float;
}
else {
computeType = CUBLAS_COMPUTE_16F;
Atype = CUDA_R_16F;
Btype = CUDA_R_16F;
Ctype = CUDA_R_16F;
scaleType = CUDA_R_16F;
alpha = &alpha_half;
beta = &beta_half;
}
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc);
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
// check_cuda_error(cublasLtMatmul(
// cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
cublasLtMatmulDescDestroy(operationDesc);
}
#endif
// #if (CUDART_VERSION >= 11000)
// // input, weight, output are row-major
// // only works for cublas 11.x
// void cublasMMWrapper::Gemm(cublasOperation_t transa,
// cublasOperation_t transb,
// const int m,
// const int n,
// const int k,
// const void* A,
// const int lda,
// const void* B,
// const int ldb,
// const void* bias,
// void* C,
// const int ldc)
// {
// TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// cudaDataType_t Atype, Btype, Ctype;
// cublasComputeType_t computeType;
// cudaDataType_t scaleType;
// float alpha_float = 1.0f;
// float beta_float = 0.0f;
// half alpha_half = half(1.0f);
// half beta_half = half(0.0f);
// void * alpha, *beta;
// // int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
// if (Atype_ == CUDA_R_32F) {
// computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
// Atype = CUDA_R_32F;
// Btype = CUDA_R_32F;
// Ctype = CUDA_R_32F;
// scaleType = CUDA_R_32F;
// alpha = &alpha_float;
// beta = &beta_float;
// }
// else if (Atype_ == CUDA_R_16BF) {
// computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
// Atype = CUDA_R_16BF;
// Btype = CUDA_R_16BF;
// Ctype = CUDA_R_16BF;
// scaleType = CUDA_R_32F;
// alpha = &alpha_float;
// beta = &beta_float;
// }
// else {
// computeType = CUBLAS_COMPUTE_16F;
// Atype = CUDA_R_16F;
// Btype = CUDA_R_16F;
// Ctype = CUDA_R_16F;
// scaleType = CUDA_R_16F;
// alpha = &alpha_half;
// beta = &beta_half;
// }
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
// cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda);
// cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb);
// cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc);
// cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
// // check_cuda_error(cublasLtMatmul(
// // cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
// cublasLtMatrixLayoutDestroy(Adesc);
// cublasLtMatrixLayoutDestroy(Bdesc);
// cublasLtMatrixLayoutDestroy(Cdesc);
// cublasLtMatmulDescDestroy(operationDesc);
// }
// #endif
void cublasMMWrapper::setStream(cudaStream_t stream)
{
stream_ = stream;
......
......@@ -207,20 +207,20 @@ public:
CublasDataType getCublasDataType(cudaDataType_t data_type);
#if (CUDART_VERSION >= 11000)
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc);
#endif
// #if (CUDART_VERSION >= 11000)
// void Gemm(cublasOperation_t transa,
// cublasOperation_t transb,
// const int m,
// const int n,
// const int k,
// const void* A,
// const int lda,
// const void* B,
// const int ldb,
// const void* bias,
// void* C,
// const int ldc);
// #endif
void stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
......
......@@ -152,17 +152,17 @@ void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* c
return;
}
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
}
custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
#else
// #if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
// for (size_t i = 0; i < rank_size; i++) {
// custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
// }
// custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
// #else
TM_LOG_WARNING("Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm.");
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
#endif
// #endif
}
// Template instantiation
......
......@@ -269,82 +269,82 @@ void Gemm::gemm(const GemmOp transa,
}
// if (using_cublasLt) {
if(0) {
const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
cublasLtMatmulDesc_t matmul_desc = NULL;
cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
cudaDataType_t scale_type = getCublasDataType(compute_type_);
auto compute_type = getCublasComputeType(compute_type_);
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
#else
cublasLtMatmulDescCreate(&matmul_desc, compute_type);
#endif
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workspace = workspace_;
int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspace_size) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
}
}
cublasLtMatmul(cublaslt_handle_,
matmul_desc,
alpha_ptr,
a_data_ptr,
a_desc,
b_data_ptr,
b_desc,
beta_ptr,
C,
c_desc,
C,
c_desc,
(findAlgo == 1 ? (&algo) : NULL),
workspace,
workspace_size,
stream_);
cublasLtMatmulDescDestroy(matmul_desc);
cublasLtMatrixLayoutDestroy(a_desc);
cublasLtMatrixLayoutDestroy(b_desc);
cublasLtMatrixLayoutDestroy(c_desc);
sync_check_cuda_error();
}
else {
// if(0) {
// const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
// const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
// const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
// const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
// cublasLtMatmulDesc_t matmul_desc = NULL;
// cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
// cudaDataType_t scale_type = getCublasDataType(compute_type_);
// auto compute_type = getCublasComputeType(compute_type_);
// // --------------------------------------
// // Create descriptors for the original matrices
// cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
// cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
// cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
// #else
// cublasLtMatmulDescCreate(&matmul_desc, compute_type);
// #endif
// cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
// cublasLtMatmulAlgo_t algo;
// void* workspace = workspace_;
// int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
// if (findAlgo) {
// if (info.workspaceSize > workspace_size) {
// findAlgo = 0;
// }
// else {
// cublasLtMatmulAlgoInit(
// cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// #endif
// }
// }
// cublasLtMatmul(cublaslt_handle_,
// matmul_desc,
// alpha_ptr,
// a_data_ptr,
// a_desc,
// b_data_ptr,
// b_desc,
// beta_ptr,
// C,
// c_desc,
// C,
// c_desc,
// (findAlgo == 1 ? (&algo) : NULL),
// workspace,
// workspace_size,
// stream_);
// cublasLtMatmulDescDestroy(matmul_desc);
// cublasLtMatrixLayoutDestroy(a_desc);
// cublasLtMatrixLayoutDestroy(b_desc);
// cublasLtMatrixLayoutDestroy(c_desc);
// sync_check_cuda_error();
// }
// else {
cudaDataType_t compute_type = getCublasDataType(compute_type_);
int cublas_algo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
......@@ -367,7 +367,7 @@ void Gemm::gemm(const GemmOp transa,
compute_type,
static_cast<cublasGemmAlgo_t>(cublas_algo)));
sync_check_cuda_error();
}
// }
mutex_->unlock();
}
......@@ -1035,19 +1035,19 @@ cudaDataType_t getCublasDataType(DataType dtype)
}
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
case TYPE_FP16:
return CUBLAS_COMPUTE_16F;
case TYPE_FP32:
return CUBLAS_COMPUTE_32F;
default:
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#else
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t getCublasComputeType(DataType ctype)
// {
// switch (ctype) {
// case TYPE_FP16:
// return CUBLAS_COMPUTE_16F;
// case TYPE_FP32:
// return CUBLAS_COMPUTE_32F;
// default:
// throw GemmNotSupportedException("Not supported cublas compute type.");
// }
// }
// #else
cudaDataType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
......@@ -1059,7 +1059,7 @@ cudaDataType_t getCublasComputeType(DataType ctype)
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#endif
// #endif
cublasOperation_t getCublasOperation(GemmOp op)
{
......
......@@ -622,11 +622,11 @@ std::shared_ptr<Gemm>
createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse = false, bool quantized = false);
cudaDataType_t getCublasDataType(DataType dtype);
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType dtype);
#else
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t getCublasComputeType(DataType dtype);
// #else
cudaDataType_t getCublasComputeType(DataType dtype);
#endif
// #endif
cublasOperation_t getCublasOperation(GemmOp op);
std::string getGemmOpString(const GemmOp& op);
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment