Unverified commit 9efcac38, authored by Li Zhang and committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_utils.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <mutex>
#include <string>
#pragma once
namespace fastertransformer {
class cublasMMWrapper {
protected:
cublasHandle_t cublas_handle_;
cublasLtHandle_t cublaslt_handle_;
#ifdef SPARSITY_ENABLED
cusparseLtHandle_t cusparselt_handle_;
std::map<std::string, cusparseLtMatDescriptor_t> sp_mat_A_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> sp_mat_B_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> sp_mat_C_desc_map_;
#endif
cudaDataType_t Atype_;
cudaDataType_t Btype_;
cudaDataType_t Ctype_;
cudaDataType_t computeType_;
cudaStream_t stream_;
cublasAlgoMap* cublas_algo_map_;
std::mutex* mu_;
IAllocator* allocator_ = nullptr;
void* cublas_workspace_ = nullptr;
friend class cublasINT8MMWrapper;
void _Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
void* C,
const int ldc,
const void* alpha,
const int mode,
const bool per_column_scaling);
public:
cublasMMWrapper(cublasHandle_t cublas_handle_,
cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
#ifdef SPARSITY_ENABLED
cublasMMWrapper(cublasHandle_t cublas_handle_,
cublasLtHandle_t cublaslt_handle_,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
#endif
~cublasMMWrapper();
cublasMMWrapper(const cublasMMWrapper& wrapper);
virtual void cublasVersionCheck()
{
return;
};
cublasStatus_t cublasLtMatmulWrapper(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t* algo,
void* workspace,
size_t workspaceSizeInBytes,
cudaStream_t stream);
std::pair<bool, cublasLtMatmulAlgo_t> findBestAlgo(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
cudaStream_t stream);
using MatrixLayout = std::tuple<cudaDataType_t, cublasLtOrder_t, uint64_t, uint64_t>;
using cache_idx_t = std::tuple<cublasLtMatmulDesc_t, std::array<MatrixLayout, 4>>;
std::map<cache_idx_t, cublasLtMatmulAlgo_t> algo_cache;
MatrixLayout createMatrixLayout(cublasLtMatrixLayout_t Mdesc);
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* alpha,
const void* A,
cudaDataType_t Atype,
int lda,
const void* B,
cudaDataType_t Btype,
int ldb,
const void* beta,
void* C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType,
cublasGemmAlgo_t algo);
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc);
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc,
float f_alpha,
float f_beta);
void Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int8_t* C,
const int ldc,
const float* alpha,
const bool per_column_scaling = false);
void Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int32_t* C,
const int ldc);
void setFP32GemmConfig();
void setFP16GemmConfig();
#ifdef ENABLE_BF16
void setBF16GemmConfig();
#endif
void setStream(cudaStream_t stream);
void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType);
CublasDataType getCublasDataType(cudaDataType_t data_type);
#if (CUDART_VERSION >= 11000)
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc);
#endif
void stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const int64_t strideA,
const void* B,
const int ldb,
const int64_t strideB,
void* C,
const int ldc,
const int64_t strideC,
const int batchCount,
const float f_alpha = 1.0f,
const float f_beta = 0.0f);
void stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const float f_alpha,
const void* A,
cudaDataType_t AType,
const int lda,
const int64_t strideA,
const void* B,
cudaDataType_t BType,
const int ldb,
const int64_t strideB,
const float f_beta,
void* C,
cudaDataType_t CType,
const int ldc,
const int64_t strideC,
const int batch_count,
cudaDataType_t computeType);
void batchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* const* A,
const int lda,
const void* const* B,
const int ldb,
void* const* C,
const int ldc,
const int batch_count);
bool isFuseBatchGemm(const int batch_count, const int m, const int k, const int n);
#ifdef SPARSITY_ENABLED
void SpGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const void* B,
void* C);
size_t getSparseMatrixSize(int m, int k);
void compressMatrix(const void* input, void* output, const int m, const int k);
bool isUseSparse(const int batch_count, const int m, const int n, const int k);
#endif
};
} // namespace fastertransformer
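// Illustrative usage sketch (not part of the original header): one way a caller might
// drive the wrapper for a plain FP32 GEMM, assuming the wrapper forwards to cublasGemmEx
// with cuBLAS column-major semantics and that the no-alpha/beta overload uses alpha = 1,
// beta = 0. The handle, stream, algo map, mutex and allocator are assumed to be created
// elsewhere; every name below is a placeholder.
inline void example_fp32_gemm(cublasHandle_t                    cublas_handle,
                              cublasLtHandle_t                  cublaslt_handle,
                              cudaStream_t                      stream,
                              fastertransformer::cublasAlgoMap* algo_map,
                              std::mutex*                       mutex,
                              fastertransformer::IAllocator*    allocator,
                              const float*                      A,  // m x k, leading dimension m
                              const float*                      B,  // k x n, leading dimension k
                              float*                            C,  // m x n, leading dimension m
                              int                               m,
                              int                               n,
                              int                               k)
{
    fastertransformer::cublasMMWrapper wrapper(cublas_handle, cublaslt_handle, stream, algo_map, mutex, allocator);
    wrapper.setFP32GemmConfig();  // A/B/C and compute types set to CUDA_R_32F
    wrapper.Gemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, A, m, B, k, C, m);  // C = A * B
}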
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
namespace fastertransformer {
#ifdef ENABLE_BF16
inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = __low2float(val);
f_val.y = __high2float(val);
return f_val;
#else
return __bfloat1622float2(val);
#endif
}
inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = max(min(__low2float(val), 127.f), -128.f);
f_val.y = max(min(__high2float(val), 127.f), -128.f);
union { int8_t int8[2]; int16_t int16; };
int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
return int16;
#else
val = __hmin2(val, make_bfloat162(127., 127.));
val = __hmax2(val, make_bfloat162(-128., -128.));
union { int8_t int8[2]; int16_t int16; };
int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
return int16;
#endif
}
inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __floats2bfloat162_rn(val.x, val.y);
#else
return __float22bfloat162_rn(val);
#endif
}
inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
__nv_bfloat162 val2;
val2.x = val;
val2.y = val;
return val2;
#else
return __bfloat162bfloat162(val);
#endif
}
inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl + fyl, fxh + fyh);
#else
return __hadd2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) );
#else
return __hadd(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl - fyl, fxh - fyh);
#else
return __hsub2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) );
#else
return __hsub(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl * fyl, fxh * fyh);
#else
return __hmul2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) );
#else
return __hmul(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh, fzl, fzh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
fzl = __low2float(z);
fzh = __high2float(z);
return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh);
#else
return __hfma2(x, y, z);
#endif
}
inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
#else
return __hfma(x, y, z);
#endif
}
inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh;
fxl = __low2float(x);
fxh = __high2float(x);
return __floats2bfloat162_rn(expf(fxl), expf(fxh));
#else
return h2exp(x);
#endif
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); };
inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); };
inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
__nv_bfloat162 t; t.x = x; t.y = y; return t;
}
#endif
inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
#else
return a + b + c;
#endif
}
inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
#else
return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d);
#endif
}
inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch);
#else
return a + b + c;
#endif
}
inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
#else
return a * b * c;
#endif
}
inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch);
#else
return a * b * c;
#endif
}
inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
fdl = __low2float(d);
fdh = __high2float(d);
return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh);
#else
return a * b * c + d;
#endif
}
#endif // ENABLE_BF16
} // namespace fastertransformer
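// Illustrative sketch (not part of the original header): a minimal element-wise kernel
// built on the fallbacks above, so the same source compiles with native bf16 intrinsics
// on sm_80+ and with the float round-trip path on older architectures. The kernel and
// parameter names are placeholders; n2 is the number of __nv_bfloat162 elements.
#ifdef ENABLE_BF16
__global__ void example_bf16_axpy(__nv_bfloat162*       out,
                                  const __nv_bfloat162* x,
                                  const __nv_bfloat162* y,
                                  const __nv_bfloat162  alpha,
                                  int                   n2)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) {
        out[i] = fastertransformer::bf16hfma2(alpha, x[i], y[i]);  // out = alpha * x + y
    }
}
#endif  // ENABLE_BF16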
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_fp8_utils.h"
namespace fastertransformer {
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
__global__ void quantizeMatrix(T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n)
{
for (uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; i < size; i += blockDim.x * gridDim.x) {
if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) {
output[i] = T_OUT((float)(input[i]) * __ldg(input_scale + (i % n)));
}
else {
output[i] = T_OUT((float)(input[i]) * __ldg(input_scale));
}
}
}
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrix(
T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream)
{
dim3 grid(32);
dim3 block(256);
quantizeMatrix<T_OUT, T_IN, quantize_mode><<<grid, block, 0, stream>>>(output, input_scale, input, size, n);
}
#define defineinvokeQuantizeMatrix(type_out, type_in, mode) \
template void invokeQuantizeMatrix<type_out, type_in, mode>(type_out * output, \
float const* input_scale, \
type_in const* input, \
uint32_t size, \
uint32_t n, \
cudaStream_t stream);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR);
#ifdef ENABLE_BF16
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR);
#endif
template<typename T_OUT, typename T_IN, typename T_FAKE>
__global__ void fakeQuantize(T_OUT* dst, const T_IN* src, const int size)
{
for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
T_FAKE tmp = (T_FAKE)((float)src[tid]);
dst[tid] = (T_OUT)((float)tmp);
}
}
template<typename T_OUT, typename T_IN, typename T_FAKE>
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream)
{
fakeQuantize<T_OUT, T_IN, T_FAKE><<<256, 256, 0, stream>>>(dst, src, size);
}
template void
invokeFakeQuantize<float, float, __nv_fp8_e4m3>(float* dst, const float* src, const int size, cudaStream_t stream);
template void
invokeFakeQuantize<half, half, __nv_fp8_e4m3>(half* dst, const half* src, const int size, cudaStream_t stream);
template void invokeFakeQuantize<__nv_bfloat16, __nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* dst,
const __nv_bfloat16* src,
const int size,
cudaStream_t stream);
template<typename T_W>
__global__ void computeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n)
{
float max = -10000.f;
for (int i = 0; i < k; i++) {
float val = fabs((float)weights[i * n + blockIdx.x * blockDim.x + threadIdx.x]);
max = max > val ? max : val;
if (threadIdx.x == 0 && blockIdx.x == 0 && i % 100 == 0) {
printf("max: %f, val: %f \n", max, val);
}
}
// quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
// quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = FP8_E4M3_MAX / max;
quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = std::max(max / FP8_E4M3_MAX, 1.0f / 32.f);
}
template<typename T_W>
void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = (n + 255) / 256;
computeFP8QuantizeScale<T_W><<<grid, block, 0, stream>>>(quant_ptr, weights, k, n);
}
#ifdef ENABLE_BF16
template void invokeComputeFP8QuantizeScale(
float* quant_ptr, const __nv_bfloat16* weights, const int k, const int n, cudaStream_t stream);
#endif
template void
invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);
#endif // ENABLE_FP8
} // namespace fastertransformer
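// Illustrative sketch (not part of the original file): host-side wrapper that quantizes
// an FP16 buffer to FP8 E4M3 with a single per-tensor scale, relying on the
// <__nv_fp8_e4m3, half, PER_TENSOR> instantiation above. All pointers are assumed to be
// device memory and the names are placeholders.
#ifdef ENABLE_FP8
inline void example_quantize_fp16_to_fp8(__nv_fp8_e4m3* d_out,
                                         const half*    d_in,
                                         const float*   d_scale,  // one float on the device
                                         uint32_t       size,
                                         cudaStream_t   stream)
{
    fastertransformer::invokeQuantizeMatrix<__nv_fp8_e4m3, half, fastertransformer::QUANTIZE_MODE::PER_TENSOR>(
        d_out, d_scale, d_in, size, /* n = */ size, stream);
}
#endif  // ENABLE_FP8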
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <stdint.h>
// #define FP8_MHA
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900
#define FUSE_GEMM_ACT
#endif
#define FP8_GEMM_OUTPUT_QUANT_DISABLE
#ifdef FUSE_GEMM_ACT
#define USE_QGMMA
#endif
namespace fastertransformer {
const float FP8_E4M3_MAX = 480.0f;
enum QUANTIZE_MODE {
PER_CHANNEL,
PER_TENSOR,
PER_CHANNEL_WEIGHT_PER_TENSOR_ACT
};
// Packed Data Type
typedef struct __CUDA_ALIGN__(32) {
float array[8];
} float8;
typedef struct __CUDA_ALIGN__(16) {
half array[8];
} half8;
#ifdef ENABLE_BF16
typedef struct __CUDA_ALIGN__(4) {
__nv_bfloat16 array[2];
} __nv_bfloat16_2;
typedef struct __CUDA_ALIGN__(8) {
__nv_bfloat162 x, y;
} __nv_bfloat162_2_xy;
typedef struct __CUDA_ALIGN__(8) {
__nv_bfloat16 array[4];
} __nv_bfloat164;
typedef struct __CUDA_ALIGN__(8) {
__nv_bfloat162 array[2];
} __nv_bfloat162_2;
typedef struct __CUDA_ALIGN__(16) {
__nv_bfloat16 array[8];
} __nv_bfloat168;
typedef struct __CUDA_ALIGN__(16) {
__nv_bfloat162 array[4];
} __nv_bfloat162_4;
typedef struct __CUDA_ALIGN__(32) {
__nv_bfloat16 array[16];
} __nv_bfloat1616;
#endif
#ifdef ENABLE_FP8
typedef struct __CUDA_ALIGN__(2) {
__nv_fp8_e4m3 array[2];
} __nv_fp8_2_e4m3;
typedef struct __CUDA_ALIGN__(4) {
__nv_fp8_e4m3 array[4];
} __nv_fp8_4_e4m3;
typedef struct __CUDA_ALIGN__(4) {
__nv_fp8x2_e4m3 array[2];
} __nv_fp8x2_x2_e4m3;
typedef struct __CUDA_ALIGN__(8) {
__nv_fp8_e4m3 array[8];
} __nv_fp8_8_e4m3;
typedef struct __CUDA_ALIGN__(8) {
__nv_fp8x2_e4m3 array[4];
} __nv_fp8x2_x4_e4m3;
typedef struct __CUDA_ALIGN__(16) {
__nv_fp8_e4m3 array[16];
} __nv_fp8x16_e4m3;
#endif
// only BF16 and FP8
template<typename T, int PACK_SIZE>
struct PackType {
using type = float;
};
#ifdef ENABLE_BF16
template<>
struct PackType<__nv_bfloat16, 2> {
using type = __nv_bfloat16_2;
};
template<>
struct PackType<__nv_bfloat16, 4> {
using type = __nv_bfloat164;
};
template<>
struct PackType<__nv_bfloat16, 8> {
using type = __nv_bfloat168;
};
#endif
#ifdef ENABLE_FP8
template<>
struct PackType<__nv_fp8_e4m3, 2> {
using type = __nv_fp8_2_e4m3;
};
template<>
struct PackType<__nv_fp8_e4m3, 4> {
using type = __nv_fp8_4_e4m3;
};
template<>
struct PackType<__nv_fp8_e4m3, 8> {
using type = __nv_fp8_8_e4m3;
};
#endif
__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, const __nv_fp8x4_e4m3* in)
{
const char4 tmp_val = reinterpret_cast<const char4*>(in)[0];
*out1 = __nv_bfloat162((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
*out2 = __nv_bfloat162((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.z)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.w)[0]);
}
__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(const __nv_fp8x2_e4m3* in)
{
const char2 tmp_val = reinterpret_cast<const char2*>(in)[0];
__nv_bfloat162 out = __nv_bfloat162((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
return out;
}
__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, const __nv_fp8x4_e4m3* in)
{
const char4 tmp_val = reinterpret_cast<const char4*>(in)[0];
*out1 = half2((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
*out2 = half2((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.z)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.w)[0]);
}
__inline__ __device__ half2 fp8x2_e4m3_to_half2(const __nv_fp8x2_e4m3* in)
{
const char2 tmp_val = reinterpret_cast<const char2*>(in)[0];
half2 out = half2((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
return out;
}
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrix(
T_OUT* output, float const* input_qua_amax_ptr, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream);
template<typename T_OUT, typename T_IN, typename T_FAKE>
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream);
template<typename T_W>
void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream);
} // namespace fastertransformer
#endif // ENABLE_FP8
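// Illustrative sketch (not part of the original header): unpacking two FP8 E4M3 values
// into a half2 inside a kernel via the helper above. Names are placeholders; n2 is the
// number of __nv_fp8x2_e4m3 elements.
#ifdef ENABLE_FP8
__global__ void example_fp8x2_to_half2(half2* out, const __nv_fp8x2_e4m3* in, int n2)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) {
        out[i] = fastertransformer::fp8x2_e4m3_to_half2(&in[i]);
    }
}
#endif  // ENABLE_FP8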
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
namespace fastertransformer {
template<typename T>
inline __device__ T ldg(const T* val) {
return __ldg(val);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162* val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return val[0];
#else
return __ldg(val);
#endif
}
template<>
inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16* val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return val[0];
#else
return __ldg(val);
#endif
}
#endif // ENABLE_BF16
// Get type2 from type or vice versa (applied to half and bfloat16)
template<typename T>
struct TypeConverter {using Type = half2;}; // keep for generality
template<>
struct TypeConverter<half2> {using Type = half;};
template<>
struct TypeConverter<half> {using Type = half2;};
#if ENABLE_BF16
template<>
struct TypeConverter<__nv_bfloat162> {using Type = __nv_bfloat16;};
template<>
struct TypeConverter<__nv_bfloat16> {using Type = __nv_bfloat162;};
#endif // ENABLE_BF16
// Math operations (bfloat16 falls back to fp32 where native instructions are unavailable)
template<typename T>
inline __device__ T hadd2(T a, T b) {
return __hadd2(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hadd2(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hadd2(a, b);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T add(T a, T b) {
return a + b;
}
template<>
inline __device__ half2 add(half2 a, half2 b) {
return __hadd2(a, b);
}
template<>
inline __device__ half add(half a, half b) {
return __hadd(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hadd2(a, b);
}
template<>
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
return bf16hadd(a, b);
}
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, float b) {
return bf16hadd(a, __float2bfloat16(b));
}
#endif // ENABLE_BF16
// three-operand addition
template<typename T>
inline __device__ T add(T a, T b, T c) {
return a + b + c;
}
#if ENABLE_BF16
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
return bf16hadd(a, b, c);
}
inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hadd2(a, b, c);
}
#endif // ENABLE_BF16
// applies to all 4 values addition
template<typename T>
inline __device__ T add(T a, T b, T c, T d) {
return (T)((float)a + (float)b + (float)c + (float)d);
}
#if ENABLE_BF16
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
return bf16hadd(a, b, c, d);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hsub2(T a, T b) {
return __hsub2(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hsub2(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hsub2(a, b);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hmul2(T a, T b) {
return __hmul2(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hmul2(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hmul2(a, b);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hmul2(T a, T b, T c) {
return a * b * c;
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hmul2(a, b, c);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T mul(T a, T b, T c) {
return a * b * c;
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
return bf16hmul(a, b, c);
}
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hmul2(a, b, c);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T fma(T a, T b, T c, T d) {
return a * b * c + d;
}
#if ENABLE_BF16
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
return bf16hfma2(a, b, c, d);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T fma(T a, T b, T c) {
return a * b + c;
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hfma2(a, b, c);
}
template<>
inline __device__ __nv_bfloat16 fma(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
return bf16hfma(a, b, c);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hexp2(T a) {
return h2exp(a);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hexp2(__nv_bfloat162 a) {
return bf16exp2(a);
}
#endif // ENABLE_BF16
template<typename T_OUT, typename T_IN> __device__ inline T_OUT cuda_cast(T_IN val) { return val; }
template<> __device__ inline float2 cuda_cast<float2, int2>(int2 val) { return make_float2(val.x, val.y); }
template<> __device__ inline float2 cuda_cast<float2, float>(float val) { return make_float2(val, val); }
template<> __device__ inline float2 cuda_cast<float2, half2>(half2 val) { return __half22float2(val); }
template<> __device__ inline half2 cuda_cast<half2, float2>(float2 val) { return __float22half2_rn(val); }
template<> __device__ inline half2 cuda_cast<half2, float>(float val) { return __float2half2_rn(val); }
template<> __device__ inline half2 cuda_cast<half2, half>(half val) { return __half2half2(val); }
template<> __device__ inline int8_t cuda_cast<int8_t, half>(half val) {
union { int8_t int8[2]; int16_t int16; };
union { half fp16; int16_t int16_in; };
fp16 = val;
asm volatile ("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
return int8[0];
}
template<> __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val) {
union { int8_t int8[2]; int16_t int16; };
int8[0] = cuda_cast<int8_t>(val.x);
int8[1] = cuda_cast<int8_t>(val.y);
return int16;
}
template<> __device__ inline int8_t cuda_cast<int8_t, float>(float val) {
union { int8_t int8[2]; int16_t int16; };
asm volatile ("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
return int8[0];
}
template<> __device__ inline int16_t cuda_cast<int16_t, float2>(float2 val) {
union { int8_t int8[2]; int16_t int16; };
int8[0] = cuda_cast<int8_t>(val.x);
int8[1] = cuda_cast<int8_t>(val.y);
return int16;
}
template<> __device__ inline half2 cuda_cast<half2, int16_t>(int16_t val) {
union { int8_t int8[2]; int16_t int16; };
int16 = val;
return make_half2(int8[0], int8[1]);
}
template<> __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
union { int8_t int8[2]; int16_t int16; };
int16 = val;
return make_float2(int8[0], int8[1]);
}
#ifdef ENABLE_BF16
template<> __device__ inline __nv_bfloat16 cuda_cast(int32_t val) { return static_cast<float>(val); }
template<> __device__ inline __nv_bfloat16 cuda_cast(int8_t val) { return static_cast<float>(val); }
template<> __device__ inline int8_t cuda_cast(__nv_bfloat16 val) { return static_cast<float>(val); }
template<>
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) { return __bfloat162float(val); }
template<> __device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) { return bf1622float2(val); }
template<> __device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) { return __float2half(__bfloat162float(val)); }
template<> __device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) { return bf1622int16(val); }
template<> __device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) { return __float2bfloat16(val); }
template<> __device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) { return __float2bfloat16(__half2float(val)); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) { return bf162bf162(val); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) { return __float2bfloat162_rn(val); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) { return float22bf162(val); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) {
union { int8_t int8[2]; int16_t int16; };
int16 = val;
__nv_bfloat162 res;
res.x = cuda_cast<__nv_bfloat16>(int8[0]);
res.y = cuda_cast<__nv_bfloat16>(int8[1]);
return res;
}
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) { return float22bf162(__half22float2(val)); }
#endif // ENABLE_BF16
template<typename T> __device__ inline T cuda_abs(T val);
template<> __device__ inline float cuda_abs(float val) { return fabs(val); }
template<> __device__ inline half cuda_abs(half val) { return __habs(val); }
template<> __device__ inline half2 cuda_abs(half2 val) { return __habs2(val); }
#ifdef ENABLE_BF16
#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
template<> __device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { return __habs(val); }
template<> __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return __habs2(val); }
#else
template<> __device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { return fabs(val); }
template<> __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return make_bfloat162(fabs(val.x), fabs(val.y)); }
#endif
#endif // ENABLE_BF16
// Unary maximum: compute the max of a vector type
template<typename To, typename Ti> __device__ inline To cuda_max(Ti val)
{
return cuda_cast<To>(val);
};
template<> __device__ inline half cuda_max(half2 val) { return (val.x > val.y) ? val.x : val.y; }
#ifdef ENABLE_BF16
template<> __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) { return (val.x > val.y) ? val.x : val.y; }
#endif
// Binary maximum: compute the max of two scalar types
template<typename T> __device__ inline T cuda_max(T val1, T val2) { return (val1 > val2) ? val1 : val2; }
#ifdef ENABLE_FP8
template<> __device__ inline float2 cuda_cast<float2, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val) { return bf1622float2(fp8x2_e4m3_to_bfloat2(&val)); }
template<> __device__ inline __nv_fp8x2_e4m3 cuda_cast<__nv_fp8x2_e4m3, float2>(float2 val) { return __nv_fp8x2_e4m3(bf1622float2(float22bf162(val))); }
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, half>(half val) { return __nv_fp8_e4m3(val); }
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, __nv_bfloat16>(__nv_bfloat16 val) { return __nv_fp8_e4m3(val); }
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, float>(float val) { return __nv_fp8_e4m3(val); }
template<> __device__ inline float cuda_cast<float, __nv_fp8_e4m3>(__nv_fp8_e4m3 val) { return (float)val; }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val) { return fp8x2_e4m3_to_bfloat2(&val); }
template<> __device__ inline int8_t cuda_cast<int8_t, __nv_fp8_e4m3>(__nv_fp8_e4m3 val)
{
// no impl
return 0;
}
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
{
return cuda_cast<__nv_fp8_e4m3>(cuda_cast<__nv_bfloat16>(cuda_cast<float>(val)));
}
#endif // ENABLE_FP8
}  // namespace fastertransformer
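// Illustrative sketch (not part of the original header): a tiny kernel that uses the
// generic helpers above (cuda_abs and cuda_cast) so one kernel body covers float, half
// and, when enabled, __nv_bfloat16. The kernel name is a placeholder.
template<typename T>
__global__ void example_abs_to_float(float* out, const T* in, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
        out[i] = fastertransformer::cuda_cast<float>(fastertransformer::cuda_abs(in[i]));
    }
}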
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
namespace fastertransformer {
/* **************************** debug tools ********************************* */
template<typename T>
void print_to_file(const T* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode)
{
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
printf("[INFO] file: %s with size %d.\n", file, size);
std::ofstream outFile(file, open_mode);
if (outFile) {
T* tmp = new T[size];
check_cuda_error(cudaMemcpyAsync(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost, stream));
for (int i = 0; i < size; ++i) {
float val = (float)(tmp[i]);
outFile << val << std::endl;
}
delete[] tmp;
}
else {
throw std::runtime_error(std::string("[FT][ERROR] Cannot open file: ") + file + "\n");
}
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
}
template void
print_to_file(const float* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode);
template void
print_to_file(const half* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode);
#ifdef ENABLE_BF16
template void print_to_file(
const __nv_bfloat16* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode);
#endif
template<typename T>
void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name)
{
if (buf == nullptr) {
FT_LOG_WARNING("It is an nullptr, skip!");
return;
}
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
T* h_tmp = new T[size];
cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream);
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
double sum = 0.0f;
uint64_t zero_count = 0;
float max_val = -1e10;
bool find_inf = false;
for (uint i = 0; i < size; i++) {
if (std::isinf((float)(h_tmp[i]))) {
find_inf = true;
continue;
}
sum += abs((double)h_tmp[i]);
if ((float)h_tmp[i] == 0.0f) {
zero_count++;
}
max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i]));
}
printf("[INFO][FT] %20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s",
name.c_str(),
size,
sum / size,
sum,
max_val,
find_inf ? "true" : "false");
std::cout << std::endl;
delete[] h_tmp;
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
}
template void print_abs_mean(const float* buf, uint size, cudaStream_t stream, std::string name);
template void print_abs_mean(const half* buf, uint size, cudaStream_t stream, std::string name);
#ifdef ENABLE_BF16
template void print_abs_mean(const __nv_bfloat16* buf, uint size, cudaStream_t stream, std::string name);
#endif
template void print_abs_mean(const int* buf, uint size, cudaStream_t stream, std::string name);
template void print_abs_mean(const uint* buf, uint size, cudaStream_t stream, std::string name);
template void print_abs_mean(const int8_t* buf, uint size, cudaStream_t stream, std::string name);
#ifdef ENABLE_FP8
template void print_abs_mean(const __nv_fp8_e4m3* buf, uint size, cudaStream_t stream, std::string name);
#endif
template<typename T>
void print_to_screen(const T* result, const int size)
{
if (result == nullptr) {
FT_LOG_WARNING("It is an nullptr, skip! \n");
return;
}
T* tmp = reinterpret_cast<T*>(malloc(sizeof(T) * size));
check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost));
for (int i = 0; i < size; ++i) {
printf("%d, %f\n", i, static_cast<float>(tmp[i]));
}
free(tmp);
}
template void print_to_screen(const float* result, const int size);
template void print_to_screen(const half* result, const int size);
#ifdef ENABLE_BF16
template void print_to_screen(const __nv_bfloat16* result, const int size);
#endif
template void print_to_screen(const int* result, const int size);
template void print_to_screen(const uint* result, const int size);
template void print_to_screen(const bool* result, const int size);
#ifdef ENABLE_FP8
template void print_to_screen(const __nv_fp8_e4m3* result, const int size);
#endif
template<typename T>
void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%7.3f ", (float)tmp[ii * stride + jj]);
}
else {
printf("%7d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
template void printMatrix(float* ptr, int m, int k, int stride, bool is_device_ptr);
template void printMatrix(half* ptr, int m, int k, int stride, bool is_device_ptr);
#ifdef ENABLE_BF16
template void printMatrix(__nv_bfloat16* ptr, int m, int k, int stride, bool is_device_ptr);
#endif
void printMatrix(unsigned long long* ptr, int m, int k, int stride, bool is_device_ptr)
{
typedef unsigned long long T;
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%4llu ", tmp[ii * stride + jj]);
}
else {
printf("%4d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr)
{
typedef int T;
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%4d ", tmp[ii * stride + jj]);
}
else {
printf("%4d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr)
{
typedef size_t T;
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%4ld ", tmp[ii * stride + jj]);
}
else {
printf("%4d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
template<typename T>
void check_max_val(const T* result, const int size)
{
T* tmp = new T[size];
cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost);
float max_val = -100000;
for (int i = 0; i < size; i++) {
float val = static_cast<float>(tmp[i]);
if (val > max_val) {
max_val = val;
}
}
delete[] tmp;
printf("[INFO][CUDA] addr %p max val: %f \n", result, max_val);
}
template void check_max_val(const float* result, const int size);
template void check_max_val(const half* result, const int size);
#ifdef ENABLE_BF16
template void check_max_val(const __nv_bfloat16* result, const int size);
#endif
template<typename T>
void check_abs_mean_val(const T* result, const int size)
{
T* tmp = new T[size];
cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost);
float sum = 0.0f;
for (int i = 0; i < size; i++) {
sum += abs(static_cast<float>(tmp[i]));
}
delete[] tmp;
printf("[INFO][CUDA] addr %p abs mean val: %f \n", result, sum / size);
}
template void check_abs_mean_val(const float* result, const int size);
template void check_abs_mean_val(const half* result, const int size);
#ifdef ENABLE_BF16
template void check_abs_mean_val(const __nv_bfloat16* result, const int size);
#endif
/* ***************************** common utils ****************************** */
cudaError_t getSetDevice(int i_device, int* o_device)
{
int current_dev_id = 0;
cudaError_t err = cudaSuccess;
if (o_device != NULL) {
err = cudaGetDevice(&current_dev_id);
if (err != cudaSuccess) {
return err;
}
if (current_dev_id == i_device) {
*o_device = i_device;
}
else {
err = cudaSetDevice(i_device);
if (err != cudaSuccess) {
return err;
}
*o_device = current_dev_id;
}
}
else {
err = cudaSetDevice(i_device);
if (err != cudaSuccess) {
return err;
}
}
return cudaSuccess;
}
FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
{
FtCudaDataType model_file_type;
INIReader reader = INIReader(ini_file);
if (reader.ParseError() < 0) {
FT_LOG_WARNING("Can't load %s. Use FP32 as default", ini_file.c_str());
model_file_type = FtCudaDataType::FP32;
}
else {
std::string weight_data_type_str = std::string(reader.Get(section_name, "weight_data_type"));
if (weight_data_type_str.find("fp32") != std::string::npos) {
model_file_type = FtCudaDataType::FP32;
}
else if (weight_data_type_str.find("fp16") != std::string::npos) {
model_file_type = FtCudaDataType::FP16;
}
else if (weight_data_type_str.find("bf16") != std::string::npos) {
model_file_type = FtCudaDataType::BF16;
}
else {
FT_LOG_WARNING("Invalid type %s. Use FP32 as default", weight_data_type_str.c_str());
model_file_type = FtCudaDataType::FP32;
}
}
return model_file_type;
}
/* ************************** end of common utils ************************** */
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "3rdparty/INIReader.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/logger.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#ifdef SPARSITY_ENABLED
#include <cusparseLt.h>
#endif
namespace fastertransformer {
#define MAX_CONFIG_NUM 20
#define COL32_ 32
// workspace for cublas gemm : 32MB
#define CUBLAS_WORKSPACE_SIZE 33554432
typedef struct __align__(4)
{
half x, y, z, w;
}
half4;
/* **************************** type definition ***************************** */
enum CublasDataType {
FLOAT_DATATYPE = 0,
HALF_DATATYPE = 1,
BFLOAT16_DATATYPE = 2,
INT8_DATATYPE = 3,
FP8_DATATYPE = 4
};
enum FtCudaDataType {
FP32 = 0,
FP16 = 1,
BF16 = 2,
INT8 = 3,
FP8 = 4
};
enum class OperationType {
FP32,
FP16,
BF16,
INT8,
FP8
};
/* **************************** debug tools ********************************* */
static const char* _cudaGetErrorEnum(cudaError_t error)
{
return cudaGetErrorString(error);
}
static const char* _cudaGetErrorEnum(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
template<typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " "
+ file + ":" + std::to_string(line) + " \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
#define check_cuda_error_2(val, file, line) check((val), #val, file, line)
inline void syncAndCheck(const char* const file, int const line)
{
// When FT_DEBUG_LEVEL=DEBUG, must check error
static char* level_name = std::getenv("FT_DEBUG_LEVEL");
if (level_name != nullptr) {
static std::string level = std::string(level_name);
if (level == "DEBUG") {
cudaDeviceSynchronize();
cudaError_t result = cudaGetLastError();
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result))
+ " " + file + ":" + std::to_string(line) + " \n");
}
FT_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line));
}
}
#ifndef NDEBUG
cudaDeviceSynchronize();
cudaError_t result = cudaGetLastError();
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " "
+ file + ":" + std::to_string(line) + " \n");
}
#endif
}
#define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__)
#define checkCUDNN(expression) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "Error on file " << __FILE__ << " line " << __LINE__ << ": " << cudnnGetErrorString(status) \
<< std::endl; \
std::exit(EXIT_FAILURE); \
} \
}
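// Illustrative sketch (not part of the original header): the intended usage pattern of the
// macros above. check_cuda_error throws std::runtime_error on any non-success status, and
// sync_check_cuda_error adds a device synchronization when FT_DEBUG_LEVEL=DEBUG or in
// non-NDEBUG builds. The function and parameter names are placeholders.
inline void exampleCheckedCopy(void* dst, const void* src, size_t bytes, cudaStream_t stream)
{
    check_cuda_error(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream));
    sync_check_cuda_error();
}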
template<typename T>
void print_to_file(const T* result,
const int size,
const char* file,
cudaStream_t stream = 0,
std::ios::openmode open_mode = std::ios::out);
template<typename T>
void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name = "");
template<typename T>
void print_to_screen(const T* result, const int size);
template<typename T>
void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr);
void printMatrix(unsigned long long* ptr, int m, int k, int stride, bool is_device_ptr);
void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr);
void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr);
template<typename T>
void check_max_val(const T* result, const int size);
template<typename T>
void check_abs_mean_val(const T* result, const int size);
#define PRINT_FUNC_NAME_() \
do { \
std::cout << "[FT][CALL] " << __FUNCTION__ << " " << std::endl; \
} while (0)
[[noreturn]] inline void throwRuntimeError(const char* const file, int const line, std::string const& info = "")
{
throw std::runtime_error(std::string("[FT][ERROR] ") + info + " Assertion fail: " + file + ":"
+ std::to_string(line) + " \n");
}
inline void myAssert(bool result, const char* const file, int const line, std::string const& info = "")
{
if (!result) {
throwRuntimeError(file, line, info);
}
}
#define FT_CHECK(val) myAssert(val, __FILE__, __LINE__)
#define FT_CHECK_WITH_INFO(val, info) \
do { \
bool is_valid_val = (val); \
if (!is_valid_val) { \
fastertransformer::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \
} \
} while (0)
#define FT_THROW(info) throwRuntimeError(__FILE__, __LINE__, info)
#ifdef SPARSITY_ENABLED
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
throw std::runtime_error(std::string("[FT][ERROR] CUSPARSE API failed at line ") \
+ std::to_string(__LINE__) + " in file " + __FILE__ + ": " \
+ cusparseGetErrorString(status) + " " + std::to_string(status)); \
} \
}
#endif
/*************Time Handling**************/
class CudaTimer {
private:
cudaEvent_t event_start_;
cudaEvent_t event_stop_;
cudaStream_t stream_;
public:
explicit CudaTimer(cudaStream_t stream = 0)
{
stream_ = stream;
}
void start()
{
check_cuda_error(cudaEventCreate(&event_start_));
check_cuda_error(cudaEventCreate(&event_stop_));
check_cuda_error(cudaEventRecord(event_start_, stream_));
}
float stop()
{
float time;
check_cuda_error(cudaEventRecord(event_stop_, stream_));
check_cuda_error(cudaEventSynchronize(event_stop_));
check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_));
check_cuda_error(cudaEventDestroy(event_start_));
check_cuda_error(cudaEventDestroy(event_stop_));
return time;
}
~CudaTimer() {}
};
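// Illustrative sketch (not part of the original header): timing asynchronous work on a
// stream with the CudaTimer above. `Launch` stands for any callable that enqueues work
// (a kernel launch, a cublas call, ...) on the same stream; both names are placeholders.
template<typename Launch>
inline float exampleTimeOnStream(cudaStream_t stream, Launch&& launch)
{
    CudaTimer timer(stream);
    timer.start();
    launch();
    return timer.stop();  // elapsed milliseconds between the start and stop events
}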
static double diffTime(timeval start, timeval end)
{
return (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
}
/* ***************************** common utils ****************************** */
inline void print_mem_usage(std::string time = "after allocation")
{
size_t free_bytes, total_bytes;
check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes));
float free = static_cast<float>(free_bytes) / 1024.0 / 1024.0 / 1024.0;
float total = static_cast<float>(total_bytes) / 1024.0 / 1024.0 / 1024.0;
float used = total - free;
printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used);
}
inline int getSMVersion()
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
int sm_major = 0;
int sm_minor = 0;
check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
return sm_major * 10 + sm_minor;
}
inline int getMaxSharedMemoryPerBlock()
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
int max_shared_memory_size = 0;
check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, cudaDevAttrMaxSharedMemoryPerBlock, device));
return max_shared_memory_size;
}
inline std::string getDeviceName()
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
cudaDeviceProp props;
check_cuda_error(cudaGetDeviceProperties(&props, device));
return std::string(props.name);
}
inline int div_up(int a, int n)
{
return (a + n - 1) / n;
}
cudaError_t getSetDevice(int i_device, int* o_device = NULL);
inline int getDevice()
{
int current_dev_id = 0;
check_cuda_error(cudaGetDevice(&current_dev_id));
return current_dev_id;
}
inline int getDeviceCount()
{
int count = 0;
check_cuda_error(cudaGetDeviceCount(&count));
return count;
}
template<typename T>
CublasDataType getCublasDataType()
{
if (std::is_same<T, half>::value) {
return HALF_DATATYPE;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
return BFLOAT16_DATATYPE;
}
#endif
else if (std::is_same<T, float>::value) {
return FLOAT_DATATYPE;
}
else {
FT_CHECK(false);
return FLOAT_DATATYPE;
}
}
template<typename T>
cudaDataType_t getCudaDataType()
{
if (std::is_same<T, half>::value) {
return CUDA_R_16F;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
return CUDA_R_16BF;
}
#endif
else if (std::is_same<T, float>::value) {
return CUDA_R_32F;
}
else {
FT_CHECK(false);
return CUDA_R_32F;
}
}
template<CublasDataType T>
struct getTypeFromCudaDataType {
using Type = float;
};
template<>
struct getTypeFromCudaDataType<HALF_DATATYPE> {
using Type = half;
};
#ifdef ENABLE_BF16
template<>
struct getTypeFromCudaDataType<BFLOAT16_DATATYPE> {
using Type = __nv_bfloat16;
};
#endif
FtCudaDataType getModelFileType(std::string ini_file, std::string section_name);
// clang-format off
template<typename T> struct packed_type;
template <> struct packed_type<float> { using type = float; }; // we don't need to pack float by default
template <> struct packed_type<half> { using type = half2; };
#ifdef ENABLE_BF16
template<>
struct packed_type<__nv_bfloat16> {
using type = __nv_bfloat162;
};
#endif
template<typename T> struct num_elems;
template <> struct num_elems<float> { static constexpr int value = 1; };
template <> struct num_elems<float2> { static constexpr int value = 2; };
template <> struct num_elems<float4> { static constexpr int value = 4; };
template <> struct num_elems<half> { static constexpr int value = 1; };
template <> struct num_elems<half2> { static constexpr int value = 2; };
#ifdef ENABLE_BF16
template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; };
template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; };
#endif
template<typename T, int num> struct packed_as;
template<typename T> struct packed_as<T, 1> { using type = T; };
template<> struct packed_as<half, 2> { using type = half2; };
template<> struct packed_as<float, 2> { using type = float2; };
template<> struct packed_as<int8_t, 2> { using type = int16_t; };
template<> struct packed_as<int32_t, 2> { using type = int2; };
template<> struct packed_as<half2, 1> { using type = half; };
#ifdef ENABLE_BF16
template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; };
template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; };
#endif
inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); }
// clang-format on
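// Illustrative compile-time checks (not part of the original source) showing how the
// packing traits above compose; assumes <type_traits> is included.
//
//   static_assert(std::is_same<packed_type<half>::type, half2>::value, "half packs to half2");
//   static_assert(std::is_same<packed_as<float, 2>::type, float2>::value, "two floats pack to float2");
//   static_assert(num_elems<float4>::value == 4, "float4 holds 4 elements");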
template<typename T1, typename T2>
void compareTwoTensor(
const T1* pred, const T2* ref, const int size, const int print_size = 0, const std::string filename = "")
{
T1* h_pred = new T1[size];
T2* h_ref = new T2[size];
check_cuda_error(cudaMemcpy(h_pred, pred, size * sizeof(T1), cudaMemcpyDeviceToHost));
check_cuda_error(cudaMemcpy(h_ref, ref, size * sizeof(T2), cudaMemcpyDeviceToHost));
FILE* fd = nullptr;
if (filename != "") {
fd = fopen(filename.c_str(), "w");
fprintf(fd, "| %10s | %10s | %10s | %10s | \n", "pred", "ref", "abs_diff", "rel_diff(%)");
}
if (print_size > 0) {
FT_LOG_INFO(" id | pred | ref |abs diff | rel diff (%) |");
}
float mean_abs_diff = 0.0f;
float mean_rel_diff = 0.0f;
int count = 0;
for (int i = 0; i < size; i++) {
if (i < print_size) {
FT_LOG_INFO("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |",
i,
(float)h_pred[i],
(float)h_ref[i],
abs((float)h_pred[i] - (float)h_ref[i]),
abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f);
}
if ((float)h_pred[i] == 0) {
continue;
}
count += 1;
mean_abs_diff += abs((float)h_pred[i] - (float)h_ref[i]);
mean_rel_diff += abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f;
if (fd != nullptr) {
fprintf(fd,
"| %10.5f | %10.5f | %10.5f | %11.5f |\n",
(float)h_pred[i],
(float)h_ref[i],
abs((float)h_pred[i] - (float)h_ref[i]),
abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f);
}
}
mean_abs_diff = mean_abs_diff / (float)count;
mean_rel_diff = mean_rel_diff / (float)count;
FT_LOG_INFO("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
if (fd != nullptr) {
fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
fclose(fd);
}
delete[] h_pred;
delete[] h_ref;
}
/* ************************** end of common utils ************************** */
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "custom_ar_comm.h"
namespace fastertransformer {
template<typename T>
CustomAllReduceComm<T>::CustomAllReduceComm(size_t rank_size, size_t rank): rank_size_(rank_size), rank_(rank)
{
param_.barrier_flag = 0;
// NOTE: assume All Reduce happens within the node (DGX A100)
param_.rank = rank_;
param_.local_rank = rank_;
param_.node_id = 0;
}
template<typename T>
CustomAllReduceComm<T>::~CustomAllReduceComm()
{
cudaPointerAttributes comm_buffer_attributes, barrier_attributes;
check_cuda_error(cudaPointerGetAttributes(&comm_buffer_attributes, param_.peer_comm_buffer_ptrs[rank_]));
check_cuda_error(cudaPointerGetAttributes(&barrier_attributes, param_.peer_barrier_ptrs[rank_]));
if (comm_buffer_attributes.type == 2) {
check_cuda_error(cudaFree(param_.peer_comm_buffer_ptrs[rank_]));
}
if (barrier_attributes.type == 2) {
check_cuda_error(cudaFree(param_.peer_barrier_ptrs[rank_]));
}
}
template<typename T>
void CustomAllReduceComm<T>::customAllReduce(size_t elts, cudaStream_t stream)
{
param_.elts_total = elts;
param_.barrier_flag = FLAG(param_.barrier_flag + 1);
invokeOneOrTwoShotAllReduceKernel<T>(param_, stream);
// swap back
output_tensor_->at(0).data = (const void*)tmp_tensor_data_;
}
template<typename T>
void CustomAllReduceComm<T>::allocateAndExchangePeerAccessPointer(
std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms)
{
assert(custom_all_reduce_comms->size() == rank_size_);
assert(rank_ == 0);
// Enable Peer to Peer Access
enableP2P(rank_size_);
for (size_t i = 0; i < rank_size_; i++) {
check_cuda_error(cudaSetDevice(i));
check_cuda_error(cudaMalloc(&(param_.peer_comm_buffer_ptrs[i]), CUSTOM_AR_SIZE_THRESHOLD));
check_cuda_error(
cudaMalloc(&(param_.peer_barrier_ptrs[i]), rank_size_ * (MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)));
check_cuda_error(
cudaMemset(param_.peer_barrier_ptrs[i], 0, rank_size_ * (MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)));
T* current_peer_comm_buffer_ptr = param_.peer_comm_buffer_ptrs[i];
uint32_t* current_peer_barrier_ptr = param_.peer_barrier_ptrs[i];
        // Assume the current comm (rank_ == 0) allocates device memory for all ranks.
for (size_t j = 1; j < rank_size_; j++) {
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(j).get())
->param_.peer_comm_buffer_ptrs[i] = current_peer_comm_buffer_ptr;
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(j).get())->param_.peer_barrier_ptrs[i] =
current_peer_barrier_ptr;
}
}
// Set default local_output_buffer_ptr to local peer_comm_buffer_ptrs
for (size_t i = 0; i < rank_size_; i++) {
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(i).get())->param_.local_output_buffer_ptr =
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(i).get())->param_.peer_comm_buffer_ptrs[i];
}
}
template<typename T>
void CustomAllReduceComm<T>::enableP2P(int ngpus)
{
int peer_access_available = 0;
for (int i = 0; i < ngpus; i++) {
cudaSetDevice(i);
for (int j = 0; j < ngpus; j++) {
if (i == j) {
continue;
}
cudaDeviceCanAccessPeer(&peer_access_available, i, j);
// Custom AR Kernels need DGX A100 NVSWITCH connections
assert(peer_access_available);
cudaDeviceEnablePeerAccess(j, 0);
}
}
}
template<typename T>
bool CustomAllReduceComm<T>::swapInternalBuffer(std::vector<Tensor>* tensor_buffer, size_t elts)
{
    // Check whether the all-reduce element count meets the requirements of the custom kernels.
    // If it does, swap the local comm buffer pointer with the output tensor data pointer
    // to avoid an additional memory copy.
if (rank_size_ > 1 && elts * sizeof(T) <= CUSTOM_AR_SIZE_THRESHOLD) {
tmp_tensor_data_ = (T*)(tensor_buffer->at(0).data);
output_tensor_ = tensor_buffer;
tensor_buffer->at(0).data = param_.peer_comm_buffer_ptrs[rank_];
param_.local_output_buffer_ptr = tmp_tensor_data_;
return true;
}
return false;
}
template<typename T>
void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size)
{
if (enable_custom_all_reduce == 0) {
// don't use custom all reduce kernels, fall back to NCCL
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
return;
}
if (rank_size != RANKS_PER_NODE) {
#ifdef BUILD_MULTI_GPU
if (rank_size > 1) {
FT_LOG_WARNING("Custom All Reduce only supports 8 Ranks currently. Using NCCL as Comm.");
}
#else
FT_CHECK_WITH_INFO(rank_size == 1,
fmtstr("Custom All Reduce only supports 8 Ranks currently, got rank_size %ld. FT needs "
"the NCCL library to communicate among devices but has built without NCCL. "
"Please use the flag -DBUILD_MULTI_GPU=ON when compiling.",
rank_size));
#endif
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
return;
}
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
}
custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
#else
FT_LOG_WARNING("Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm.");
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
#endif
}
// Template instantiation
template class CustomAllReduceComm<uint16_t>;
#ifdef ENABLE_BF16
template class CustomAllReduceComm<__nv_bfloat16>;
#endif
template class CustomAllReduceComm<uint32_t>;
template void
initCustomAllReduceComm<uint16_t>(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
#ifdef ENABLE_BF16
template void
initCustomAllReduceComm<__nv_bfloat16>(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
#endif
template void
initCustomAllReduceComm<uint32_t>(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "src/fastertransformer/kernels/custom_ar_kernels.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
namespace fastertransformer {
class AbstractCustomComm {
public:
AbstractCustomComm() = default;
virtual ~AbstractCustomComm() = default;
virtual void customAllReduce(size_t elts, cudaStream_t stream) = 0;
virtual void enableP2P(int ngpus) = 0;
virtual bool swapInternalBuffer(std::vector<Tensor>* tensor_buffer, size_t elts) = 0;
virtual void
allocateAndExchangePeerAccessPointer(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms) = 0;
};
template<typename T>
class CustomAllReduceComm: public AbstractCustomComm {
public:
CustomAllReduceComm(size_t rank_size, size_t rank);
~CustomAllReduceComm();
    void customAllReduce(size_t elts, cudaStream_t stream) override;
void allocateAndExchangePeerAccessPointer(
std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms) override;
bool swapInternalBuffer(std::vector<Tensor>* tensor_buffer, size_t elts) override;
void enableP2P(int ngpus) override;
private:
AllReduceParams<T> param_;
std::vector<Tensor>* output_tensor_;
T* tmp_tensor_data_;
size_t rank_size_;
size_t rank_;
};
template<typename T>
void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
template<typename T>
struct CustomARCommTypeConverter {
using Type = uint32_t;
};
template<>
struct CustomARCommTypeConverter<half> {
using Type = uint16_t;
};
#ifdef ENABLE_BF16
template<>
struct CustomARCommTypeConverter<__nv_bfloat16> {
using Type = __nv_bfloat16;
};
#endif
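// Illustrative setup sketch (not part of the original source): CustomARCommTypeConverter
// maps the model compute type to the communication element type used by
// CustomAllReduceComm (half -> uint16_t, other types -> uint32_t by default).
// `tp_size`, `rank`, `tensors`, `elts` and `stream` are assumed to exist elsewhere.
//
//   using CommT = CustomARCommTypeConverter<half>::Type;  // uint16_t
//   std::vector<std::shared_ptr<AbstractCustomComm>> comms;
//   initCustomAllReduceComm<CommT>(&comms, /*enable_custom_all_reduce=*/1, tp_size);
//   // Per rank: swap the output buffer in, run the kernel; buffers are swapped back internally.
//   if (comms[rank] != nullptr && comms[rank]->swapInternalBuffer(&tensors, elts)) {
//       comms[rank]->customAllReduce(elts, stream);
//   }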
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm.h"
namespace fastertransformer {
/* ***************************** GEMM Impl ******************************** */
Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file)
{
allocator_ = allocator;
stream_ = stream;
mutex_ = new std::mutex(); // mutex per process
check_cuda_error(cublasCreate(&cublas_handle_));
check_cuda_error(cublasLtCreate(&cublaslt_handle_));
check_cuda_error(cublasSetStream(cublas_handle_, stream));
if (allocator_ != nullptr) {
workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE);
}
loadGemmConfig(config_file);
}
Gemm::~Gemm()
{
if (allocator_ != nullptr) {
allocator_->free((void**)(&workspace_));
allocator_ = nullptr;
}
cublasLtDestroy(cublaslt_handle_);
cublasDestroy(cublas_handle_);
delete cublas_algo_map_;
delete mutex_;
}
std::string Gemm::toString()
{
const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* compute_type_str = compute_type_ == TYPE_FP16 ? "FP16" : "FP32";
return fmtstr(
"Gemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]", a_type_str, b_type_str, c_type_str, compute_type_str);
}
void Gemm::setAllocator(IAllocator* allocator)
{
if (allocator_ != nullptr && workspace_ != nullptr) {
allocator_->free((void**)(&workspace_));
}
allocator_ = allocator;
if (allocator_ != nullptr) {
workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE);
}
}
void Gemm::setCudaStream(cudaStream_t& stream)
{
stream_ = stream;
cublasSetStream(cublas_handle_, stream);
}
void Gemm::setComputeType(DataType compute_type)
{
checkDataTypeValidity(compute_type);
compute_type_ = compute_type;
}
void Gemm::setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type)
{
checkDataTypeValidity(a_type);
checkDataTypeValidity(b_type);
checkDataTypeValidity(c_type);
a_type_ = a_type;
b_type_ = b_type;
c_type_ = c_type;
setComputeType(compute_type);
}
template<typename T>
void Gemm::setDefaultTypes()
{
if (std::is_same<T, float>::value) {
setTypes(TYPE_FP32, TYPE_FP32, TYPE_FP32, TYPE_FP32);
}
else if (std::is_same<T, half>::value) {
setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP16);
}
else {
throw GemmNotSupportedException("Gemm supports float or half type.");
}
}
void Gemm::loadGemmConfig(std::string config_file)
{
if (cublas_algo_map_ != nullptr) {
delete cublas_algo_map_; // unload the previous cublas map.
}
cublas_algo_map_ = new cublasAlgoMap(config_file);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha,
const float beta)
{
gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
(transa == GEMM_OP_N) ? k : m,
(const void*)weight.kernel,
b_type_,
(transb == GEMM_OP_N) ? n : k,
output,
c_type_,
n,
alpha,
beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha,
const float beta)
{
gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
(transa == GEMM_OP_N) ? k : m,
(const void*)weight.kernel,
b_type_,
(transb == GEMM_OP_N) ? n : k,
output,
c_type_,
n,
alpha,
beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const void* B,
const size_t ldb,
void* C,
const size_t ldc,
const float alpha,
const float beta)
{
gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha,
const float beta)
{
FT_LOG_TRACE("Gemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
// Implementation copied from cublasMMWrapper::Gemm
    // Switch A and B since both cublas and cublasLt assume a column-major layout,
    // while A and B are given in a row-major layout.
const void* a_data_ptr = B;
const void* b_data_ptr = A;
cublasOperation_t a_op = getCublasOperation(transb);
cublasOperation_t b_op = getCublasOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
// swap m and n
const size_t _m = n;
const size_t _n = m;
// swap lda and ldb;
const size_t _lda = ldb;
const size_t _ldb = lda;
mutex_->lock();
// Use cublas as default in FP32 and cublasLt as default in FP16
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
bool using_cublasLt = Atype == TYPE_FP16;
int batch_count = 1;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
const void* alpha_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_alpha) : reinterpret_cast<const void*>(&alpha);
const void* beta_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_beta) : reinterpret_cast<const void*>(&beta);
// TODO: unify CUBLAS_DATA_TYPE and DataType.
int findAlgo =
cublas_algo_map_->isExist(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
cublasLtMatmulAlgo_info info =
cublas_algo_map_->getAlgo(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
if (findAlgo) {
using_cublasLt = (info.stages != -1);
}
if (using_cublasLt) {
const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
cublasLtMatmulDesc_t matmul_desc = NULL;
cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
cudaDataType_t scale_type = getCublasDataType(compute_type_);
auto compute_type = getCublasComputeType(compute_type_);
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
#else
cublasLtMatmulDescCreate(&matmul_desc, compute_type);
#endif
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workspace = workspace_;
int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspace_size) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
}
}
cublasLtMatmul(cublaslt_handle_,
matmul_desc,
alpha_ptr,
a_data_ptr,
a_desc,
b_data_ptr,
b_desc,
beta_ptr,
C,
c_desc,
C,
c_desc,
(findAlgo == 1 ? (&algo) : NULL),
workspace,
workspace_size,
stream_);
cublasLtMatmulDescDestroy(matmul_desc);
cublasLtMatrixLayoutDestroy(a_desc);
cublasLtMatrixLayoutDestroy(b_desc);
cublasLtMatrixLayoutDestroy(c_desc);
sync_check_cuda_error();
}
else {
cudaDataType_t compute_type = getCublasDataType(compute_type_);
int cublas_algo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
a_op,
b_op,
_m,
_n,
k,
alpha_ptr,
a_data_ptr,
a_type,
_lda,
b_data_ptr,
b_type,
_ldb,
beta_ptr,
C,
c_type,
ldc,
compute_type,
static_cast<cublasGemmAlgo_t>(cublas_algo)));
sync_check_cuda_error();
}
mutex_->unlock();
}
void Gemm::batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const void* const* B,
void* const* C,
const size_t batch_size,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta);
}
void Gemm::batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const size_t lda,
const void* const* B,
const size_t ldb,
void* const* C,
const size_t ldc,
const size_t batch_size,
const float alpha,
const float beta)
{
batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta);
}
void Gemm::batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const DataType Atype,
const size_t lda,
const void* const* B,
const DataType Btype,
const size_t ldb,
void* const* C,
const DataType Ctype,
const size_t ldc,
const size_t batch_size,
const float alpha,
const float beta)
{
FT_LOG_TRACE(
"Gemm::batchedGemm [b=%ld m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", batch_size, m, n, k, lda, ldb, ldc);
// Switch A and B.
const void* const* a_data_ptr = B;
const void* const* b_data_ptr = A;
cublasOperation_t a_op = getCublasOperation(transb);
cublasOperation_t b_op = getCublasOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
// swap m and n, lda and ldb
const size_t _m = n;
const size_t _n = m;
const size_t _lda = ldb;
const size_t _ldb = lda;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
mutex_->lock();
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
const void* alpha_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_alpha) : reinterpret_cast<const void*>(&alpha);
const void* beta_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_beta) : reinterpret_cast<const void*>(&beta);
cublasLtMatmulAlgo_info info =
cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
check_cuda_error(cublasGemmBatchedEx(cublas_handle_,
a_op,
b_op,
_m,
_n,
k,
alpha_ptr,
a_data_ptr,
a_type,
_lda,
b_data_ptr,
b_type,
_ldb,
beta_ptr,
C,
c_type,
ldc,
batch_size,
getCublasComputeType(compute_type_),
static_cast<cublasGemmAlgo_t>(info.algoId)));
mutex_->unlock();
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const size_t batch_size,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
int64_t stridea = m * k;
int64_t strideb = k * n;
int64_t stridec = m * n;
stridedBatchedGemm(transa,
transb,
m,
n,
k,
A,
a_type_,
lda,
stridea,
B,
b_type_,
ldb,
strideb,
C,
c_type_,
ldc,
stridec,
batch_size,
compute_type_,
alpha,
beta);
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const int64_t strideA,
const void* B,
const int64_t strideB,
void* C,
const int64_t strideC,
const size_t batch_size,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
stridedBatchedGemm(transa,
transb,
m,
n,
k,
A,
a_type_,
lda,
strideA,
B,
b_type_,
ldb,
strideB,
C,
c_type_,
ldc,
strideC,
batch_size,
compute_type_,
alpha,
beta);
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const int64_t strideA,
const void* B,
const size_t ldb,
const int64_t strideB,
void* C,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
const float alpha,
const float beta)
{
stridedBatchedGemm(transa,
transb,
m,
n,
k,
A,
a_type_,
lda,
strideA,
B,
b_type_,
ldb,
strideB,
C,
c_type_,
ldc,
strideC,
batch_size,
compute_type_,
alpha,
beta);
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
DataType Atype,
const size_t lda,
const int64_t strideA,
const void* B,
DataType Btype,
const size_t ldb,
const int64_t strideB,
void* C,
DataType Ctype,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
DataType compute_type,
const float alpha,
const float beta)
{
FT_LOG_TRACE("Gemm::stridedBatchedGemm [b=%ld, m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]",
batch_size,
m,
n,
k,
lda,
ldb,
ldc);
// Switch A and B.
const void* a_data_ptr = B;
const void* b_data_ptr = A;
cublasOperation_t a_op = getCublasOperation(transb);
cublasOperation_t b_op = getCublasOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
// swap m and n, lda and ldb, stride A and B
const size_t _m = n;
const size_t _n = m;
const size_t _lda = ldb;
const size_t _ldb = lda;
const int64_t _stridea = strideB;
const int64_t _strideb = strideA;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
mutex_->lock();
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
const void* alpha_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_alpha) : reinterpret_cast<const void*>(&alpha);
const void* beta_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_beta) : reinterpret_cast<const void*>(&beta);
cublasLtMatmulAlgo_info info =
cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_,
a_op,
b_op,
_m,
_n,
k,
alpha_ptr,
a_data_ptr,
a_type,
_lda,
_stridea,
b_data_ptr,
b_type,
_ldb,
_strideb,
beta_ptr,
C,
c_type,
ldc,
strideC,
batch_size,
getCublasComputeType(compute_type),
static_cast<cublasGemmAlgo_t>(info.algoId)));
mutex_->unlock();
}
void Gemm::checkDataTypeValidity(const DataType& type)
{
if (type != TYPE_FP32 && type != TYPE_FP16) {
throw GemmNotSupportedException("Gemm supports TYPE_FP16 or TYPE_FP32");
}
}
/* ************************* End of GEMM Impl **************************** */
// void Int8Gemm::gemm(Tensor& C,
// const GemmOp transa,
// const GemmOp transb,
// const Tensor& A,
// const Tensor& B,
// const float alpha,
// const float beta)
// {
// }
/* ************************* SpGEMM Impl *********************************** */
#ifdef SPARSITY_ENABLED
SpGemm::SpGemm(IAllocator* allocator, cudaStream_t stream, std::string config_file, std::string spconfig_file):
Gemm(allocator, stream, config_file)
{
CHECK_CUSPARSE(cusparseLtInit(&cusparselt_handle_));
// TODO(jaedeokk):
    //   Let's make cublasAlgoMap load the gemm/spgemm configs separately,
// allowing us to inherit Gemm's constructor.
// cublas_algo_map_.loadSpGemmConfig(spconfig_file); // enable this line later.
a_type_ = TYPE_FP16;
b_type_ = TYPE_FP16;
c_type_ = TYPE_FP16;
compute_type_ = TYPE_FP16;
}
SpGemm::~SpGemm()
{
cusparseLtDestroy(&cusparselt_handle_);
    // Destroy the cached matrix descriptors.
for (auto& kv : a_desc_map_) { // kv = (mark, a_desc)
cusparseLtMatDescriptorDestroy(&a_desc_map_[kv.first]);
}
for (auto& kv : b_desc_map_) { // kv = (mark, b_desc)
cusparseLtMatDescriptorDestroy(&b_desc_map_[kv.first]);
}
for (auto& kv : c_desc_map_) { // kv = (mark, c_desc)
cusparseLtMatDescriptorDestroy(&c_desc_map_[kv.first]);
}
}
std::string SpGemm::toString()
{
const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* compute_type_str = compute_type_ == TYPE_FP16 ? "FP16" : "FP32";
return fmtstr("SpGemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]",
a_type_str,
b_type_str,
c_type_str,
compute_type_str);
}
void SpGemm::loadGemmConfig(std::string config_file, std::string spconfig_file)
{
if (cublas_algo_map_ != nullptr) {
delete cublas_algo_map_; // unload algo map.
}
cublas_algo_map_ = new cublasAlgoMap(config_file, spconfig_file);
}
void SpGemm::checkDataTypeValidity(const DataType& type)
{
if (type != TYPE_FP16) {
throw GemmNotSupportedException("Sparse GEMM only supports FP16 data type now.");
}
}
bool SpGemm::useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k)
{
return !cublas_algo_map_->isUseSparse(batch_size, m, n, k);
}
// Temporary gemm helper method that uses template parameter T.
template<typename T>
void SpGemm::weightGemmHelper(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<T>& weight,
void* output,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
if (useBaseGemm(1, m, n, k) || weight.sp_kernel == nullptr) {
Gemm::gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
lda,
(const void*)weight.kernel,
b_type_,
ldb,
output,
c_type_,
ldc,
alpha,
beta);
}
else {
gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
lda,
(const void*)weight.sp_kernel,
b_type_,
ldb,
output,
c_type_,
ldc,
alpha,
beta);
}
}
void SpGemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha,
const float beta)
{
weightGemmHelper<float>(transa, transb, m, n, k, input, weight, output, alpha, beta);
}
void SpGemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha,
const float beta)
{
weightGemmHelper<half>(transa, transb, m, n, k, input, weight, output, alpha, beta);
}
void SpGemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha,
const float beta)
{
FT_LOG_TRACE("SpGemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
checkDataTypeValidity(Atype);
checkDataTypeValidity(Btype);
checkDataTypeValidity(Ctype);
checkDataTypeValidity(compute_type_);
if (useBaseGemm(1, m, n, k)) {
// Compute by the base GEMM.
Gemm::gemm(transa, transb, m, n, k, A, Atype, lda, B, Btype, ldb, C, Ctype, ldc, alpha, beta);
return;
}
// Switch A/B due to column major layout in computation.
    // The typical use case of the Gemm family is to compute Y = X * W, where X is an
    // input tensor and W is a kernel weight. Compression takes a long time, so only
    // the kernel weight (which is fixed at inference time) can be sparse. Using B as
    // the sparse operand is unfortunately not stable (e.g. caching matrix descriptors
    // does not work correctly). Thus, SpGemm uses a column-major layout in computation,
    // evaluating C^T = B^T * A^T, so that the kernel weight "B" comes first.
const void* a_data = B;
const void* b_data = A;
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = getCusparseOperation(transb);
cusparseOperation_t opB = getCusparseOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
const size_t _m = n;
const size_t _n = m;
const size_t _lda = ldb;
const size_t _ldb = lda;
const size_t a_rows = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? _m : k;
const size_t a_cols = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _m;
const size_t b_rows = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _n;
const size_t b_cols = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? _n : k;
const size_t c_rows = _m;
const size_t c_cols = _n;
const unsigned alignment = 16;
cusparseComputeType compute_type = getCusparseComputeType(compute_type_);
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
char mark[256];
sprintf(mark, "%d_%ld_%ld_%ld_%s_%s", 1, m, n, k, getGemmOpString(transb).c_str(), getGemmOpString(transa).c_str());
if (a_desc_map_.find(mark) != a_desc_map_.end()) {
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&a_desc_map_[mark],
&b_desc_map_[mark],
&c_desc_map_[mark],
&c_desc_map_[mark],
compute_type));
}
else {
// initializing MatDesc takes a lot of time
cusparseLtMatDescriptor_t a_desc, b_desc, c_desc;
a_desc_map_[mark] = a_desc;
b_desc_map_[mark] = b_desc;
c_desc_map_[mark] = c_desc;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&a_desc_map_[mark],
a_rows,
a_cols,
_lda,
alignment,
a_type,
order,
CUSPARSELT_SPARSITY_50_PERCENT));
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &b_desc_map_[mark], b_rows, b_cols, _ldb, alignment, b_type, order));
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &c_desc_map_[mark], c_rows, c_cols, ldc, alignment, c_type, order));
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&a_desc_map_[mark],
&b_desc_map_[mark],
&c_desc_map_[mark],
&c_desc_map_[mark],
compute_type));
}
mutex_->lock();
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT));
int alg = cublas_algo_map_->getSpAlgo(1, a_rows, b_cols, a_cols);
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)));
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size));
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size));
void* d_workspace = nullptr; // Can we use the workspace of the class?
int num_streams = 1;
cudaStream_t streams[1] = {stream_};
CHECK_CUSPARSE(cusparseLtMatmul(
&cusparselt_handle_, &plan, &alpha, a_data, b_data, &beta, C, C, d_workspace, streams, num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
mutex_->unlock();
sync_check_cuda_error();
}
#endif
/* ************************* End of SpGEMM Impl ************************** */
/* ***************************** GEMM utils ****************************** */
std::shared_ptr<Gemm> createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse, bool quantized)
{
FT_LOG_TRACE(
"Create Gemm instance [sparse=%s, quantized=%s]", sparse ? "true" : "false", quantized ? "true" : "false");
std::shared_ptr<Gemm> gemm;
if (!sparse) {
if (!quantized) {
gemm = std::make_shared<Gemm>(allocator, stream);
}
else {
throw GemmNotSupportedException("Int8 Gemm is not supported yet");
}
}
else {
#ifdef SPARSITY_ENABLED
if (sparse && !quantized) {
gemm = std::make_shared<SpGemm>(allocator, stream);
}
else {
throw GemmNotSupportedException("Int8 Sparse Gemm is not supported yet");
}
#else
        throw GemmNotSupportedException("Sparsity support is not enabled. To enable sparsity, "
                                        "please provide the `-DSPARSITY_SUPPORT` flag when compiling.");
#endif
}
return gemm;
}
cudaDataType_t getCublasDataType(DataType dtype)
{
switch (dtype) {
case TYPE_FP16:
return CUDA_R_16F;
case TYPE_FP32:
return CUDA_R_32F;
default:
throw GemmNotSupportedException("Not supported data type.");
}
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
case TYPE_FP16:
return CUBLAS_COMPUTE_16F;
case TYPE_FP32:
return CUBLAS_COMPUTE_32F;
default:
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#else
cudaDataType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
case TYPE_FP16:
return CUDA_R_16F;
case TYPE_FP32:
return CUDA_R_32F;
default:
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#endif
cublasOperation_t getCublasOperation(GemmOp op)
{
switch (op) {
case GEMM_OP_N:
return CUBLAS_OP_N;
case GEMM_OP_T:
return CUBLAS_OP_T;
default:
throw GemmNotSupportedException("Unknown GemmOp provided.");
}
}
std::string getGemmOpString(const GemmOp& op)
{
switch (op) {
case GEMM_OP_T:
return "T";
case GEMM_OP_N:
return "N";
}
throw GemmNotSupportedException("Unknown GemmOp provided.");
}
#ifdef SPARSITY_ENABLED
cusparseOperation_t getCusparseOperation(GemmOp op)
{
switch (op) {
case GEMM_OP_N:
return CUSPARSE_OPERATION_NON_TRANSPOSE;
case GEMM_OP_T:
return CUSPARSE_OPERATION_TRANSPOSE;
default:
throw GemmNotSupportedException("Unknown GemmOp provided.");
}
}
cusparseComputeType getCusparseComputeType(DataType ctype)
{
if (ctype != TYPE_FP16) {
throw GemmNotSupportedException("Sparse GEMM supports TYPE_FP16 compute type only.");
}
return CUSPARSE_COMPUTE_16F;
}
void pruneMatrixB(void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans)
{
FT_LOG_TRACE("Prune matrix B [k=%ld, n=%ld, op=%s]", k, n, getGemmOpString(trans).c_str());
// Due to A/B switching, the matrix B will be used as a matrix A.
const cusparseOrder_t order = CUSPARSE_ORDER_COL;
const size_t rows = (trans == GEMM_OP_N) ? n : k;
const size_t cols = (trans == GEMM_OP_N) ? k : n;
const size_t ld = rows;
const unsigned alignment = 16;
const cusparseLtPruneAlg_t prune_alg = CUSPARSELT_PRUNE_SPMMA_STRIP;
const cusparseOperation_t op = getCusparseOperation(trans);
const cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0.
// 0: B is sparse, 1: A is sparse
// B matrix will be used as A matrix at the SpGemm::gemm.
const int is_sparse_a = 1;
// TODO: Let the resource manager handle GPU-related resources later.
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseLtMatDescriptor_t mat_desc;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &mat_desc, rows, cols, ld, alignment, dtype, order, CUSPARSELT_SPARSITY_50_PERCENT));
CHECK_CUSPARSE(cusparseLtSpMMAPrune2(&handle, &mat_desc, is_sparse_a, op, data, data, prune_alg, stream));
CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc));
CHECK_CUSPARSE(cusparseLtDestroy(&handle));
}
size_t compressMatrixB(void** output,
IAllocator& allocator,
const cudaStream_t& stream,
const void* input,
const size_t k,
const size_t n,
const GemmOp trans)
{
FT_LOG_TRACE("compressMatrix [k=%ld, n=%ld, dtype=FP16]", k, n);
// swap A/B due to column/row major layout mismatch.
cusparseOrder_t order = CUSPARSE_ORDER_COL;
const size_t rows = (trans == GEMM_OP_N) ? n : k;
const size_t cols = (trans == GEMM_OP_N) ? k : n;
const size_t ld = rows;
cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0.
cusparseLtSparsity_t sparsity = CUSPARSELT_SPARSITY_50_PERCENT;
cusparseOperation_t op = getCusparseOperation(trans);
cusparseLtMatDescriptor_t mat_desc;
const unsigned alignment = 16;
const int is_sparse_a = 1; // 0: B is sparse, 1: A is sparse
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
CHECK_CUSPARSE(
cusparseLtStructuredDescriptorInit(&handle, &mat_desc, rows, cols, ld, alignment, dtype, order, sparsity))
size_t compressed_size = 0;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_desc, &compressed_size));
if (compressed_size == 0) {
        throw GemmInvalidException("Failed to compute a valid compressed_size (got 0). This error may be "
                                   "caused by an input matrix that is too small.");
}
*output = allocator.malloc(compressed_size, false);
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_desc, is_sparse_a, op, input, *output, stream))
CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc));
CHECK_CUSPARSE(cusparseLtDestroy(&handle));
return compressed_size;
}
#endif
/* ************************* End of GEMM utils **************************** */
} // end of namespace fastertransformer
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <iostream>
#include <map>
#include <mutex>
#include <stdexcept>
#include <string>
// TODO: Need to remove the dependency of the layer module.
// e.g. refactor Weight class to some base module.
#include "src/fastertransformer/layers/DenseWeight.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
// cuBLAS default workspace size: 32 MB. TODO: make this a Gemm property.
#define WORKSPACE_SIZE 33554432
namespace fastertransformer {
// A wrapper of cublas or cusparse matrix operator.
// - GEMM_OP_N = CUBLAS_OP_N or CUSPARSE_OP_N
// - GEMM_OP_T = CUBLAS_OP_T or CUSPARSE_OP_T
enum GemmOp {
GEMM_OP_N,
GEMM_OP_T
};
// A base class of the GEMM family.
// In the current version, Gemm serves as both a base class and an interface.
class Gemm {
public:
Gemm() = delete; // Disable a default constructor
/**
* A Gemm class.
*
* NOTE:
     *   A, B, and C are assumed to have a row-major layout, while the backend CUDA
     *   libraries assume a column-major layout. The Gemm family already handles this
     *   discrepancy internally, so call it directly, without tricks such as swapping
     *   the inputs A and B to align the matrix layout.
*
* Restriction: Supported in/out data or compute types: TYPE_FP16, TYPE_FP32.
*
* TODO:
* Unify resource allocation/release from a singleton GPU resource managers.
* Thus, allocator, stream can be replaced by a resource handler later.
* E.g. Gemm(std::shared_ptr<ResourceManager> resource_manager), and
* stream_ = resource_manager.getCudaStream();
* buffer = resource_manager.malloc(...);
*
* @param allocator Resource allocator.
* @param stream A CUDA stream.
* @param config_file A file path of a GEMM configuration.
*/
Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file = GEMM_CONFIG);
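    // Illustrative construction sketch (not part of the original source). Assumes an
    // IAllocator* `allocator` and a cudaStream_t `stream` created elsewhere:
    //
    //   Gemm gemm(allocator, stream);   // loads GEMM_CONFIG by default
    //   gemm.setDefaultTypes<half>();   // FP16 data and compute types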
Gemm(Gemm const& other) = delete;
virtual ~Gemm();
virtual std::string toString();
/**
* @brief Set GEMM compute type.
*
* @param compute_type The data type of accumulation type inside GEMM computation.
* (Choices: TYPE_FP16, TYPE_FP32)
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
void setComputeType(DataType compute_type);
/**
* @brief Set matrix data types and compute precision.
*
* Supported data or compute types: TYPE_FP16, TYPE_FP32
*
* @param a_type The data type of a matrix A.
* @param b_type The data type of a matrix B.
* @param c_type The data type of a matrix C.
* @param compute_type The data type of accumulation type inside GEMM computation.
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
void setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type);
/**
* @brief Set matrix data and compute types by default values.
*
* Default configs:
* - T=float : data type=TYPE_FP32, compute type=TYPE_FP32
     * - T=half  : data type=TYPE_FP16, compute type=TYPE_FP16
*/
template<typename T>
void setDefaultTypes();
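    // Example (illustrative, not part of the original source):
    //   gemm.setDefaultTypes<float>();  // TYPE_FP32 data and compute types, as listed above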
void loadGemmConfig(std::string config_file);
void setAllocator(IAllocator* allocator);
void setCudaStream(cudaStream_t& stream);
    // The APIs below show how the interface would change if it cooperated with Tensor.
    // To enable them, we need to update the Tensor class. For instance, `data` needs
    // to be of type (void*) rather than (const void*) so that it can be passed as the
    // output C of gemm.
// virtual void gemm(Tensor& C,
// const GemmOp transa,
// const GemmOp transb,
// const Tensor& A,
// const Tensor& B,
// const float alpha = 1.0f,
// const float beta = 0.0f);
//
// virtual void batchedMatmul(std::vector<Tensor> Carray,
// const GemmOp transa,
// const GemmOp transb,
// const std::vector<Tensor> Aarray,
// const std::vector<Tensor> Barray,
// const float alpha = 1.0f,
// const float beta = 0.0f);
//
// virtual void stridedBatchedGemm(Tensor& C,
// const GemmOp transa,
// const GemmOp transb,
// const Tensor& A,
// const Tensor& B,
// const float alpha = 1.0f,
// const float beta = 0.0f);
// TODO:
    //   This function cooperates with a Weight object to simplify Gemm calls
    //   inside layers, computing the formula
    //      output(C) = input(A) * weight_kernel(B)
    //   where weight_kernel can change depending on the Gemm implementation.
    //   DenseWeight is a template struct, which prevents overriding the method.
    //   We temporarily add an interface here for the two cases float/half, but to
    //   finalize this function we need a weight-class interface that is not a
    //   template class.
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f);
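    // Illustrative call sketch (not part of the original source): `d_in` is an assumed
    // device pointer to a row-major (m x k) input and `w` a DenseWeight<half> whose
    // kernel has shape (k x n); the result is written to `d_out` (m x n):
    //
    //   gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k, d_in, w, d_out);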
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const void* B,
const size_t ldb,
void* C,
const size_t ldc,
const float alpha = 1.0f,
const float beta = 0.0f);
/**
* @brief Compute the matrix multiplication `C = \alpha * op(A) * op(B) + \beta * C`.
*
* @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T).
* @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T).
* @param m A number of rows of a matrix op(A) and C.
* @param n A number of columns of a matrix op(B) or C.
* @param k A number of columns of op(A) and rows of op(B).
* @param A A device pointer of a matrix A of dimension (m x lda).
* @param Atype A data type of A (TYPE_FP16 or TYPE_FP32)
* @param lda A leading dimension of the matrix A.
* @param B A device pointer of a matrix B of dimension (k x ldb).
* @param Btype A data type of B (TYPE_FP16 or TYPE_FP32)
* @param ldb A leading dimension of the matrix B.
* @param C (Output) A device pointer of a matrix C of dimension (m x ldc).
* @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32)
* @param ldc A leading dimension of the matrix C.
* @param alpha A scale factor for A*B (default: 1.0f).
* @param beta A scale factor for C (default: 0.0f).
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha = 1.0f,
const float beta = 0.0f);
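    // Illustrative call sketch (not part of the original source): row-major
    // C[m x n] = A[m x k] * B[k x n] in FP16, with lda = k, ldb = n, ldc = n for
    // non-transposed inputs. `d_A`, `d_B`, `d_C` are assumed device pointers:
    //
    //   gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
    //             d_A, TYPE_FP16, k,
    //             d_B, TYPE_FP16, n,
    //             d_C, TYPE_FP16, n);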
virtual void batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const void* const* B,
void* const* C,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const size_t lda,
const void* const* B,
const size_t ldb,
void* const* C,
const size_t ldc,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
/**
* @brief Compute the matrix multiplication of batch of matrices As and Bs
*
* For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1,
* `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`.
*
* @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T).
* @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T).
* @param m A number of rows of a matrix op(A) and C.
* @param n A number of columns of a matrix op(B) or C.
* @param k A number of columns of op(A) and rows of op(B).
* @param A An array of device pointers of batch of input A matrices.
* @param Atype A data type of A (TYPE_FP16 or TYPE_FP32)
* @param lda A leading dimension of the matrix A.
* @param B An array of device pointers of batch of input B matrices.
* @param Btype A data type of B (TYPE_FP16 or TYPE_FP32)
* @param ldb A leading dimension of the matrix B.
* @param C (Output) An array of device pointers of batch of output C matrices.
* @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32)
* @param ldc A leading dimension of the matrix C.
* @param alpha A scale factor for A*B (default: 1.0f).
* @param beta A scale factor for C (default: 0.0f).
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
virtual void batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const DataType Atype,
const size_t lda,
const void* const* B,
const DataType Btype,
const size_t ldb,
void* const* C,
const DataType Ctype,
const size_t ldc,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
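    // Illustrative call sketch (not part of the original source): `d_Aarray`, `d_Barray`
    // and `d_Carray` are assumed arrays of `batch_size` pointers to row-major device
    // matrices, laid out as required by the cuBLAS batched GEMM backend:
    //
    //   gemm.batchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
    //                    d_Aarray, TYPE_FP16, k,
    //                    d_Barray, TYPE_FP16, n,
    //                    d_Carray, TYPE_FP16, n,
    //                    batch_size);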
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const int64_t strideA,
const void* B,
const int64_t strideB,
void* C,
const int64_t strideC,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const int64_t strideA,
const void* B,
const size_t ldb,
const int64_t strideB,
void* C,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
/**
* @brief Compute the strided matrix multiplication of batch of matrices As and Bs
*
* For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1,
* `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`.
*
* @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T).
* @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T).
* @param m A number of rows of a matrix op(A) and C.
* @param n A number of columns of a matrix op(B) or C.
* @param k A number of columns of op(A) and rows of op(B).
     * @param A A device pointer to the batch of input A matrices, stored contiguously and separated by strideA elements.
* @param Atype A data type of A (TYPE_FP16 or TYPE_FP32)
* @param lda A leading dimension of the matrix A.
* @param strideA An offset in number of elements between matrix A[i] and A[i+1].
     * @param B A device pointer to the batch of input B matrices, stored contiguously and separated by strideB elements.
* @param Btype A data type of B (TYPE_FP16 or TYPE_FP32)
* @param ldb A leading dimension of the matrix B.
* @param strideB An offset in number of elements between matrix B[i] and B[i+1].
     * @param C (Output) A device pointer to the batch of output C matrices, stored contiguously and separated by strideC elements.
* @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32)
* @param ldc A leading dimension of the matrix C.
* @param strideC An offset in number of elements between matrix C[i] and C[i+1].
* @param compute_type An accumulation type of GEMM.
* @param alpha A scale factor for A*B (default: 1.0f).
* @param beta A scale factor for C (default: 0.0f).
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
DataType Atype,
const size_t lda,
const int64_t strideA,
const void* B,
DataType Btype,
const size_t ldb,
const int64_t strideB,
void* C,
DataType Ctype,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
DataType compute_type,
const float alpha = 1.0f,
const float beta = 0.0f);
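    // Illustrative call sketch (not part of the original source): `batch_size` row-major
    // GEMMs over contiguously packed matrices, so strideA = m*k, strideB = k*n and
    // strideC = m*n (matching the convenience overload above). `d_A`, `d_B`, `d_C` are
    // assumed device pointers to the packed batches:
    //
    //   gemm.stridedBatchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
    //                           d_A, TYPE_FP16, k, (int64_t)m * k,
    //                           d_B, TYPE_FP16, n, (int64_t)k * n,
    //                           d_C, TYPE_FP16, n, (int64_t)m * n,
    //                           batch_size, TYPE_FP16);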
protected:
IAllocator* allocator_ = nullptr;
cudaStream_t stream_;
std::mutex* mutex_ = nullptr;
cublasAlgoMap* cublas_algo_map_ = nullptr;
cublasHandle_t cublas_handle_;
cublasLtHandle_t cublaslt_handle_;
void* workspace_ = nullptr;
// use FP32 as default
DataType a_type_ = TYPE_FP32;
DataType b_type_ = TYPE_FP32;
DataType c_type_ = TYPE_FP32;
DataType compute_type_ = TYPE_FP32;
// Check if data and inputs are valid in the Gemm class.
virtual void checkDataTypeValidity(const DataType& type);
};
// class Int8Gemm : public Gemm {
// protected:
// bool use_ORDER_COL32_2R_4R4_; // what is this?
// };
#ifdef SPARSITY_ENABLED
/**
* A Sparse Gemm class.
*
* NOTE:
 *   A, B, and C are assumed to have a row-major layout.
 *   There are two restrictions:
 *    - Only the case where the matrix B is sparse is supported.
 *    - Only TYPE_FP16 is supported for in/out data and compute types.
*/
class SpGemm: public Gemm {
protected:
cusparseLtHandle_t cusparselt_handle_;
std::map<std::string, cusparseLtMatDescriptor_t> a_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> b_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> c_desc_map_;
bool useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k);
public:
using Gemm::setComputeType;
using Gemm::setTypes;
using Gemm::setDefaultTypes;
using Gemm::setAllocator;
using Gemm::setCudaStream;
using Gemm::gemm;
using Gemm::batchedGemm;
using Gemm::stridedBatchedGemm;
/**
* @param allocator Resource allocator.
* @param stream A CUDA stream.
* @param config_file A file path of a GEMM configuration.
*/
    // TODO: Let's unify the algo map loading.
SpGemm(IAllocator* allocator,
cudaStream_t stream,
std::string config_file = GEMM_CONFIG,
std::string spconfig_file = SPGEMM_CONFIG);
~SpGemm();
std::string toString() override;
void loadGemmConfig(std::string config_file, std::string spconfig_file);
// Template method cannot be overridden.
void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f) override;
void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f) override;
void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha = 1.0f,
const float beta = 0.0f) override;
private:
void checkDataTypeValidity(const DataType& type) override;
    // Temporary gemm helper method that uses the template type T.
template<typename T>
void weightGemmHelper(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<T>& weight,
void* output,
const float alpha,
const float beta);
};
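// A minimal sketch of using SpGemm (the allocator, stream and device buffers are assumptions
// for illustration; the weight matrix B must already satisfy 2:4 sparsity, see
// pruneMatrixB()/compressMatrixB() below):
//
//   SpGemm sp_gemm(&allocator, stream);
//   sp_gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
//                d_input,  TYPE_FP16, k,
//                d_weight, TYPE_FP16, n,
//                d_output, TYPE_FP16, n);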
// class Int8SpGemm : public Int8Gemm, public SpGemm {
// };
#endif
/* ***************************** GEMM Exceptions ******************************* */
class GemmInvalidShapeException: public std::exception {
private:
std::string msg_ = "Invalid matrix shapes.";
public:
explicit GemmInvalidShapeException() = default;
template<typename... Args>
explicit GemmInvalidShapeException(const std::string format, const Args&... args): msg_(fmtstr(format, args...))
{
}
const char* what() const throw()
{
return msg_.c_str();
}
};
class GemmNotSupportedException: public std::exception {
private:
std::string msg_ = "Not supported exception.";
public:
explicit GemmNotSupportedException() = default;
template<typename... Args>
explicit GemmNotSupportedException(const std::string format, const Args&... args): msg_(fmtstr(format, args...))
{
}
const char* what() const throw()
{
return msg_.c_str();
}
};
class GemmInvalidException: public std::exception {
private:
std::string msg_ = "Invalid use of gemm.";
public:
explicit GemmInvalidException() = default;
template<typename... Args>
explicit GemmInvalidException(const std::string format, const Args&... args): msg_(fmtstr(format, args...))
{
}
const char* what() const throw()
{
return msg_.c_str();
}
};
/* ************************ End of GEMM Exceptions ************************ */
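// Illustrative use of the exception classes above (a sketch, not code from this project):
// each constructor forwards a printf-style format string and arguments to fmtstr().
//
//   if (Atype != TYPE_FP16 && Atype != TYPE_FP32) {
//       throw GemmNotSupportedException("Unsupported data type of A: %d", int(Atype));
//   }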
/* ***************************** GEMM utils ******************************* */
/**
* @brief Create method for the Gemm family.
*
* @param allocator Resource allocator.
* @param stream A CUDA stream.
 * @param sparse Whether to use sparse GEMM.
 * @param quantized Whether to use int8 quantized GEMM.
 * @return A shared pointer to a Gemm instance.
*/
std::shared_ptr<Gemm>
createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse = false, bool quantized = false);
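// A minimal sketch of the factory above (the allocator type and stream setup are assumptions
// used only for illustration):
//
//   Allocator<AllocatorType::CUDA> allocator(/*device_id=*/0);
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, /*sparse=*/false, /*quantized=*/false);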
cudaDataType_t getCublasDataType(DataType dtype);
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType dtype);
#else
cudaDataType_t getCublasComputeType(DataType dtype);
#endif
cublasOperation_t getCublasOperation(GemmOp op);
std::string getGemmOpString(const GemmOp& op);
#ifdef SPARSITY_ENABLED
cusparseOperation_t getCusparseOperation(GemmOp op);
cusparseComputeType getCusparseComputeType(DataType dtype);
/**
* @brief Prune a weight matrix (in-place).
*
* SpGemm supports a case when the sparse matrix is B in C=A*B.
*
 * @param data A device pointer to the matrix data to prune.
* @param stream A cuda stream object.
* @param k A number of rows of op(B).
* @param n A number of columns of op(B).
* @param trans A transpose operation that will be applied to the matrix
* (default: GEMM_OP_N).
*/
void pruneMatrixB(
void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans = GEMM_OP_N);
/**
* @brief Compress the B matrix in a specific sparsity format.
*
 * @param output A pointer that receives the address of the device buffer allocated for the compressed matrix.
 * @param allocator A resource allocator.
* @param stream A cuda stream object.
* @param input An input matrix to compress.
* @param k A number of rows of op(B).
* @param n A number of columns of op(B).
* @param trans A transpose operation that will be applied to the matrix (default: GEMM_OP_N).
*
* @return A size of the allocated device buffer of the compressed matrix.
*
 * @throw GemmInvalidException if the input matrix does not have 2:4 sparsity,
 *   or if a correct buffer size for the compressed matrix cannot be computed.
 * @throw std::runtime_error if any CUDA error occurs.
*/
size_t compressMatrixB(void** output,
IAllocator& allocator,
const cudaStream_t& stream,
const void* input,
const size_t k,
const size_t n,
const GemmOp trans = GEMM_OP_N);
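// A sketch of the prune-then-compress flow for a sparse weight B of shape k x n (buffer names
// are hypothetical and error handling is omitted):
//
//   pruneMatrixB(d_weight, stream, k, n);  // enforce 2:4 sparsity in place
//   void*  d_compressed     = nullptr;
//   size_t compressed_bytes = compressMatrixB(&d_compressed, allocator, stream, d_weight, k, n);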
#endif
/* ************************* End of GEMM utils **************************** */
} // end of namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
set(gemm_func_files
gemm_func.cc
)
set(encoder_gemm_func_files
encoder_gemm_func.cc
)
set(encoder_igemm_func_files
encoder_igemm_func.cc
)
set(decoding_gemm_func_files
decoding_gemm_func.cc
)
set(gpt_gemm_func_files
gpt_gemm_func.cc
)
set(xlnet_gemm_func_files
xlnet_gemm_func.cc
)
set(t5_gemm_func_files
t5_gemm_func.cc
)
set(swin_igemm_func_files
swin_igemm_func.cc
)
set(swin_gemm_func_files
swin_gemm_func.cc
)
add_library(gemm_func STATIC ${gemm_func_files})
target_link_libraries(gemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files})
target_link_libraries(encoder_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_gemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files})
target_link_libraries(encoder_igemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_igemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files})
target_link_libraries(decoding_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files})
target_link_libraries(gpt_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gpt_gemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files})
target_link_libraries(xlnet_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(t5_gemm_func STATIC ${t5_gemm_func_files})
target_link_libraries(t5_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(t5_gemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_igemm_func STATIC ${swin_igemm_func_files})
target_link_libraries(swin_igemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func encoder_igemm_func cuda_utils logger)
set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_gemm_func STATIC ${swin_gemm_func_files})
target_link_libraries(swin_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm_test/decoding_gemm_func.h"
namespace fastertransformer {
template<typename T>
void generate_decoding_gemm_config(int batch_size,
int beam_width,
int max_mem_seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend)
{
void* cublas_workspace;
void* buffer;
int workSpaceSize;
#ifdef ENABLE_BF16
if (std::is_same<T, half>::value || std::is_same<T, __nv_bfloat16>::value) {
#else
if (std::is_same<T, half>::value) {
#endif // ENABLE_BF16
// cublas_workspace_ should be the start pointer of cudaMalloc()
    // to ensure 16B alignment
cublas_workspace = buffer_in;
buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE);
workSpaceSize = CUBLAS_WORKSPACE_SIZE;
}
else {
cublas_workspace = nullptr;
buffer = buffer_in;
workSpaceSize = 0;
}
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
printf("Device %s\n", prop.name);
// check config
FILE* fd;
int line_count = 0;
if (!isAppend) {
fd = fopen(GEMM_CONFIG, "w+");
}
else {
fd = fopen(GEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM);
fclose(fd);
fd = fopen(GEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (GEMM_NUM + 3);
}
}
const int hidden_units = head_num * size_per_head;
const int gemm_num = 6;
int M[gemm_num];
int N[gemm_num];
int K[gemm_num];
int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1};
char mess[gemm_num][256];
// gemm 0
M[0] = batch_size * beam_width;
K[0] = hidden_units;
N[0] = K[0] * 3;
strcpy(mess[0], "from_tensor * weightQKV");
// gemm 1
M[1] = batch_size * beam_width;
K[1] = hidden_units;
N[1] = K[1];
strcpy(mess[1], "attr * output_kernel");
// gemm2
M[2] = batch_size * beam_width * max_mem_seq_len;
K[2] = mem_hidden_units;
N[2] = hidden_units;
strcpy(mess[2], "mem_tensor * weightK/V in cross attention");
// gemm 3
M[3] = batch_size * beam_width;
K[3] = hidden_units;
N[3] = inter_size;
strcpy(mess[3], "ffn gemm1 ");
// gemm 4
M[4] = batch_size * beam_width;
K[4] = inter_size;
N[4] = hidden_units;
strcpy(mess[4], "ffn gemm2");
// gemm5
M[5] = batch_size * beam_width;
K[5] = hidden_units;
N[5] = ceil(vocab_size / 8.) * 8;
strcpy(mess[5], "decoder_output * embedding_kernel -> embedding_output");
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
cudaDataType_t CType;
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
AType = CUDA_R_32F;
BType = CUDA_R_32F;
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
AType = CUDA_R_16BF;
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
using scaleT = typename ScaleTypeConverter<T>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
if (line_count == 0) {
fprintf(fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, "
"customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n");
}
for (int i = 0; i < gemm_num; ++i) {
int m = M[i], n = N[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
float exec_time = 99999.0f;
int fast_algo = 0;
int seq_len = i == 2 ? max_mem_seq_len : 1;
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
n,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
if (status != CUBLAS_STATUS_SUCCESS) {
break;
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
fast_algo = algo;
}
}
}
printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time);
        // for fp16 and bf16, we also benchmark cublasLt
if (data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
            // Let's try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
perfResults[0],
fd,
data_type,
0);
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
printf("***cublasLt Gemm Testing End***\n");
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
}
printf("***cublas Gemm Testing End***\n\n");
fclose(fd);
printf("***Decoding Gemm Testing End***\n");
return;
}
template void generate_decoding_gemm_config<float>(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
template void generate_decoding_gemm_config<half>(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
#ifdef ENABLE_BF16
template void generate_decoding_gemm_config<__nv_bfloat16>(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
#endif
size_t calDecodingGemmTestBufSizeInByte(int batch_size,
int beam_width,
int max_mem_seq_len,
int head_num,
int size_per_head,
int inter_size,
int memory_hidden_units,
int vocab_size,
CublasDataType data_type)
{
size_t buf_size_in_byte = 0;
const size_t tensor_para_size = 1;
const size_t hidden_units = head_num * size_per_head;
const size_t local_head_num = head_num / tensor_para_size;
const size_t local_hidden_units = local_head_num * size_per_head;
// int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half));
    // Because some buffers always use float, set wordSize to sizeof(float) directly.
int wordSize = sizeof(float);
size_t m = batch_size * beam_width;
std::vector<size_t> buff_size;
// for qkv gemm
buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units);
// for attention output gemm
buff_size.push_back(m * hidden_units + hidden_units * local_hidden_units + m * local_hidden_units);
// for memory_tensor gemm
buff_size.push_back(m * max_mem_seq_len * memory_hidden_units + memory_hidden_units * local_hidden_units
+ m * max_mem_seq_len * local_hidden_units);
// for context ffn gemm
buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size
+ m * hidden_units);
// for vocab
buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size
+ m * ceil(vocab_size / 8.) * 8 / tensor_para_size);
for (auto t : buff_size) {
buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t;
}
buf_size_in_byte *= wordSize;
buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0);
return buf_size_in_byte;
}
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
template<typename T>
void generate_decoding_gemm_config(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
size_t calDecodingGemmTestBufSizeInByte(int batch_size,
int beam_width,
int max_mem_seq_len,
int head_num,
int size_per_head,
int inter_size,
int memory_hidden_units,
int vocab_size,
CublasDataType data_type);
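// A sketch of how these two helpers are typically combined (the device buffer name is
// hypothetical):
//
//   size_t buf_size = calDecodingGemmTestBufSizeInByte(batch_size, beam_width, max_mem_seq_len,
//                                                      head_num, size_per_head, inter_size,
//                                                      mem_hidden_units, vocab_size, HALF_DATATYPE);
//   void* d_buffer = nullptr;
//   cudaMalloc(&d_buffer, buf_size);
//   generate_decoding_gemm_config<half>(batch_size, beam_width, max_mem_seq_len, head_num,
//                                       size_per_head, inter_size, vocab_size, mem_hidden_units,
//                                       d_buffer, /*isAppend=*/false);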
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm_test/encoder_gemm_func.h"
namespace fastertransformer {
template<typename T>
void generate_encoder_gemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend, int tensor_para_size)
{
void* cublas_workspace;
void* buffer;
int workSpaceSize;
#ifdef ENABLE_BF16
if (std::is_same<T, half>::value || std::is_same<T, __nv_bfloat16>::value) {
#else
if (std::is_same<T, half>::value) {
#endif // ENABLE_BF16
// cublas_workspace_ should be the start pointer of cudaMalloc()
    // to ensure 16B alignment
cublas_workspace = buffer_in;
buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE);
workSpaceSize = CUBLAS_WORKSPACE_SIZE;
}
else {
cublas_workspace = nullptr;
buffer = buffer_in;
workSpaceSize = 0;
}
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
printf("Device %s\n", prop.name);
// check config
FILE* fd;
int line_count = 0;
if (!isAppend) {
fd = fopen(GEMM_CONFIG, "w+");
}
else {
fd = fopen(GEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM);
fclose(fd);
fd = fopen(GEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (GEMM_NUM + 3);
}
}
const int gemm_num = 7;
int M[gemm_num];
int N[gemm_num];
int K[gemm_num];
int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1, 1};
char mess[gemm_num][256];
float exec_times[gemm_num];
// gemm1
M[0] = batch_size * seq_len;
K[0] = head_num * size_per_head;
N[0] = (head_num / tensor_para_size) * size_per_head;
strcpy(mess[0], "from_tensor * weightQ/K/V");
// gemm2
M[1] = M[0];
K[1] = head_num * size_per_head;
N[1] = 4 * head_num * size_per_head / tensor_para_size;
strcpy(mess[1], "attr_output * inter_kernel");
// gemm3
M[2] = M[0];
K[2] = 4 * head_num * size_per_head / tensor_para_size;
N[2] = head_num * size_per_head;
strcpy(mess[2], "inter_matmul * output_kernel");
M[3] = seq_len;
N[3] = seq_len;
K[3] = size_per_head;
batchCount[3] = batch_size * (head_num / tensor_para_size);
strcpy(mess[3], "attention batched Gemm1");
M[4] = seq_len;
N[4] = size_per_head;
K[4] = seq_len;
batchCount[4] = batch_size * (head_num / tensor_para_size);
strcpy(mess[4], "attention batched Gemm2");
M[5] = batch_size * seq_len;
N[5] = (head_num / tensor_para_size) * size_per_head;
K[5] = head_num * size_per_head;
batchCount[5] = 3;
strcpy(mess[5], "from_tensor * weight_QKV in BatchGemm");
M[6] = batch_size * seq_len;
K[6] = (head_num / tensor_para_size) * size_per_head;
N[6] = head_num * size_per_head;
strcpy(mess[6], "attr * output_kernel");
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
cudaDataType_t CType;
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
AType = CUDA_R_32F;
BType = CUDA_R_32F;
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
AType = CUDA_R_16BF;
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
using scaleT = typename ScaleTypeConverter<T, false>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
if (line_count == 0) {
fprintf(fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, "
"customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n");
}
for (int i = 0; i < gemm_num; ++i) {
// if(i != 0 && i != 5) continue;
int m = M[i], n = N[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
// array of pointer for batchedGemm
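        // darray layout: entries 0-3 hold A pointers, 4-7 hold B pointers, 8-11 hold C pointers;
        // only the first three entries of each group are filled, which is enough for the
        // batch-of-3 QKV projection benchmarked as gemm 5 below.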
T* harray[12];
harray[0] = (T*)buffer;
harray[1] = (T*)((char*)buffer + sizeof(T) * m * k);
harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k);
harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k);
harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n);
harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n);
harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n);
harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n);
harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n);
T** darray = 0;
check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12));
cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice);
T** dAarray = darray;
T** dBarray = darray + 4;
T** dCarray = darray + 8;
float exec_time = 99999.0f;
int fast_algo = 0;
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
if (i < 3) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
n,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 3) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seq_len,
seq_len,
size_per_head,
&alpha,
d_B,
BType,
size_per_head,
seq_len * size_per_head,
d_A,
AType,
size_per_head,
seq_len * size_per_head,
&beta,
d_C,
CType,
seq_len,
seq_len * seq_len,
batch_size * head_num,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 4) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
size_per_head,
seq_len,
seq_len,
&alpha,
d_B,
BType,
size_per_head,
seq_len * size_per_head,
d_A,
AType,
seq_len,
seq_len * seq_len,
&beta,
d_C,
CType,
size_per_head,
seq_len * size_per_head,
batch_size * head_num,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 5) {
status = cublasGemmBatchedEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
(const void* const*)dBarray,
BType,
n,
(const void* const*)dAarray,
AType,
k,
&beta,
(void* const*)dCarray,
CType,
n,
3,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
if (status != CUBLAS_STATUS_SUCCESS) {
break;
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
fast_algo = algo;
}
}
}
printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time);
        // for fp16 and bf16, we also benchmark cublasLt
if (i < 3 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
            // Let's try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(
batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time = perfResults[0].time;
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
printf("***cublasLt Gemm Testing End***\n");
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
exec_times[i] = exec_time;
cudaFree(darray);
}
printf("***cublas Gemm Testing End***\n\n");
fclose(fd);
printf("***Encoder Gemm Testing End***\n");
#ifdef SPARSITY_ENABLED
bool do_sparse_test = false;
if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) {
do_sparse_test = true;
}
if (do_sparse_test && sizeof(T) == sizeof(half)) {
printf("***cusparseLt Gemm Testing Begin***\n");
        // only the first 3 cases can be sparse
const int spgemm_num = 3;
if (!isAppend) {
fd = fopen(SPGEMM_CONFIG, "w+");
}
else {
fd = fopen(SPGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num);
fclose(fd);
fd = fopen(SPGEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (spgemm_num + 3);
}
}
if (line_count == 0) {
fprintf(
fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n");
}
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F;
unsigned alignment = 16;
cudaStream_t stream = 0;
float alpha2 = 1.0f;
float beta2 = 0.0f;
for (int i = 0; i < spgemm_num; ++i) {
// to be compatible with spgemm wrapper, we let A be the weight matrix
// so m and n are swapped
// A: mxk B: kxn C:mxn
int m = N[i], n = M[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
T* dA_compressed;
{
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(
cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
size_t compressed_size;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
}
float exec_time = 99999.0f;
int fast_algo = 0;
for (int alg = 0; alg < 4; ++alg) {
cudaDeviceSynchronize();
cusparseLtMatDescriptor_t matA, matB, matC;
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream};
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
                // Initializing the matrix descriptors takes a lot of time,
                // and these descriptors could be cached elsewhere,
                // whereas caching the MatmulPlan elsewhere causes errors.
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
&handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size))
CHECK_CUSPARSE(cusparseLtMatmul(&handle,
&plan,
&alpha2,
dA_compressed,
d_B,
&beta2,
d_C,
d_C,
d_workspace,
streams,
num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
fast_algo = alg;
}
}
exec_time /= ites;
if (exec_time >= exec_times[i]) {
fast_algo = -1;
}
printf("fast_algo %d\n", fast_algo);
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %f\n",
batch_size,
seq_len,
head_num,
size_per_head,
HALF_DATATYPE,
batchCount[i],
m,
n,
k,
fast_algo,
exec_time);
cudaFree(dA_compressed);
}
CHECK_CUSPARSE(cusparseLtDestroy(&handle))
fclose(fd);
printf("***cusparseLt Gemm Testing End***\n");
}
#endif
return;
}
template void generate_encoder_gemm_config<float>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
template void generate_encoder_gemm_config<half>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
#ifdef ENABLE_BF16
template void generate_encoder_gemm_config<__nv_bfloat16>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
template<typename T>
void generate_encoder_gemm_config(int batch_size,
int seq_len,
int head_num,
int size_per_head,
void* buffer,
bool isAppend = true,
int tensor_para_size = 1);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "encoder_igemm_func.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
int batch_size_;
int seq_len_;
int head_num_;
int size_per_head_;
static const char* showStatus(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
// Utility function to print customMatmulPerf_t structure
int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint)
{
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages;
const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo;
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
"time %f workspace=%d mathMode=%d waves=%f\n",
algoId,
tile,
matmulTileName[tile],
numSplitsK,
reductionScheme,
swizzle,
customOption,
stages,
perf.status,
perf.time,
(int)perf.workspaceSize,
(int)perf.mathMode,
perf.wavesCount);
    // choose the fastest algo that does not need workspace
if ((int)perf.workspaceSize == 0 && hasPrint == 0) {
fprintf(fout,
"%d %d %d %d %d ### 1 %d %d %d %d %d %d %d %d %d %d %d %f\n",
batch_size_,
seq_len_,
head_num_,
size_per_head_,
INT8_DATATYPE,
m,
n,
k,
algoId,
customOption,
tile,
numSplitsK,
swizzle,
reductionScheme,
(int)perf.workspaceSize,
stages,
perf.time);
return 1;
}
else {
return hasPrint;
}
}
int printBatchPerfStructure(
int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint)
{
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages;
const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo;
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
"time %f workspace=%d mathMode=%d waves=%f\n",
algoId,
tile,
matmulTileName[tile],
numSplitsK,
reductionScheme,
swizzle,
customOption,
stages,
perf.status,
perf.time,
(int)perf.workspaceSize,
(int)perf.mathMode,
perf.wavesCount);
    // choose the fastest algo that does not need workspace
if ((int)perf.workspaceSize == 0 && hasPrint == 0) {
fprintf(fout,
"%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d %f\n",
batch_size_,
seq_len_,
head_num_,
size_per_head_,
INT8_DATATYPE,
batchCount,
m,
n,
k,
algoId,
customOption,
tile,
numSplitsK,
swizzle,
reductionScheme,
(int)perf.workspaceSize,
stages,
perf.time);
return 1;
}
else {
return hasPrint;
}
}
static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b)
{
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time));
}
static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU)
cublasLtMatmulDesc_t operationDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t& algo,
int kernelRepeats,
void* workSpace,
size_t workSpaceSizeInBytes,
customMatmulPerf_t& perfResults,
cudaStream_t stream)
{
cublasLtMatmulHeuristicResult_t heurResult;
/* Looping over the Algo */
int repeats = kernelRepeats;
cublasStatus_t algoStatus =
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
workSpace,
workSpaceSizeInBytes,
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
perfResults.time = time / repeats;
perfResults.workspaceSize = heurResult.workspaceSize;
perfResults.wavesCount = heurResult.wavesCount;
}
}
else {
// printf("not enough workspace! %ld\n", heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
}
else {
// printf("check fail!\n");
}
return algoStatus;
}
// Sample wrapper running through multiple algo and config attribute combinations for INT8 gemm using the cublasLt
// low-level API
template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaStream_t stream = 0;
    // SplitK values that we are going to try when SplitK is supported for a given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
    // Let's try a fixed number of combinations
#define ALGO_COMBINATIONS 50000
int AlgoCombinations = ALGO_COMBINATIONS;
int AlgoCount = 0;
    int kernelRepeats = 100;  // number of times the CUDA kernels will be run back to back
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
int nbAlgoIds = 0;
#define ALGO_IDS 100
int algoIdA[ALGO_IDS];
cudaDataType_t Atype, Btype, Ctype, scaleType;
Atype = CUDA_R_8I;
Btype = CUDA_R_8I;
if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
Ctype = CUDA_R_32I;
scaleType = CUDA_R_32I;
}
else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
Ctype = CUDA_R_8I;
scaleType = CUDA_R_32F;
}
else {
printf("[ERROR]<T,scaleT> of igemm is invalid\n");
exit(-1);
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasOperation_t opTranspose = CUBLAS_OP_T;
bool use_ORDER_COL32_2R_4R4 = false;
#if (CUDART_VERSION >= 11000)
int device{-1};
cudaGetDevice(&device);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
if (props.major * 10 + props.minor >= 80) {
use_ORDER_COL32_2R_4R4 = true;
}
#endif
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
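    // Leading dimensions of the transformed layouts: COL32 packs 32 columns per interleaved
    // "super-column", so A and C use ld = 32 * m; matrix B uses COL32_2R_4R4 (Ampere+) or
    // COL4_4R2_8C, which round n up to a multiple of 32 or 8 before the same 32x packing.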
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
// Create matrix descriptors.
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status =
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Request AlgoId available for IGEMM
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Loop over the Algo IDs
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
        /* Initialize algo structure with given Algo ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
        // Retrieve Algo Capabilities attributes to be able to set up the loop over the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
                    // Loop over the splitK values in the fixed sequence splitKSequenceA, in addition to the case
                    // where splitK is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printPerfStructure(m, n, k, perfResults[i], fout, hasPrint);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const int* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const int* beta, /* host pointer */
int32_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const float* beta, /* host pointer */
int8_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
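// A minimal sketch of calling the INT8 search above (pointer names, the workspace and the
// output file are hypothetical):
//
//   int alpha = 1, beta = 0;
//   LtIgemmCustomFind<int32_t, int>(ltHandle, m, n, k, &alpha, d_A_int8, d_B_int8, &beta,
//                                   d_C_int32, cublas_workspace, CUBLAS_WORKSPACE_SIZE, fd);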
template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaStream_t stream = 0;
    // SplitK values that we are going to try when SplitK is supported for a given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
    // Let's try a fixed number of combinations
#define ALGO_COMBINATIONS 50000
int AlgoCombinations = ALGO_COMBINATIONS;
int AlgoCount = 0;
    int kernelRepeats = 100;  // number of times the CUDA kernels will be run back to back
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
int nbAlgoIds = 0;
#define ALGO_IDS 100
int algoIdA[ALGO_IDS];
cudaDataType_t Atype, Btype, Ctype, scaleType;
Atype = CUDA_R_8I;
Btype = CUDA_R_8I;
if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
Ctype = CUDA_R_32I;
scaleType = CUDA_R_32I;
}
else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
Ctype = CUDA_R_8I;
scaleType = CUDA_R_32F;
}
else {
printf("[ERROR]<T,scaleT> of igemm is invalid\n");
exit(-1);
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasOperation_t opTranspose = CUBLAS_OP_T;
bool use_ORDER_COL32_2R_4R4 = false;
#if (CUDART_VERSION >= 11000)
int device{-1};
cudaGetDevice(&device);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
if (props.major * 10 + props.minor >= 80) {
use_ORDER_COL32_2R_4R4 = true;
}
#endif
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
int64_t stridea, strideb, stridec;
stridea = m * k;
strideb = n * k;
stridec = m * n;
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
// Create matrix descriptors.
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status =
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
// Request AlgoId available for IGEMM
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Loop over the Algo IDs
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
/* Initialize algo structure with given Algo ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
// Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
// Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case
// where splitK is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printBatchPerfStructure(batchCount, m, n, k, perfResults[i], fout, hasPrint);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const int* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const int* beta, /* host pointer */
int32_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const float* beta, /* host pointer */
int8_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
// initialize matrix in column-major
void matInit(int rows, int cols, int8_t* p, int ld)
{
srand(time(NULL));
for (int c = 0; c < cols; c++) {
for (int r = 0; r < rows; r++) {
int index = r + c * ld;
p[index] = rand() % 255 - 127;
}
}
}
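// matInit fills a column-major int8 matrix (leading dimension ld >= rows) with
// pseudo-random values in [-127, 127]. A minimal usage sketch (hypothetical host
// buffer names; not part of the original code):
//   std::vector<int8_t> h_A(lda * cols);
//   matInit(rows, cols, h_A.data(), lda);
//   check_cuda_error(cudaMemcpy(d_A, h_A.data(), h_A.size(), cudaMemcpyHostToDevice));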
int batch_igemm_config(int batchCount, int m, int n, int k, FILE* fout, void* buffer)
{
printf("batchCount %d m %d n %d k %d\n", batchCount, m, n, k);
int alpha = 1;
int beta = 0;
int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major
int8_t* d_B = d_A + batchCount * m * k; // k * n, stored in column-major
int32_t* d_C = (int32_t*)(d_B + batchCount * k * n); // m * n, stored in column-major
cublasLtHandle_t ltHandle;
cublasLtCreate(&ltHandle);
LtBatchIgemmCustomFind(ltHandle,
batchCount,
m,
n,
k,
&alpha, /* host pointer */
d_A,
d_B,
&beta, /* host pointer */
d_C,
NULL,
0,
fout);
// free memory
cublasLtDestroy(ltHandle);
return 0;
}
int igemm_config(int m, int n, int k, FILE* fout, void* buffer)
{
printf("batchCount %d m %d n %d k %d\n", 1, m, n, k);
int alpha = 1;
int beta = 0;
int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major
int8_t* d_B = d_A + m * k; // k * n, stored in column-major
int32_t* d_C = (int32_t*)(d_B + k * n); // m * n, stored in column-major
cublasLtHandle_t ltHandle;
cublasLtCreate(&ltHandle);
LtIgemmCustomFind(ltHandle,
m,
n,
k,
&alpha, /* host pointer */
d_A,
d_B,
&beta, /* host pointer */
d_C,
NULL,
0,
fout);
cublasLtDestroy(ltHandle);
return 0;
}
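// generate_encoder_igemm_config() below benchmarks six GEMM shapes with the helpers
// above; they appear to correspond to the encoder layer's QKV projection (batchCount 3),
// Q*K^T and softmax(QK)*V (batched by batch_size * head_num), the attention output
// projection, and the two FFN GEMMs (n = 4 * hidden and k = 4 * hidden).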
int generate_encoder_igemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend)
{
// ensure program running on SM >= 7.5
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) {
printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n ");
exit(-1);
}
printf("Device %s\n", prop.name);
// check config
FILE* fout;
if (!isAppend) {
fout = fopen(IGEMM_CONFIG, "w+");
fprintf(
fout,
"batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n");
}
else {
fout = fopen(IGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fout) != NULL) {
config.push_back(std::string(line));
}
if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) {
int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM;
fclose(fout);
fout = fopen(IGEMM_CONFIG, "w+");
for (int i = startIdx; i < (int)config.size(); i++) {
fprintf(fout, "%s", config[i].c_str());
}
}
}
batch_size_ = batch_size;
seq_len_ = seq_len;
head_num_ = head_num;
size_per_head_ = size_per_head;
int m = batch_size * seq_len;
int n = head_num * size_per_head;
int k = n;
int batchCount;
printf("***Encoder IGemm Testing Begin***\n");
printf("\n-----------------------------\n");
batchCount = 3;
m = batch_size * seq_len;
k = head_num * size_per_head;
n = k;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
batch_igemm_config(batchCount, m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = seq_len;
n = seq_len;
k = size_per_head;
batchCount = batch_size * head_num;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
batch_igemm_config(batchCount, m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = seq_len;
n = size_per_head;
k = seq_len;
batchCount = batch_size * head_num;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
batch_igemm_config(batchCount, m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = batch_size * seq_len;
n = head_num * size_per_head;
k = head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
n = 4 * n;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
n = k;
k = 4 * n;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config(m, n, k, fout, buffer);
}
fclose(fout);
printf("\n-----------------------------\n");
printf("***Encoder IGemm Testing End***\n");
#ifdef SPARSITY_ENABLED
bool do_sparse_test = false;
if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) {
do_sparse_test = true;
}
if (do_sparse_test) {
printf("***cusparseLt Gemm Testing Begin***\n");
const int spgemm_num = 3;
FILE* fd;
int line_count = 0;
const int ites = 100;
struct timeval start, end;
if (!isAppend) {
fd = fopen(SPIGEMM_CONFIG, "w+");
}
else {
fd = fopen(SPIGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1))  // +1 for the header row, which is not a config entry
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num);
fclose(fd);
fd = fopen(SPIGEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (spgemm_num + 3);
}
}
if (line_count == 0) {
fprintf(
fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n");
}
int M[spgemm_num];
int N[spgemm_num];
int K[spgemm_num];
// gemm1
M[0] = batch_size * seq_len;
K[0] = head_num * size_per_head;
N[0] = K[0];
// gemm2
M[1] = M[0];
K[1] = K[0];
N[1] = 4 * N[0];
// gemm3
M[2] = M[0];
K[2] = 4 * K[0];
N[2] = N[0];
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseOrder_t col_order = CUSPARSE_ORDER_COL;
cusparseOrder_t row_order = CUSPARSE_ORDER_ROW;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I;
unsigned alignment = 16;
cudaStream_t stream = 0;
float alpha2 = 1.0f;
float beta2 = 0.0f;
for (int i = 0; i < spgemm_num; ++i) {
// to be compatible with spgemm wrapper, we let A be the weight matrix
// so m and n are swapped
// A: mxk B: kxn C:mxn
int m = N[i], n = M[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n);
int8_t* d_A = (int8_t*)buffer;
int8_t* d_B = d_A + m * k;
int8_t* d_C = d_B + k * n;
int8_t* dA_compressed;
{
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(
cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
size_t compressed_size;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
}
cudaDeviceSynchronize();
cudaError_t result = cudaGetLastError();
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + cudaGetErrorString(result));
}
float exec_time = 99999.0f;
int fast_algo = 0;
for (int alg = 0; alg < 4; ++alg) {
cudaDeviceSynchronize();
cusparseLtMatDescriptor_t matA, matB, matC;
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream};
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_8I, col_order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_8I, col_order))
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
// initializing the matrix descriptors is expensive, so they could be cached elsewhere,
// whereas caching the matmul plan elsewhere causes errors
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
&handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size))
CHECK_CUSPARSE(cusparseLtMatmul(&handle,
&plan,
&alpha2,
dA_compressed,
d_B,
&beta2,
d_C,
d_C,
d_workspace,
streams,
num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
fast_algo = alg;
}
}
exec_time /= ites;
printf("fast_algo %d\n", fast_algo);
fprintf(fd,
"%d %d %d %d %d ### 1 %d %d %d %d %f\n",
batch_size,
seq_len,
head_num,
size_per_head,
HALF_DATATYPE,
m,
n,
k,
fast_algo,
exec_time);
cudaFree(dA_compressed);
}
CHECK_CUSPARSE(cusparseLtDestroy(&handle))
fclose(fd);
printf("***cusparseLt Gemm Testing End***\n");
}
#endif
return 0;
}
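// A minimal driver sketch (hypothetical, not part of this file): the device buffer is
// assumed to be sized with calGemmTestBufSizeInByte() (defined later in this check-in)
// using the INT8 path (int8_mode > 0), on which the data_type argument is unused:
//   void*  d_buf = nullptr;
//   size_t bytes = calGemmTestBufSizeInByte(batch_size, seq_len, head_num, size_per_head,
//                                           4 * head_num * size_per_head, /*vocab_size=*/0,
//                                           /*int8_mode=*/1, HALF_DATATYPE /*unused here*/);
//   check_cuda_error(cudaMalloc(&d_buf, bytes));
//   generate_encoder_igemm_config(batch_size, seq_len, head_num, size_per_head, d_buf, false);
//   check_cuda_error(cudaFree(d_buf));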
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <algorithm>
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
/* CAUTION : must match cublasLtMatmulTile_t */
const char* const matmulTileName[] = {"UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8",
"8x64", "16x32", "32x16", "64x8", "32x32", "32x64", "64x32",
"32x128", "64x64", "128x32", "64x128", "128x64", "64x256", "128x128",
"256x64", "64x512", "128x256", "256x128", "512x64", "64x96", "96*64",
"96x128", "128x160", "160x128", "192x128", "128x192", "128x96", "END"};
int generate_encoder_igemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);
int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);
int printBatchPerfStructure(
int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);
template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
void matInit(int rows, int cols, int8_t* p, int ld);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "encoder_gemm_func.h"
#include <assert.h>
#include <sys/types.h>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
// Utility function to print customMatmulPerf_t structure
int printPerfStructure(int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const customMatmulPerf_t& perf,
FILE* fout,
CublasDataType data_type,
int hasPrint,
int batch_count)
{
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages;
const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo;
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
uint16_t inner_shapeId, cluster_shapeId;
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &inner_shapeId, sizeof(inner_shapeId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &cluster_shapeId, sizeof(cluster_shapeId), NULL);
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
uint16_t mma_shapeId, cga_shapeId, sche_mode;
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &mma_shapeId, sizeof(mma_shapeId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &cga_shapeId, sizeof(cga_shapeId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &sche_mode, sizeof(sche_mode), NULL);
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d "
#if (CUDART_VERSION >= 11000)
"stages=%d "
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"inner_shapeId=%d cluster_shapeId=%d"
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"mma_shapeId=%d cga_shapeId=%d schedule_mode=%d"
#endif
"} status %d "
"time %fms workspace=%d mathMode=%d waves=%f\n",
algoId,
tile,
matmulTileName[tile],
numSplitsK,
reductionScheme,
swizzle,
customOption,
#if (CUDART_VERSION >= 11000)
stages,
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
inner_shapeId,
cluster_shapeId,
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
mma_shapeId,
cga_shapeId,
sche_mode,
#endif
perf.status,
perf.time,
(int)perf.workspaceSize,
(int)perf.mathMode,
perf.wavesCount);
if (hasPrint == 0) {
fprintf(fout,
"%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"%d %d "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"%d %d %d "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batch_count,
m,
n,
k,
algoId,
customOption,
tile,
numSplitsK,
swizzle,
reductionScheme,
(int)perf.workspaceSize,
stages,
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
inner_shapeId,
cluster_shapeId,
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
mma_shapeId,
cga_shapeId,
sche_mode,
#endif
perf.time);
return 1;
}
else {
return hasPrint;
}
}
static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b)
{
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time));
}
static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU)
cublasLtMatmulDesc_t operationDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t& algo,
int kernelRepeats,
void* workSpace,
size_t workSpaceSizeInBytes,
customMatmulPerf_t& perfResults,
cudaStream_t stream,
cudaEvent_t& startEvent,
cudaEvent_t& stopEvent)
{
cublasLtMatmulHeuristicResult_t heurResult;
/* Looping over the Algo */
int repeats = kernelRepeats;
cublasStatus_t algoStatus =
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
cudaError_t err, err1, err2, err3;
err = cudaEventRecord(startEvent, stream);
for (int loop = 0; loop < repeats; loop++) {
cublasStatus_t oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
workSpace,
workSpaceSizeInBytes,
stream);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
break;
}
}
err1 = cudaEventRecord(stopEvent, stream);
err2 = cudaEventSynchronize(stopEvent);
float time;
err3 = cudaEventElapsedTime(&time, startEvent, stopEvent);
if ((err != cudaSuccess) || (err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) {
algoStatus = CUBLAS_STATUS_INTERNAL_ERROR;
}
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
perfResults.time = time / repeats;
perfResults.workspaceSize = heurResult.workspaceSize;
perfResults.wavesCount = heurResult.wavesCount;
}
}
else {
// printf("not enough workspace! %ld\n", heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
}
return algoStatus;
}
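// customMatmulRun() first validates the candidate configuration with
// cublasLtMatmulAlgoCheck(), rejects it when the required workspace exceeds the caller's
// budget, then launches cublasLtMatmul() kernelRepeats times between two CUDA events;
// the reported time is the elapsed event time divided by the repeat count, and only
// successful runs populate perfResults.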
template<typename T, typename scaleT>
int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const T* A,
const T* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cudaEvent_t startEvent;
cudaEvent_t stopEvent;
CublasDataType data_type;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
cudaStream_t stream = 0;
// SplitK value that we are going to try when SplitK is supported for a
// given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// Let's try a fixed number of combinations
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
int maxNumTraversal = 50; // max number of traversal
cublasLtMatmulAlgo_t algos[AlgoCombinations]; // 0 <= workspace <= 32MB
cublasLtMatmulAlgo_t algosRestrict[AlgoCombinations]; // workspace == 0
int kernelRepeats = 100; // number of times the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
Atype = CUDA_R_32F, Btype = CUDA_R_32F, Ctype = CUDA_R_32F, Dtype = CUDA_R_32F;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
Atype = CUDA_R_16F, Btype = CUDA_R_16F, Ctype = CUDA_R_16F, Dtype = CUDA_R_16F;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
Atype = CUDA_R_16BF, Btype = CUDA_R_16BF, Ctype = CUDA_R_16BF, Dtype = CUDA_R_16BF;
}
#endif
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value) {
data_type = FP8_DATATYPE;
Atype = CUDA_R_8F_E4M3, Btype = CUDA_R_8F_E4M3, Ctype = CUDA_R_16BF;
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
Dtype = CUDA_R_16BF;
#else
Dtype = dtype_fp8;
#endif
}
#endif
if (sizeof(scaleT) == sizeof(float)) {
scaleType = CUDA_R_32F;
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
}
else {
scaleType = CUDA_R_16F;
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
}
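// Summary of the type mapping above:
//   T = float          -> A/B/C/D = CUDA_R_32F
//   T = half           -> A/B/C/D = CUDA_R_16F
//   T = __nv_bfloat16  -> A/B/C/D = CUDA_R_16BF                                  (ENABLE_BF16)
//   T = __nv_fp8_e4m3  -> A/B = CUDA_R_8F_E4M3, C = CUDA_R_16BF, D = dtype_fp8
//                         (or CUDA_R_16BF when FP8_GEMM_OUTPUT_QUANT_DISABLE)    (ENABLE_FP8)
//   scaleT = float -> CUDA_R_32F scale / CUBLAS_COMPUTE_32F; otherwise the 16F variants.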
const cublasOperation_t tA = data_type == FP8_DATATYPE ? CUBLAS_OP_T : CUBLAS_OP_N;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t for
// details about defaults; here we just need to set the transforms for A and
// B
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType,
scaleType); // creates a matrix multiply descriptor
#else
status = cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
#ifdef ENABLE_FP8
if (data_type == FP8_DATATYPE) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
status = cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode)));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
}
#endif
// Create matrix descriptors. We are good with the details here so no need
// to set any extra attributes
if (data_type == FP8_DATATYPE) {
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, k, m, k);
}
else {
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, m);
}
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, k, n, k);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, m);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Ddesc, Dtype, m, n, m);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
// Create CUDA event to time the execution time of each algo
if (cudaEventCreate(&startEvent, cudaEventBlockingSync) != cudaSuccess) {
goto CLEANUP;
}
if (cudaEventCreate(&stopEvent, cudaEventBlockingSync) != cudaSuccess) {
goto CLEANUP;
}
// Request the first ALGO_IDS (100) available algo IDs
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
if (nbAlgoIds > ALGO_IDS) {
printf(
"Warning: the algo ID count is not large enough to guarantee finding the best algo (%d returned, %d requested)\n", nbAlgoIds, ALGO_IDS);
}
// Loop over the Algo IDs
// This loop doesn't work for fp8 gemm
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
/* Initialize algo structure with given Algo ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
// Retrieve Algo Capabilities attributes to be able to setup loop over
// the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
// Loop over the splitK value over a fixed sequence
// splitKSequenceA in addition to the case where splitK
// is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
cublasLtMatmulHeuristicResult_t heurResult;
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
if (heurResult.workspaceSize > workSpaceSize) {
// printf("not enough workspace!
// %ld\n",
// heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
else if (heurResult.workspaceSize == 0) {
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algosRestrict[AlgoCountRestrict++] = algo;
}
}
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algos[AlgoCount++] = algo;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
cublasLtMatmulHeuristicResult_t heurResult;
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
if (heurResult.workspaceSize > workSpaceSize) {
// printf("not enough workspace! %ld\n",
// heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
else if (heurResult.workspaceSize == 0) {
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algosRestrict[AlgoCountRestrict++] = algo;
}
}
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algos[AlgoCount++] = algo;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
printf("AlgoCount: %d\n", AlgoCount);
if (data_type == FP8_DATATYPE) {
assert(AlgoCount == 0);
}
if (AlgoCount < maxNumTraversal && data_type != FP8_DATATYPE) {
// 0 <= workspacesize <= 32MB
for (int i = 0; i < AlgoCount; i++) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algos[i],
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[i],
stream,
startEvent,
stopEvent);
perfResults[i].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) AlgoCount++;
}
}
else {
// Heuristic + workspacesize==0
AlgoCount = 0;
nbAlgoIds = 0;
cublasLtMatmulPreference_t pref;
cublasLtMatmulPreferenceCreate(&pref);
uint64_t maxWorkSpaceSize = workSpaceSize; //(32MB)
cublasLtMatmulPreferenceSetAttribute(
pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &maxWorkSpaceSize, sizeof(maxWorkSpaceSize));
cublasLtMatmulHeuristicResult_t heuristicResultsArray[maxNumTraversal];
cublasLtMatmulAlgoGetHeuristic(ltHandle,
operationDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
pref,
maxNumTraversal,
heuristicResultsArray,
&nbAlgoIds);
cublasLtMatmulPreferenceDestroy(pref);
printf("return %d and run heuristic algo\n", nbAlgoIds);
for (int i = 0; i < nbAlgoIds; i++) {
if (heuristicResultsArray[i].state == CUBLAS_STATUS_SUCCESS) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Ddesc,
heuristicResultsArray[i].algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream,
startEvent,
stopEvent);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
// workspacesize==0
printf("workspacesize==0, run %d algos\n", AlgoCountRestrict);
for (int i = 0; i < AlgoCountRestrict && i < (maxNumTraversal - nbAlgoIds); i++) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Ddesc,
algosRestrict[i],
kernelRepeats,
NULL,
0,
perfResults[AlgoCount],
stream,
startEvent,
stopEvent);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printPerfStructure(batch_size,
seq_len,
head_num,
size_per_head,
m,
n,
k,
perfResults[i],
fout,
data_type,
hasPrint,
batchCount);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
if (startEvent) {
cudaEventDestroy(startEvent);
}
if (stopEvent) {
cudaEventDestroy(stopEvent);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
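// A minimal call sketch for the FP16-data / FP32-scale instantiation (hypothetical
// names; d_A, d_B, d_C and d_workspace are assumed to be device buffers allocated
// elsewhere, and dtype_fp8 is only consulted on the FP8 path):
//   customMatmulPerf_t perf[100];
//   float alpha = 1.0f, beta = 0.0f;  // scaleT = float -> CUBLAS_COMPUTE_32F
//   LtHgemmCustomFind<half, float>(ltHandle, batch_size, seq_len, head_num, size_per_head,
//                                  m, n, k, &alpha, d_A, d_B, &beta, d_C,
//                                  d_workspace, CUBLAS_WORKSPACE_SIZE, fout, perf,
//                                  /*AlgoCombinations=*/100, CUDA_R_16F,
//                                  /*batchCount=*/1, 0, 0, 0);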
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const float* A,
const float* B,
const float* beta, /* host pointer */
float* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const half* alpha, /* host pointer */
const half* A,
const half* B,
const half* beta, /* host pointer */
half* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
#ifdef ENABLE_BF16
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const __nv_bfloat16* A,
const __nv_bfloat16* B,
const float* beta, /* host pointer */
__nv_bfloat16* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
#endif
#ifdef ENABLE_FP8
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const __nv_fp8_e4m3* A,
const __nv_fp8_e4m3* B,
const float* beta, /* host pointer */
__nv_fp8_e4m3* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
#endif
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const half* A,
const half* B,
const float* beta, /* host pointer */
half* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
size_t calGemmTestBufSizeInByte(int batch_size,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int int8_mode,
CublasDataType data_type)
{
size_t buf_size_in_byte;
if (int8_mode > 0) {
int m = batch_size * seq_len;
int n = head_num * size_per_head;
int k = n;
size_t size1 = 3 * (m * k * sizeof(int8_t) + k * n * sizeof(int8_t) + m * n * sizeof(int));
size_t size2 = batch_size * head_num
* (seq_len * size_per_head * sizeof(int8_t) + size_per_head * seq_len * sizeof(int8_t)
+ seq_len * seq_len * sizeof(int));
size_t size3 = batch_size * head_num
* (seq_len * seq_len * sizeof(int8_t) + seq_len * size_per_head * sizeof(int8_t)
+ seq_len * size_per_head * sizeof(int));
size_t size4 = m * k * sizeof(int8_t) + k * inter_size * sizeof(int8_t) + m * inter_size * sizeof(int);
size_t size5 = m * k * sizeof(int8_t) + k * vocab_size * sizeof(int8_t) + m * vocab_size * sizeof(int);
buf_size_in_byte = size1 > size2 ? size1 : size2;
buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3;
buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4;
buf_size_in_byte = buf_size_in_byte > size5 ? buf_size_in_byte : size5;
}
else {
size_t m = batch_size * seq_len;
size_t n = head_num * size_per_head;
size_t k = n;
// TODO need to add bfloat16 here
int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half));
size_t size1 = 3 * (m * k + k * n + m * n) * wordSize;
size_t size2 = (size_t)batch_size * (size_t)head_num
* ((size_t)seq_len * (size_t)seq_len + (size_t)seq_len * (size_t)size_per_head
+ (size_t)seq_len * (size_t)size_per_head)
* (size_t)wordSize;
size_t size3 = (m * k + k * inter_size + m * inter_size) * wordSize;
size_t size4 = (m * k + k * vocab_size + m * vocab_size) * wordSize;
buf_size_in_byte = size1 > size2 ? size1 : size2;
buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3;
buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4;
buf_size_in_byte +=
((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0);
}
return buf_size_in_byte;
}
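// calGemmTestBufSizeInByte() returns the maximum over the per-GEMM buffer requirements
// (A + B + C bytes for each shape exercised by the gemm tests), adding the extra
// CUBLAS_WORKSPACE_SIZE on the FP16/BF16 paths; the caller allocates one device buffer
// of this size and reuses it for every test.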
size_t calGemmTestBufSizeInByteXlnet(
int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16)
{
int M[10] = {0};
int N[10] = {0};
int K[10] = {0};
int batchCount[10] = {0};
// gemm1
M[0] = hidden_units;
N[0] = seq_len * batch_size;
K[0] = hidden_units;
batchCount[0] = 3;
// gemm2
M[1] = hidden_units;
N[1] = seq_len * 2;
K[1] = hidden_units;
batchCount[1] = 1;
// gemm3
M[2] = seq_len;
N[2] = seq_len;
K[2] = size_per_head;
batchCount[2] = batch_size * head_num;
// gemm4
M[3] = seq_len * 2;
N[3] = seq_len;
K[3] = size_per_head;
batchCount[3] = batch_size * head_num;
// gemm5
M[4] = 2;
N[4] = seq_len;
K[4] = size_per_head;
batchCount[4] = batch_size * head_num;
// gemm6
M[5] = head_num;
N[5] = seq_len;
K[5] = 2;
// gemm7
M[6] = size_per_head;
N[6] = seq_len;
K[6] = seq_len;
batchCount[6] = batch_size * head_num;
// gemm8
M[7] = hidden_units;
N[7] = seq_len;
K[7] = hidden_units;
batchCount[7] = batch_size;
// gemm9
M[8] = inter_size;
N[8] = seq_len;
K[8] = hidden_units;
batchCount[8] = batch_size;
// gemm10
M[9] = hidden_units;
N[9] = seq_len;
K[9] = inter_size;
batchCount[9] = batch_size;
size_t max_size = 0;
for (int i = 0; i < 10; ++i) {
size_t size = ((size_t)M[i] * N[i] + (size_t)M[i] * K[i] + (size_t)N[i] * K[i]) * (size_t)batchCount[i];
if (size > max_size) {
max_size = size;
}
}
int size_per_ele = 4;
if (is_fp16 == true) {
size_per_ele = 2;
}
return max_size * size_per_ele;
}
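// calGemmTestBufSizeInByteXlnet() applies the same idea to the ten XLNet GEMM shapes:
// the largest (A + B + C) element count across the batched shapes, times 2 bytes for
// FP16 or 4 bytes for FP32.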
} // namespace fastertransformer