Unverified commit 9efcac38, authored by Li Zhang, committed via GitHub (parent 720fc533)

check-in fastertransformer (#7)

* add ft code
* gitignore
* fix lint
* revert fmha
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include <memory>
namespace ft = fastertransformer;
template<typename T>
struct LlamaTritonSharedModelInstance {
std::unique_ptr<ft::LlamaV2<T>> llm;
std::shared_ptr<ft::LlamaWeight<T>> llm_weight;
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator;
std::unique_ptr<ft::cublasAlgoMap> cublas_algo_map;
std::unique_ptr<std::mutex> cublas_wrapper_mutex;
std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper;
std::unique_ptr<cudaDeviceProp> cuda_device_prop_ptr;
const int session_len;
};
template<typename T>
struct LlamaTritonModelInstance: AbstractTransformerModelInstance {
LlamaTritonModelInstance(std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance,
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator);
~LlamaTritonModelInstance();
std::shared_ptr<std::vector<triton::Tensor>>
forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) override;
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors) override;
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
ft::AbstractInstanceComm*) override;
static std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors);
private:
const std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance_;
const std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator_;
std::unordered_map<std::string, ft::Tensor>
convert_inputs(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors);
void allocateBuffer(const size_t request_batch_size, const size_t beam_width, const size_t session_len);
void freeBuffer();
int* d_input_ids_ = nullptr;
int* d_input_lengths_ = nullptr;
int* d_input_bad_words_ = nullptr;
int* d_input_stop_words_ = nullptr;
int* d_request_prompt_lengths_ = nullptr;
T* d_request_prompt_embedding_ = nullptr;
float* d_top_p_decay_ = nullptr;
float* d_top_p_min_ = nullptr;
int* d_top_p_reset_ids_ = nullptr;
int* d_output_ids_ = nullptr;
int* d_sequence_lengths_ = nullptr;
float* d_output_log_probs_ = nullptr;
float* d_cum_log_probs_ = nullptr;
uint32_t* h_total_output_lengths_ = nullptr;
std::exception_ptr h_exception_ = nullptr;
};
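// --- Hedged usage sketch (not part of the original header) -------------------
// A minimal single-GPU driver for the interfaces above, assuming one node and
// tensor/pipeline parallel sizes of 1. The model directory path and the
// "input_ids" / "input_lengths" tensor names are illustrative assumptions;
// consult LlamaTritonModel for the exact request schema.
#include <cuda_runtime.h>
#include <string>
#include <unordered_map>
#include <vector>

inline void llama_single_gpu_example(const std::string& model_dir)
{
    auto model       = AbstractTransformerModel::createLlamaModel(model_dir);
    auto nccl_params = model->createNcclParams(/*node_id=*/0);
    model->createSharedWeights(/*deviceId=*/0, /*rank=*/0);

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    auto instance = model->createModelInstance(/*deviceId=*/0, /*rank=*/0, stream, nccl_params);

    std::vector<int> ids{1, 2, 3};
    std::vector<int> lengths{3};
    auto inputs = std::make_shared<std::unordered_map<std::string, triton::Tensor>>();
    inputs->emplace("input_ids",
                    triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {1, ids.size()}, ids.data()});
    inputs->emplace("input_lengths",
                    triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {1, 1}, lengths.data()});

    auto outputs = instance->forward(inputs);  // blocking; returns a name -> Tensor map
    cudaStreamDestroy(stream);
}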
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/nccl_utils.h"
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node)
{
const int gpu_count = ft::getDeviceCount();
const int tensor_para_size = getTensorParaSize();
const int pipeline_para_size = getPipelineParaSize();
const int local_comm_size = multi_node ? gpu_count : tensor_para_size * pipeline_para_size;
ft::FT_CHECK(tensor_para_size > 0 && pipeline_para_size > 0);
ft::FT_CHECK(device_id_start + (int)local_comm_size <= gpu_count);
std::vector<ft::NcclUid> nccl_ids;
if (tensor_para_size > 1 || pipeline_para_size > 1) {
nccl_ids.resize(tensor_para_size + pipeline_para_size);
if (node_id == 0) {
for (uint32_t i = 0; i < nccl_ids.size(); i++) {
ft::ftNcclGetUniqueId(nccl_ids[i]);
}
}
for (size_t i = 0; i < nccl_ids.size(); i++) {
ft::mpi::bcast(&nccl_ids[i], sizeof(nccl_ids[i]), ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
}
}
std::vector<ft::NcclParam> tensor_para_params(local_comm_size);
std::vector<ft::NcclParam> pipeline_para_params(local_comm_size);
// Don't init comm when size == 1
if (tensor_para_size > 1) {
const auto group_id = ft::ftNcclNextGroupId();
ft::ftNcclGroupStart();
for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) {
int rank = node_id * gpu_count + gid - device_id_start;
int tensor_para_rank = rank % tensor_para_size;
int pipeline_para_rank = rank / tensor_para_size;
ft::NcclUid tensor_para_nccl_uid = nccl_ids[pipeline_para_rank];
ft::check_cuda_error(cudaSetDevice(gid));
ft::ftNcclCommInitRank(
tensor_para_params[gid - device_id_start], tensor_para_rank, tensor_para_size, tensor_para_nccl_uid);
tensor_para_params[gid - device_id_start].group_id_ = group_id;
}
ft::ftNcclGroupEnd();
}
if (pipeline_para_size > 1) {
const auto group_id = ft::ftNcclNextGroupId();
ft::ftNcclGroupStart();
for (int gid = device_id_start; gid < device_id_start + local_comm_size; gid++) {
int rank = node_id * gpu_count + gid - device_id_start;
int tensor_para_rank = rank % tensor_para_size;
int pipeline_para_rank = rank / tensor_para_size;
ft::NcclUid pipeline_para_nccl_uid = nccl_ids[pipeline_para_size + tensor_para_rank];
ft::check_cuda_error(cudaSetDevice(gid));
ft::ftNcclCommInitRank(pipeline_para_params[gid - device_id_start],
pipeline_para_rank,
pipeline_para_size,
pipeline_para_nccl_uid);
pipeline_para_params[gid - device_id_start].group_id_ = group_id;
}
ft::ftNcclGroupEnd();
}
return std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>(tensor_para_params, pipeline_para_params);
}
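// --- Illustrative note (not part of the original file) -----------------------
// The rank decomposition above places tensor-parallel ranks on adjacent GPUs.
// With tensor_para_size = 2 and pipeline_para_size = 2 on one 4-GPU node:
//   rank 0 -> tp 0, pp 0    rank 1 -> tp 1, pp 0
//   rank 2 -> tp 0, pp 1    rank 3 -> tp 1, pp 1
// A tiny self-contained check of that mapping:
#include <cstdio>

static void print_rank_mapping(int tensor_para_size, int pipeline_para_size)
{
    for (int rank = 0; rank < tensor_para_size * pipeline_para_size; ++rank) {
        std::printf("rank %d -> tp %d, pp %d\n",
                    rank, rank % tensor_para_size, rank / tensor_para_size);
    }
}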
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
#pragma once
#include <memory>
#include <sstream>
#include <sys/time.h>
#include <vector>
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace ft = fastertransformer;
namespace triton {
#ifdef USE_TRITONSERVER_DATATYPE
#include "triton/core/tritonbackend.h"
#include "triton/core/tritonserver.h"
#ifndef TRITONSERVER_API_VERSION_MAJOR
#error TRITONSERVER_API_VERSION_MAJOR Undefined!
#endif
#ifndef TRITONSERVER_API_VERSION_MINOR
#error TRITONSERVER_API_VERSION_MINOR Undefined!
#endif
#if (TRITONSERVER_API_VERSION_MAJOR == 1 && TRITONSERVER_API_VERSION_MINOR >= 17) \
|| (TRITONSERVER_API_VERSION_MAJOR > 1)
#define ENABLE_TRITON_BF16 1
#endif
typedef TRITONSERVER_DataType DataType;
typedef TRITONSERVER_MemoryType MemoryType;
constexpr TRITONSERVER_DataType TYPE_INVALID = TRITONSERVER_TYPE_INVALID;
constexpr TRITONSERVER_DataType TYPE_BOOL = TRITONSERVER_TYPE_BOOL;
constexpr TRITONSERVER_DataType TYPE_UINT8 = TRITONSERVER_TYPE_UINT8;
constexpr TRITONSERVER_DataType TYPE_UINT16 = TRITONSERVER_TYPE_UINT16;
constexpr TRITONSERVER_DataType TYPE_UINT32 = TRITONSERVER_TYPE_UINT32;
constexpr TRITONSERVER_DataType TYPE_UINT64 = TRITONSERVER_TYPE_UINT64;
constexpr TRITONSERVER_DataType TYPE_INT8 = TRITONSERVER_TYPE_INT8;
constexpr TRITONSERVER_DataType TYPE_INT16 = TRITONSERVER_TYPE_INT16;
constexpr TRITONSERVER_DataType TYPE_INT32 = TRITONSERVER_TYPE_INT32;
constexpr TRITONSERVER_DataType TYPE_INT64 = TRITONSERVER_TYPE_INT64;
constexpr TRITONSERVER_DataType TYPE_FP16 = TRITONSERVER_TYPE_FP16;
constexpr TRITONSERVER_DataType TYPE_FP32 = TRITONSERVER_TYPE_FP32;
constexpr TRITONSERVER_DataType TYPE_FP64 = TRITONSERVER_TYPE_FP64;
constexpr TRITONSERVER_DataType TYPE_BYTES = TRITONSERVER_TYPE_BYTES;
#ifdef ENABLE_TRITON_BF16
constexpr TRITONSERVER_DataType TYPE_BF16 = TRITONSERVER_TYPE_BF16;
#endif
constexpr TRITONSERVER_MemoryType MEMORY_CPU = TRITONSERVER_MEMORY_CPU;
constexpr TRITONSERVER_MemoryType MEMORY_CPU_PINNED = TRITONSERVER_MEMORY_CPU_PINNED;
constexpr TRITONSERVER_MemoryType MEMORY_GPU = TRITONSERVER_MEMORY_GPU;
#else
typedef ft::DataType DataType;
typedef ft::MemoryType MemoryType;
constexpr DataType TYPE_INVALID = ft::TYPE_INVALID;
constexpr DataType TYPE_BOOL = ft::TYPE_BOOL;
constexpr DataType TYPE_UINT8 = ft::TYPE_UINT8;
constexpr DataType TYPE_UINT16 = ft::TYPE_UINT16;
constexpr DataType TYPE_UINT32 = ft::TYPE_UINT32;
constexpr DataType TYPE_UINT64 = ft::TYPE_UINT64;
constexpr DataType TYPE_INT8 = ft::TYPE_INT8;
constexpr DataType TYPE_INT16 = ft::TYPE_INT16;
constexpr DataType TYPE_INT32 = ft::TYPE_INT32;
constexpr DataType TYPE_INT64 = ft::TYPE_INT64;
constexpr DataType TYPE_FP16 = ft::TYPE_FP16;
constexpr DataType TYPE_FP32 = ft::TYPE_FP32;
constexpr DataType TYPE_FP64 = ft::TYPE_FP64;
constexpr DataType TYPE_BYTES = ft::TYPE_BYTES;
constexpr DataType TYPE_BF16 = ft::TYPE_BF16;
constexpr MemoryType MEMORY_CPU = ft::MEMORY_CPU;
constexpr MemoryType MEMORY_CPU_PINNED = ft::MEMORY_CPU_PINNED;
constexpr MemoryType MEMORY_GPU = ft::MEMORY_GPU;
#endif
struct Tensor {
const MemoryType where;
const DataType type;
const std::vector<size_t> shape;
const void* data;
Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
where(_where), type(_type), shape(_shape), data(_data)
{
}
static ft::DataType convertTritonTypeToFt(DataType tmp_type)
{
ft::DataType ft_data_type;
switch (tmp_type) {
case TYPE_INVALID:
ft_data_type = ft::DataType::TYPE_INVALID;
break;
case TYPE_BOOL:
ft_data_type = ft::DataType::TYPE_BOOL;
break;
case TYPE_UINT8:
ft_data_type = ft::DataType::TYPE_UINT8;
break;
case TYPE_UINT16:
ft_data_type = ft::DataType::TYPE_UINT16;
break;
case TYPE_UINT32:
ft_data_type = ft::DataType::TYPE_UINT32;
break;
case TYPE_UINT64:
ft_data_type = ft::DataType::TYPE_UINT64;
break;
case TYPE_INT8:
ft_data_type = ft::DataType::TYPE_INT8;
break;
case TYPE_INT16:
ft_data_type = ft::DataType::TYPE_INT16;
break;
case TYPE_INT32:
ft_data_type = ft::DataType::TYPE_INT32;
break;
case TYPE_INT64:
ft_data_type = ft::DataType::TYPE_INT64;
break;
case TYPE_FP16:
ft_data_type = ft::DataType::TYPE_FP16;
break;
case TYPE_FP32:
ft_data_type = ft::DataType::TYPE_FP32;
break;
case TYPE_FP64:
ft_data_type = ft::DataType::TYPE_FP64;
break;
#ifdef ENABLE_TRITON_BF16
case TYPE_BF16:
ft_data_type = ft::DataType::TYPE_BF16;
break;
#endif
case TYPE_BYTES:
ft_data_type = ft::DataType::TYPE_BYTES;
break;
default:
FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(tmp_type));
break;
}
return ft_data_type;
}
ft::Tensor convertTritonTensorToFt()
{
ft::DataType ft_data_type = convertTritonTypeToFt(type);
ft::MemoryType ft_memory_type;
switch (where) {
case MEMORY_CPU:
ft_memory_type = ft::MemoryType::MEMORY_CPU;
break;
case MEMORY_CPU_PINNED:
ft_memory_type = ft::MemoryType::MEMORY_CPU_PINNED;
break;
case MEMORY_GPU:
ft_memory_type = ft::MemoryType::MEMORY_GPU;
break;
}
return ft::Tensor{ft_memory_type, ft_data_type, shape, data};
}
static Tensor convertFtTensorToTriton(ft::Tensor ft_tensor)
{
DataType triton_data_type;
switch (ft_tensor.type) {
case TYPE_INVALID:
triton_data_type = TYPE_INVALID;
break;
case TYPE_BOOL:
triton_data_type = TYPE_BOOL;
break;
case TYPE_UINT8:
triton_data_type = TYPE_UINT8;
break;
case TYPE_UINT16:
triton_data_type = TYPE_UINT16;
break;
case TYPE_UINT32:
triton_data_type = TYPE_UINT32;
break;
case TYPE_UINT64:
triton_data_type = TYPE_UINT64;
break;
case TYPE_INT8:
triton_data_type = TYPE_INT8;
break;
case TYPE_INT16:
triton_data_type = TYPE_INT16;
break;
case TYPE_INT32:
triton_data_type = TYPE_INT32;
break;
case TYPE_INT64:
triton_data_type = TYPE_INT64;
break;
case TYPE_FP16:
triton_data_type = TYPE_FP16;
break;
case TYPE_FP32:
triton_data_type = TYPE_FP32;
break;
case TYPE_FP64:
triton_data_type = TYPE_FP64;
break;
#ifdef ENABLE_TRITON_BF16
case TYPE_BF16:
triton_data_type = TYPE_BF16;
break;
#endif
case TYPE_BYTES:
triton_data_type = TYPE_BYTES;
break;
default:
FT_CHECK_WITH_INFO(false, "Unknown data type with type id: " + std::to_string(ft_tensor.type));
break;
}
MemoryType triton_memory_type;
switch (ft_tensor.where) {
case MEMORY_CPU:
triton_memory_type = MEMORY_CPU;
break;
case MEMORY_CPU_PINNED:
triton_memory_type = MEMORY_CPU_PINNED;
break;
case MEMORY_GPU:
triton_memory_type = MEMORY_GPU;
break;
}
return Tensor{triton_memory_type, triton_data_type, ft_tensor.shape, ft_tensor.data};
}
};
} // namespace triton
using triton_stream_cb_t = void(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>, void*);
struct AbstractTransformerModel;
struct AbstractTransformerModelInstance;
struct AbstractTransformerModelInstance {
virtual std::shared_ptr<std::vector<triton::Tensor>>
forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) = 0;
virtual std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors) = 0;
virtual std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors, ft::AbstractInstanceComm*)
{
return forward(input_tensors);
}
void registerCallback(triton_stream_cb_t* cb, void* ctx)
{
stream_cb_ = cb;
stream_ctx_ = ctx;
}
void unRegisterCallback()
{
stream_cb_ = nullptr;
stream_ctx_ = nullptr;
}
triton_stream_cb_t* stream_cb_ = nullptr;
void* stream_ctx_ = nullptr;
};
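// --- Hedged usage sketch (not part of the original header) -------------------
// Registering a streaming callback on a model instance. The callback must match
// triton_stream_cb_t; whether and when the backend invokes it with partial
// results depends on the concrete model implementation. Names are illustrative.
static void on_partial_output(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> outputs,
                              void* ctx)
{
    (void)outputs;  // e.g. decode the newest tokens and push them to a client
    (void)ctx;
}

inline void attach_streaming(AbstractTransformerModelInstance& instance, void* user_ctx)
{
    instance.registerCallback(&on_partial_output, user_ctx);
    // ... run forward(); partial outputs may be delivered through the callback ...
    instance.unRegisterCallback();
}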
struct AbstractTransformerModel {
static std::shared_ptr<AbstractTransformerModel> createLlamaModel(std::string model_dir);
virtual std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
createNcclParams(const int node_id, const int device_id_start = 0, const bool multi_node = false);
virtual void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
int world_size) = 0;
virtual std::unique_ptr<ft::AbstractInstanceComm> createInstanceComm(int size)
{
return nullptr;
}
virtual std::unique_ptr<AbstractTransformerModelInstance>
createModelInstance(int deviceId,
int rank,
cudaStream_t stream,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) = 0;
virtual void createSharedWeights(int deviceId, int rank) = 0;
virtual std::string toString() = 0;
virtual int getTensorParaSize() = 0;
virtual int getPipelineParaSize() = 0;
};
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/Tensor.h"
namespace ft = fastertransformer;
template<typename T>
void move_tensor_H2D(const triton::Tensor& tensor,
T*& d_ptr,
const std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>>* allocator)
{
if (tensor.where == triton::MEMORY_GPU) {
return;
}
size_t tensor_size = 1;
for (auto t : tensor.shape) {
tensor_size *= t;
}
cudaStream_t stream = (*allocator)->returnStream();
d_ptr = (T*)((*allocator)->reMalloc(d_ptr, sizeof(T) * tensor_size, false));
ft::check_cuda_error(cudaMemcpyAsync(d_ptr, (T*)tensor.data, sizeof(T) * tensor_size, cudaMemcpyDefault, stream));
}
template<typename T>
ft::Tensor as_GPU_tensor(const triton::Tensor& tensor, T* d_ptr)
{
return ft::Tensor{ft::MEMORY_GPU,
triton::Tensor::convertTritonTypeToFt(tensor.type),
tensor.shape,
tensor.where == triton::MEMORY_CPU ? d_ptr : tensor.data};
}
inline ft::Tensor as_CPU_tensor(const triton::Tensor& tensor)
{
return ft::Tensor{ft::MEMORY_CPU, triton::Tensor::convertTritonTypeToFt(tensor.type), tensor.shape, tensor.data};
}
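// --- Hedged usage sketch (not part of the original header) -------------------
// Staging a host-resident triton::Tensor onto the GPU with the helpers above.
// The allocator construction mirrors ft::Allocator<CUDA>(device_id); the buffer
// name is illustrative.
inline void stage_input_example(const triton::Tensor& host_ids)
{
    auto allocator =
        std::make_unique<ft::Allocator<ft::AllocatorType::CUDA>>(/*device_id=*/0);

    int* d_ids = nullptr;
    move_tensor_H2D(host_ids, d_ids, &allocator);          // no-op when already on GPU
    ft::Tensor gpu_view = as_GPU_tensor(host_ids, d_ids);  // same shape/type, GPU pointer
    // ... launch work that consumes gpu_view, then release the staging buffer:
    allocator->free((void**)&d_ids);
}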
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_subdirectory(gemm_test)
add_library(cuda_utils STATIC cuda_utils.cc)
set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cuda_utils PUBLIC -lcudart)
add_library(logger STATIC logger.cc)
set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(logger PUBLIC -lcudart)
add_library(cublasAlgoMap STATIC cublasAlgoMap.cc)
set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasAlgoMap PUBLIC -lcublas -lcudart -lcurand cuda_utils logger)
add_library(cublasMMWrapper STATIC cublasMMWrapper.cc)
set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasMMWrapper PUBLIC -lcublas -lcudart -lcurand cublasAlgoMap cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(cublasMMWrapper PUBLIC -lcusparse -lcusparseLt)
endif()
add_library(word_list STATIC word_list.cc)
set_property(TARGET word_list PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(nvtx_utils STATIC nvtx_utils.cc)
set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(nvtx_utils PUBLIC -lnvToolsExt)
add_library(memory_utils STATIC memory_utils.cu)
set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(memory_utils PUBLIC cuda_utils logger tensor)
add_library(mpi_utils STATIC mpi_utils.cc)
set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if (BUILD_MULTI_GPU)
target_link_libraries(mpi_utils PUBLIC mpi logger)
endif()
add_library(nccl_utils STATIC nccl_utils.cc)
set_property(TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if (BUILD_MULTI_GPU)
target_link_libraries(nccl_utils PUBLIC ${NCCL_LIBRARIES} mpi_utils logger)
endif()
add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasINT8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand cublasAlgoMap cublasMMWrapper cuda_utils logger)
if(ENABLE_FP8)
add_library(cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu)
set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(cublasFP8MMWrapper PUBLIC -lcublasLt -lcudart -lcurand
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils)
endif()
add_library(custom_ar_comm STATIC custom_ar_comm.cc)
set_property(TARGET custom_ar_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(custom_ar_comm PUBLIC custom_ar_kernels memory_utils cuda_utils logger)
add_library(gemm STATIC gemm.cc)
set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gemm PUBLIC
-lcublas -lcublasLt -lcudart -lcurand
cublasAlgoMap memory_utils cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gemm PUBLIC -lcusparse -lcusparseLt)
endif()
add_library(cuda_fp8_utils STATIC cuda_fp8_utils.cu)
set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(tensor STATIC Tensor.cc)
set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tensor PUBLIC cuda_utils logger)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace fastertransformer {
enum IA3_config {
KEY_ADAPTER = 1 << 0,
VALUE_ADAPTER = 1 << 1,
MLP_ADAPTER = 1 << 2,
};
static constexpr IA3_config IA3_NONE = static_cast<IA3_config>(0);
static constexpr size_t IA3_ADAPTER_MAX_NUM_ENCODER = 3;
static constexpr size_t IA3_ADAPTER_MAX_NUM_DECODER = 5;
static inline IA3_config operator&(IA3_config x, IA3_config y)
{
return static_cast<IA3_config>(static_cast<int>(x) & static_cast<int>(y));
}
static inline IA3_config operator|(IA3_config x, IA3_config y)
{
return static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
}
static inline IA3_config& operator|=(IA3_config& x, IA3_config y)
{
return x = static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
}
} // namespace fastertransformer
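// --- Illustrative use of the IA3_config bit flags above (not part of the original header) --
inline bool ia3_example()
{
    using namespace fastertransformer;
    IA3_config cfg = IA3_NONE;
    cfg |= KEY_ADAPTER;
    cfg |= MLP_ADAPTER;
    // Bitwise tests: VALUE_ADAPTER was not set, MLP_ADAPTER was.
    bool has_value = (cfg & VALUE_ADAPTER) != IA3_NONE;  // false
    bool has_mlp   = (cfg & MLP_ADAPTER) != IA3_NONE;    // true
    return has_mlp && !has_value;
}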
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "stdlib.h"
namespace fastertransformer {
#define ACTIVATION_AMAX_NUM 72
#define INT8O_GEMM_NUM 8
#define TRT_AMAX_NUM 3
#define SCALE_RESERVE_NUM 21
struct ScaleList {
// Part 1 -- 72:
// First 72 are for activation amaxs. For each activation amax, there are 4 values: amax, amax/127.0f,
// amax/127.0f/127.0f, 127.0f/amax -- input_amax 0-3 , Q_aftergemm_amax 4-7, Qbias_amax 8-11, K_aftergemm_amax
// 12-15, Kbias_amax 16-19, V_aftergemm_amax 20-23, Vbias_amax 24-27, bmm1_amax 28-31, Softmax_amax 32-35,
// bmm2_amax 36-39, Proj_aftergemm_scale 40-43, ProjBiasNorm_amax 44-47, FC1_aftergemm_amax 48-51, F1Bias_amax
// 52-55, FC2_aftergemm_amax 56-59, F2BiasNorm_amax 60-63, reserve 64-71
// Part 2 -- 9*hidden_dim:
// Kernel amaxs, for each kernel amax list, there are output_channel values : query_weight_amax_list,
// key_weight_amax_list, value_weight_amax_list, proj_weight_amax_list, FC1_weight_amax_list, FC2_weight_amax_list
// Part 3 -- 8:
// Int8 gemm deQFactor list (8 values): Q_deQ_scale, K_deQ_scale, V_deQ_scale, bmm1_deQ_scale, bmm2_deQ_scale,
// FC0_deQ_scale, FC1_deQ_scale, FC2_deQ_scale
// Part 4 -- 3:
// Amax used in trt fused mha kernel (3 values) : QKVbias_amax, Softmax_amax, bmm2_amax
// Part 5 -- 21: reserve (an indexing sketch follows this header)
const float* d_scale_list_ = nullptr;
const float* h_scale_list_ = nullptr;
size_t size_ = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM + TRT_AMAX_NUM;
size_t p2_offset_ = ACTIVATION_AMAX_NUM;
size_t p3_offset_ = ACTIVATION_AMAX_NUM + 9 * 768;
size_t p4_offset_ = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM;
};
} // namespace fastertransformer
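// --- Hedged helper sketch (not part of the original header) ------------------
// How the offsets above index into the flat scale list; the 9 * 768 term assumes
// hidden_dim = 768 exactly as hard-coded in size_, p3_offset_ and p4_offset_.
inline const float* scale_part_ptr(const fastertransformer::ScaleList& s, int part)
{
    switch (part) {
        case 2: return s.d_scale_list_ + s.p2_offset_;  // kernel amax lists
        case 3: return s.d_scale_list_ + s.p3_offset_;  // INT8 GEMM deQ factors
        case 4: return s.d_scale_list_ + s.p4_offset_;  // TRT fused-MHA amaxs
        default: return s.d_scale_list_;                // part 1: activation amaxs
    }
}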
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <dirent.h>
#include <numeric>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
Tensor::Tensor():
// a none tensor.
where(MEMORY_CPU),
type(TYPE_INVALID),
shape({}),
data(nullptr),
offsets({}) // only records the offset
{
}
Tensor::Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data):
where(_where), type(_type), shape(_shape), data(_data)
{
}
Tensor::Tensor(const MemoryType _where,
const DataType _type,
const std::vector<size_t> _shape,
const void* _data,
const std::vector<size_t> _offset):
where(_where), type(_type), shape(_shape), data(_data), offsets(_offset)
{
}
void Tensor::parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data)
{
const char magic[] = "\x93"
"NUMPY";
char magic_test[sizeof(magic)] = "\0";
size_t n_elems = fread((void*)magic_test, sizeof(char), sizeof(magic) - 1, f_ptr);
if (n_elems != sizeof(magic) - 1 || std::string(magic) != std::string(magic_test)) {
throw std::runtime_error("Could read magic token in NPY file");
}
uint8_t npy_major = 0;
uint8_t npy_minor = 0;
n_elems = fread((void*)&npy_major, sizeof(uint8_t), 1, f_ptr);
n_elems += fread((void*)&npy_minor, sizeof(uint8_t), 1, f_ptr);
if (npy_major == 1) {
uint16_t header_len_u16 = 0;
n_elems = fread((void*)&header_len_u16, sizeof(uint16_t), 1, f_ptr);
header_len = header_len_u16;
}
else if (npy_major == 2) {
uint32_t header_len_u32 = 0;
n_elems = fread((void*)&header_len_u32, sizeof(uint32_t), 1, f_ptr);
header_len = header_len_u32;
}
else {
throw std::runtime_error("Unsupported npy version: " + std::to_string(npy_major));
}
start_data = 8 + 2 * npy_major + header_len;
}
int Tensor::parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector<size_t>& shape)
{
char* header_c = (char*)malloc(header_len * sizeof(char));
size_t n_elems = fread((void*)header_c, sizeof(char), header_len, f_ptr);
if (n_elems != header_len) {
free(header_c);
return -1;
}
std::string header(header_c, header_len);
free(header_c);
size_t start, end;
start = header.find("'descr'") + 7;
start = header.find("'", start);
end = header.find("'", start + 1);
type = typeFromNumpyDesc(header.substr(start + 1, end - start - 1));
start = header.find("'fortran_order'") + 15;
start = header.find(":", start);
end = header.find(",", start + 1);
if (header.substr(start + 1, end - start - 1).find("False") == std::string::npos) {
throw std::runtime_error("Unsupported value for fortran_order while reading npy file");
}
start = header.find("'shape'") + 7;
start = header.find("(", start);
end = header.find(")", start + 1);
std::istringstream shape_stream(header.substr(start + 1, end - start - 1));
std::string token;
shape.clear();
while (std::getline(shape_stream, token, ',')) {
if (token.find_first_not_of(' ') == std::string::npos) {
break;
}
shape.push_back(std::stoul(token));
}
return 0;
}
Tensor Tensor::loadNpy(const std::string& npy_file, const MemoryType where)
{
DataType type;
std::vector<size_t> shape;
FILE* f_ptr = fopen(npy_file.c_str(), "rb");
if (f_ptr == nullptr) {
throw std::runtime_error("Could not open file " + npy_file);
}
uint32_t header_len, start_data;
parseNpyIntro(f_ptr, header_len, start_data);
parseNpyHeader(f_ptr, header_len, type, shape);
const size_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
void* data_cpu = malloc(size * Tensor::getTypeSize(type));
void* data = data_cpu;
size_t n_elems = fread(data_cpu, Tensor::getTypeSize(type), size, f_ptr);
FT_CHECK_WITH_INFO(n_elems == size, "reading tensor failed");
if (where == MEMORY_GPU) {
cudaMalloc(&data, size * Tensor::getTypeSize(type));
cudaMemcpy(data, data_cpu, size * Tensor::getTypeSize(type), cudaMemcpyHostToDevice);
free(data_cpu);
}
fclose(f_ptr);
return Tensor(where, type, shape, data);
}
size_t Tensor::size() const
{
if (data == nullptr || shape.size() == 0) {
return 0;
}
return std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>());
}
size_t Tensor::sizeBytes() const
{
return size() * Tensor::getTypeSize(type);
}
std::string Tensor::whereToString() const
{
static const std::unordered_map<MemoryType, std::string> mem_to_string{
{MEMORY_CPU, "CPU"}, {MEMORY_CPU_PINNED, "CPU_PINNED"}, {MEMORY_GPU, "GPU"}};
return mem_to_string.at(where);
}
std::string Tensor::toString() const
{
std::string memtype_str = whereToString();
static const std::unordered_map<DataType, std::string> type_to_string{
{TYPE_BOOL, "BOOL"},
{TYPE_UINT8, "UINT8"},
{TYPE_UINT16, "UINT16"},
{TYPE_UINT32, "UINT32"},
{TYPE_UINT64, "UINT64"},
{TYPE_INT8, "INT8"},
{TYPE_INT16, "INT16"},
{TYPE_INT32, "INT32"},
{TYPE_INT64, "INT64"},
{TYPE_BF16, "BF16"},
{TYPE_FP16, "FP16"},
{TYPE_FP32, "FP32"},
{TYPE_FP64, "FP64"},
{TYPE_BYTES, "BYTES"},
{TYPE_INVALID, "INVALID"},
{TYPE_FP8_E4M3, "E4M3"},
{TYPE_VOID, "VOID"},
};
return fmtstr("Tensor[where=%s, type=%s, shape=%s, data=%p]",
memtype_str.c_str(),
type_to_string.at(type).c_str(),
vec2str(shape).c_str(),
data);
}
DataType Tensor::typeFromNumpyDesc(std::string type)
{
static const std::unordered_map<std::string, DataType> type_map{{"?", TYPE_BOOL},
{"b", TYPE_BYTES},
{"u1", TYPE_UINT8},
{"u2", TYPE_UINT16},
{"u4", TYPE_UINT32},
{"u8", TYPE_UINT64},
{"i1", TYPE_INT8},
{"i2", TYPE_INT16},
{"i4", TYPE_INT32},
{"i8", TYPE_INT64},
{"f2", TYPE_FP16},
{"f4", TYPE_FP32},
{"f8", TYPE_FP64}};
return type_map.at(type);
}
size_t Tensor::getTypeSize(DataType type)
{
static const std::unordered_map<DataType, size_t> type_map{{TYPE_BOOL, sizeof(bool)},
{TYPE_BYTES, sizeof(char)},
{TYPE_UINT8, sizeof(uint8_t)},
{TYPE_UINT16, sizeof(uint16_t)},
{TYPE_UINT32, sizeof(uint32_t)},
{TYPE_UINT64, sizeof(uint64_t)},
{TYPE_INT8, sizeof(int8_t)},
{TYPE_INT16, sizeof(int16_t)},
{TYPE_INT32, sizeof(int32_t)},
{TYPE_INT64, sizeof(int64_t)},
#ifdef ENABLE_BF16
{TYPE_BF16, sizeof(__nv_bfloat16)},
#endif
#ifdef ENABLE_FP8
{TYPE_FP8_E4M3, sizeof(__nv_fp8_e4m3)},
#endif
{TYPE_FP16, sizeof(half)},
{TYPE_FP32, sizeof(float)},
{TYPE_FP64, sizeof(double)}};
return type_map.at(type);
}
std::string Tensor::getNumpyTypeDesc(DataType type) const
{
static const std::unordered_map<DataType, std::string> type_map{{TYPE_INVALID, "x"},
{TYPE_BOOL, "?"},
{TYPE_BYTES, "b"},
{TYPE_UINT8, "u1"},
{TYPE_UINT16, "u2"},
{TYPE_UINT32, "u4"},
{TYPE_UINT64, "u8"},
{TYPE_INT8, "i1"},
{TYPE_INT16, "i2"},
{TYPE_INT32, "i4"},
{TYPE_INT64, "i8"},
{TYPE_FP16, "f2"},
{TYPE_FP32, "f4"},
{TYPE_FP64, "f8"}};
if (type == TYPE_BF16) {
FT_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
"support bfloat16 as of now, it will be properly extended if numpy supports. "
"Please refer for the discussions https://github.com/numpy/numpy/issues/19808.");
}
return type_map.count(type) > 0 ? type_map.at(type) : "x";
}
void Tensor::saveNpy(const std::string& filename) const
{
// Save tensor to NPY 1.0 format (see https://numpy.org/neps/nep-0001-npy-format.html)
void* cpu_data = (void*)data;
bool is_data_temp = false;
size_t tensor_size = size();
if (where == MemoryType::MEMORY_GPU) {
cpu_data = malloc(tensor_size * Tensor::getTypeSize(type));
is_data_temp = true;
cudaDeviceSynchronize();
cudaMemcpy(cpu_data, data, tensor_size * Tensor::getTypeSize(type), cudaMemcpyDeviceToHost);
}
const char magic[] = "\x93"
"NUMPY";
const uint8_t npy_major = 1;
const uint8_t npy_minor = 0;
std::stringstream header_stream;
header_stream << "{'descr': '" << getNumpyTypeDesc(type) << "', 'fortran_order': False, 'shape': (";
for (size_t i = 0; i < shape.size(); ++i) {
header_stream << shape[i];
if (i + 1 < shape.size() || shape.size() == 1) {
header_stream << ", ";
}
}
header_stream << ")}";
int base_length = 6 + 4 + header_stream.str().size();
int pad_length = 16 * ((base_length + 1 + 15) / 16); // Take ceiling of base_length + 1 (for '\n' ending)
for (int i = 0; i < pad_length - base_length; ++i) {
header_stream << ((i == pad_length - base_length - 1) ? "\n" : "\x20");
}
std::string header = header_stream.str();
const uint16_t header_len = header.size();
FILE* f_ptr = fopen(filename.c_str(), "wb");
FT_CHECK_WITH_INFO(f_ptr != nullptr, fmtstr("Unable to open %s for writing.\n", filename.c_str()));
fwrite(magic, sizeof(char), sizeof(magic) - 1, f_ptr);
fwrite(&npy_major, sizeof(uint8_t), 1, f_ptr);
fwrite(&npy_minor, sizeof(uint8_t), 1, f_ptr);
fwrite(&header_len, sizeof(uint16_t), 1, f_ptr);
fwrite(header.c_str(), sizeof(char), header_len, f_ptr);
fwrite(cpu_data, Tensor::getTypeSize(type), tensor_size, f_ptr);
fclose(f_ptr);
if (is_data_temp) {
free(cpu_data);
}
}
Tensor Tensor::slice(std::vector<size_t> shape, size_t offset) const
{
if (this->data != nullptr) {
size_t n_elts = this->size();
size_t n_sliced_elts = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
FT_CHECK_WITH_INFO(
n_sliced_elts + offset <= n_elts,
fmtstr("The number (%ld) of elements of sliced tensor exceeds that (%ld) of the original tensor",
n_sliced_elts + offset,
n_elts));
}
return Tensor(this->where, this->type, shape, this->getPtrWithOffset(offset));
}
TensorMap::TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map)
{
for (auto& kv : tensor_map) {
if (isValid(kv.second)) {
insert(kv.first, kv.second);
}
else {
FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
}
}
}
TensorMap::TensorMap(const std::vector<Tensor>& tensor_map)
{
for (size_t i = 0; i < tensor_map.size(); i++) {
insert(std::to_string(i), tensor_map[i]);
}
}
TensorMap::TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tensor_map)
{
for (auto& pair : tensor_map) {
if (isValid(pair.second)) {
insert(pair.first, pair.second);
}
else {
FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
}
}
}
TensorMap::~TensorMap()
{
tensor_map_.clear();
}
std::vector<std::string> TensorMap::keys() const
{
std::vector<std::string> key_names;
for (auto& kv : tensor_map_) {
key_names.push_back(kv.first);
}
return key_names;
}
std::string TensorMap::toString()
{
std::stringstream ss;
ss << "{";
std::vector<std::string> key_names = keys();
for (size_t i = 0; i < tensor_map_.size(); ++i) {
ss << key_names[i] << ": " << at(key_names[i]).toString();
if (i < tensor_map_.size() - 1) {
ss << ", ";
}
}
ss << "}";
return ss.str();
}
TensorMap TensorMap::fromNpyFolder(const std::string& base_folder)
{
DIR* dir_p = opendir(base_folder.c_str());
FT_CHECK_WITH_INFO(dir_p != nullptr, fmtstr("Could not open folder %s. ", base_folder.c_str()));
struct dirent* dp;
TensorMap ret_tensor;
while ((dp = readdir(dir_p)) != nullptr) {
std::string filename(dp->d_name);
size_t len = filename.length();
if (len < 4 || filename.compare(len - 4, 4, ".npy")) {
continue;
}
size_t pos = filename.find('-');
FT_CHECK_WITH_INFO(pos != std::string::npos, fmtstr("Invalid filename: %s\n", filename.c_str()));
MemoryType where;
if (filename.compare(0, pos, "GPU") == 0) {
where = MEMORY_GPU;
}
else if (filename.compare(0, pos, "CPU") == 0) {
where = MEMORY_CPU;
}
else if (filename.compare(0, pos, "CPU_PINNED") == 0) {
where = MEMORY_CPU_PINNED;
}
else {
FT_CHECK_WITH_INFO(false, fmtstr("Invalid filename: %s\n", filename.c_str()));
}
std::string key = filename.substr(pos + 1, len - pos - 5);
ret_tensor.tensor_map_.insert({key, Tensor::loadNpy(base_folder + "/" + filename, where)});
}
closedir(dir_p);
return ret_tensor;
}
void TensorMap::saveNpy(const std::string& base_folder)
{
mode_t mode_0755 = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
int ret = mkdir(base_folder.c_str(), mode_0755);
FT_CHECK_WITH_INFO(ret == 0 || errno == EEXIST, fmtstr("Could not create folder %s.\n", base_folder.c_str()));
for (const auto& item : tensor_map_) {
item.second.saveNpy(base_folder + "/" + item.second.whereToString() + "-" + item.first + ".npy");
}
}
} // namespace fastertransformer
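// --- Hedged usage sketch (not part of the original file) ---------------------
// Round-tripping a small FP32 tensor through the NPY helpers implemented above.
// The file path is illustrative; loadNpy mallocs the host buffer, so the caller
// owns it.
static void npy_round_trip_example()
{
    using namespace fastertransformer;
    std::vector<float> host(6, 1.0f);
    Tensor t{MEMORY_CPU, TYPE_FP32, {2, 3}, host.data()};
    t.saveNpy("/tmp/ft_example.npy");

    Tensor back = Tensor::loadNpy("/tmp/ft_example.npy", MEMORY_CPU);
    float first = back.getVal<float>(0);  // 1.0f
    (void)first;
    free(const_cast<void*>(back.data));   // buffer allocated by loadNpy
}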
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <dirent.h>
#include <numeric>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
typedef enum datatype_enum
{
TYPE_INVALID,
TYPE_BOOL,
TYPE_UINT8,
TYPE_UINT16,
TYPE_UINT32,
TYPE_UINT64,
TYPE_INT8,
TYPE_INT16,
TYPE_INT32,
TYPE_INT64,
TYPE_FP16,
TYPE_FP32,
TYPE_FP64,
TYPE_BYTES,
TYPE_BF16,
TYPE_FP8_E4M3,
TYPE_STR,
TYPE_VOID,
} DataType;
template<typename T>
DataType getTensorType()
{
if (std::is_same<T, float>::value || std::is_same<T, const float>::value) {
return TYPE_FP32;
}
else if (std::is_same<T, half>::value || std::is_same<T, const half>::value) {
return TYPE_FP16;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value || std::is_same<T, const __nv_bfloat16>::value) {
return TYPE_BF16;
}
#endif
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value || std::is_same<T, const __nv_fp8_e4m3>::value) {
return TYPE_FP8_E4M3;
}
#endif
else if (std::is_same<T, int>::value || std::is_same<T, const int>::value) {
return TYPE_INT32;
}
else if (std::is_same<T, int8_t>::value || std::is_same<T, const int8_t>::value) {
return TYPE_INT8;
}
else if (std::is_same<T, uint>::value || std::is_same<T, const uint>::value) {
return TYPE_UINT32;
}
else if (std::is_same<T, unsigned long long int>::value || std::is_same<T, const unsigned long long int>::value) {
return TYPE_UINT64;
}
else if (std::is_same<T, bool>::value || std::is_same<T, const bool>::value) {
return TYPE_BOOL;
}
else if (std::is_same<T, char>::value || std::is_same<T, const char>::value) {
return TYPE_BYTES;
}
else {
return TYPE_INVALID;
}
}
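// --- Illustrative check of the mapping above (not part of the original header) --
inline bool tensor_type_mapping_example()
{
    return getTensorType<float>() == TYPE_FP32 && getTensorType<half>() == TYPE_FP16
           && getTensorType<int>() == TYPE_INT32 && getTensorType<char>() == TYPE_BYTES
           && getTensorType<void*>() == TYPE_INVALID;  // unlisted types map to TYPE_INVALID
}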
typedef enum memorytype_enum
{
MEMORY_CPU,
MEMORY_CPU_PINNED,
MEMORY_GPU
} MemoryType;
struct Tensor {
const MemoryType where;
const DataType type;
const std::vector<size_t> shape;
const void* data; // TODO(bhseuh) modify from const void* to void* const
const std::vector<size_t> offsets = std::vector<size_t>{};
Tensor();
Tensor(const MemoryType _where, const DataType _type, const std::vector<size_t> _shape, const void* _data);
Tensor(const MemoryType _where,
const DataType _type,
const std::vector<size_t> _shape,
const void* _data,
const std::vector<size_t> _offset);
size_t size() const;
size_t sizeBytes() const;
std::string whereToString() const;
std::string toString() const;
std::string getNumpyTypeDesc(DataType type) const;
void saveNpy(const std::string& filename) const;
static Tensor loadNpy(const std::string& npy_file, const MemoryType where);
static DataType typeFromNumpyDesc(std::string type);
static size_t getTypeSize(DataType type);
template<typename T>
inline T getVal(size_t index) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(where == MEMORY_CPU);
FT_CHECK(data != nullptr);
FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size");
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
return ((T*)data)[index];
}
template<typename T>
inline T getVal() const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
return getVal<T>(0);
}
template<typename T>
inline T* getPtr() const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getPtr with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
return (T*)data;
}
inline void* getPtrWithOffset(size_t offset) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (data == nullptr) {
return (void*)data;
}
else {
FT_CHECK_WITH_INFO(offset < size(), "offset is larger than buffer size");
return (void*)((char*)data + offset * Tensor::getTypeSize(type));
}
}
template<typename T>
inline T* getPtrWithOffset(size_t offset) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
if (data == nullptr) {
return (T*)data;
}
else {
FT_CHECK_WITH_INFO(offset < size(),
fmtstr("offset (%lu) is larger than buffer size (%lu)", offset, size()));
return ((T*)data) + offset;
}
}
template<typename T>
T max() const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"max() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
size_t max_idx = 0;
T max_val = getVal<T>(max_idx);
for (size_t i = 1; i < size(); ++i) {
T val = getVal<T>(i);
if (val > max_val) {
max_idx = i;
max_val = val;
}
}
return max_val;
}
template<typename T>
T min() const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"min() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
size_t min_idx = 0;
T min_val = getVal<T>(min_idx);
for (size_t i = 1; i < size(); ++i) {
T val = getVal<T>(i);
if (val < min_val) {
min_idx = i;
min_val = val;
}
}
return min_val;
}
template<typename T>
T any(T val) const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"any() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
for (size_t i = 0; i < size(); ++i) {
if (getVal<T>(i) == val) {
return true;
}
}
return false;
}
template<typename T>
T all(T val) const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
FT_CHECK_WITH_INFO(shape.size() > 0 && data != nullptr, "Should be a non-empty tensor.");
FT_CHECK_WITH_INFO(where == MEMORY_CPU || where == MEMORY_CPU_PINNED,
"all() supports MEMORY_CPU or MEMORY_CPU_PINNED tensor.");
for (size_t i = 0; i < size(); ++i) {
if (getVal<T>(i) != val) {
return false;
}
}
return true;
}
void updateShape(size_t idx, size_t val)
{
// TODO: find a better way to update the shape
std::vector<size_t>& shape_ref = const_cast<std::vector<size_t>&>(shape);
shape_ref[idx] = val;
}
Tensor slice(std::vector<size_t> shape, size_t offset = 0) const;
private:
static void parseNpyIntro(FILE*& f_ptr, uint32_t& header_len, uint32_t& start_data);
static int parseNpyHeader(FILE*& f_ptr, uint32_t header_len, DataType& type, std::vector<size_t>& shape);
};
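// --- Hedged usage sketch (not part of the original header) -------------------
// Tensor::slice returns a view that aliases the parent's storage (no copy).
// Here: rows 1..2 of a 4 x 8 FP32 CPU tensor; the shape is an illustrative assumption.
inline Tensor slice_rows_example(const Tensor& logits /* shape {4, 8}, MEMORY_CPU */)
{
    Tensor rows_1_2 = logits.slice({2, 8}, /*offset=*/1 * 8);  // elements [8, 24)
    float first = rows_1_2.getVal<float>(0);                   // parent element [1][0]
    (void)first;
    return rows_1_2;
}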
class TensorMap {
private:
std::unordered_map<std::string, Tensor> tensor_map_;
inline bool isValid(const Tensor& tensor)
{
return tensor.size() > 0 && tensor.data != nullptr;
}
public:
TensorMap() = default;
TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map);
TensorMap(const std::vector<Tensor>& tensor_map);
TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tensor_map);
~TensorMap();
inline size_t size() const
{
return tensor_map_.size();
}
inline bool isExist(const std::string& key) const
{
FT_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
return tensor_map_.find(key) != tensor_map_.end();
}
std::vector<std::string> keys() const;
inline void insert(const std::string& key, const Tensor& value)
{
FT_CHECK_WITH_INFO(!isExist(key), fmtstr("Duplicated key %s", key.c_str()));
FT_CHECK_WITH_INFO(isValid(value), fmtstr("A none tensor or nullptr is not allowed (key is %s)", key.c_str()));
tensor_map_.insert({key, value});
}
inline void insertIfValid(const std::string& key, const Tensor& value)
{
if (isValid(value)) {
insert({key, value});
}
}
inline void insert(std::pair<std::string, Tensor> p)
{
tensor_map_.insert(p);
}
// prevent converting int or size_t to string automatically
Tensor at(int tmp) = delete;
Tensor at(size_t tmp) = delete;
inline Tensor& at(const std::string& key)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key);
}
inline Tensor at(const std::string& key) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key);
}
inline Tensor& at(const std::string& key, Tensor& default_tensor)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
inline Tensor at(const std::string& key, Tensor& default_tensor) const
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
inline Tensor& at(const std::string& key, Tensor&& default_tensor)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
inline Tensor at(const std::string& key, Tensor&& default_tensor) const
{
if (isExist(key)) {
return tensor_map_.at(key);
}
return default_tensor;
}
template<typename T>
inline T getVal(const std::string& key) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getVal<T>();
}
template<typename T>
inline T getVal(const std::string& key, T default_value) const
{
if (isExist(key)) {
return tensor_map_.at(key).getVal<T>();
}
return default_value;
}
template<typename T>
inline T getValWithOffset(const std::string& key, size_t index) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getVal<T>(index);
}
template<typename T>
inline T getValWithOffset(const std::string& key, size_t index, T default_value) const
{
if (isExist(key)) {
return tensor_map_.at(key).getVal<T>(index);
}
return default_value;
}
template<typename T>
inline T* getPtr(const std::string& key) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getPtr<T>();
}
template<typename T>
inline T* getPtr(const std::string& key, T* default_ptr) const
{
if (isExist(key)) {
return tensor_map_.at(key).getPtr<T>();
}
return default_ptr;
}
template<typename T>
inline T* getPtrWithOffset(const std::string& key, size_t index) const
{
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
vec2str(keys()).c_str()));
return tensor_map_.at(key).getPtrWithOffset<T>(index);
}
template<typename T>
inline T* getPtrWithOffset(const std::string& key, size_t index, T* default_ptr) const
{
if (isExist(key)) {
return tensor_map_.at(key).getPtrWithOffset<T>(index);
}
return default_ptr;
}
inline std::unordered_map<std::string, Tensor> getMap() const
{
return tensor_map_;
}
inline std::unordered_map<std::string, Tensor>::iterator begin()
{
return tensor_map_.begin();
}
inline std::unordered_map<std::string, Tensor>::iterator end()
{
return tensor_map_.end();
}
inline std::unordered_map<std::string, Tensor>& get()
{
return tensor_map_;
}
inline std::unordered_map<std::string, Tensor>::const_iterator begin() const
{
return tensor_map_.begin();
}
inline std::unordered_map<std::string, Tensor>::const_iterator end() const
{
return tensor_map_.end();
}
std::string toString();
static TensorMap fromNpyFolder(const std::string& base_folder);
void saveNpy(const std::string& base_folder);
};
} // namespace fastertransformer
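// --- Hedged usage sketch (not part of the original header) -------------------
// Building a TensorMap from an initializer list and reading values back; the
// keys "step", "input_ids" and "beam_width" are illustrative, not a schema.
inline void tensor_map_example()
{
    using namespace fastertransformer;
    int step = 7;
    std::vector<int> ids{1, 2, 3, 4};
    TensorMap args{{"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
                   {"input_ids", Tensor{MEMORY_CPU, TYPE_INT32, {1, 4}, ids.data()}}};

    int  s      = args.getVal<int>("step");           // 7
    int  beam   = args.getVal<int>("beam_width", 1);  // key missing -> default 1
    int* id_ptr = args.getPtr<int>("input_ids");      // raw pointer into `ids`
    (void)s; (void)beam; (void)id_ptr;
}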
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
enum class ActivationType {
Gelu,
Relu,
Silu,
GeGLU,
ReGLU,
SiGLU,
Identity,
InvalidType
};
inline ActivationType getActivationType(std::string activation_type_str)
{
if (activation_type_str == "Gelu" || activation_type_str == "gelu") {
return ActivationType::Gelu;
}
else if (activation_type_str == "Relu" || activation_type_str == "relu") {
return ActivationType::Relu;
}
else if (activation_type_str == "Silu" || activation_type_str == "silu") {
return ActivationType::Silu;
}
else if (activation_type_str == "GeGLU" || activation_type_str == "geglu" || activation_type_str == "gated-gelu") {
return ActivationType::GeGLU;
}
else if (activation_type_str == "ReGLU" || activation_type_str == "reglu" || activation_type_str == "gated-relu") {
return ActivationType::ReGLU;
}
else if (activation_type_str == "SiGLU" || activation_type_str == "gated-silu") {
return ActivationType::SiGLU;
}
else {
FT_CHECK_WITH_INFO(false, "Activation Type: " + activation_type_str + " not supported !");
}
return ActivationType::InvalidType;
}
inline bool isGatedActivation(ActivationType activation_type)
{
return activation_type == ActivationType::GeGLU || activation_type == ActivationType::ReGLU
|| activation_type == ActivationType::SiGLU;
}
} // namespace fastertransformer
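// --- Illustrative use of the helpers above (not part of the original header) --
// LLaMA-style FFNs use a gated SiLU, so two projection weights are expected.
inline bool gated_silu_example()
{
    using namespace fastertransformer;
    ActivationType act = getActivationType("gated-silu");
    return isGatedActivation(act);  // true
}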
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Memory Allocator
**/
#pragma once
#include "cuda_utils.h"
#include <cuda_runtime.h>
#include <unordered_map>
#include <vector>
#ifdef GOOGLE_CUDA
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#endif
#ifdef TORCH_CUDA
#include "torch/extension.h"
#include <memory>
#endif
#include "src/fastertransformer/utils/logger.h"
#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
#define CUDA_MEMORY_POOL_DISABLED
#endif
namespace fastertransformer {
enum class AllocatorType {
CUDA,
TF,
TH
};
enum class ReallocType {
INCREASE,
REUSE,
DECREASE,
};
class IAllocator {
public:
virtual ~IAllocator(){};
virtual void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false) = 0;
virtual void free(void** ptr, bool is_host = false) const = 0;
virtual void setStream(cudaStream_t stream) = 0;
virtual cudaStream_t returnStream() = 0;
virtual void memSet(void* ptr, const int val, const size_t size) = 0;
template<typename T>
void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
size = ((size + 31) / 32) * 32; // round the size up to a multiple of 32 bytes
void* void_ptr = (void*)ptr;
void* ptr_address = getAddress(void_ptr);
if (isExist(ptr_address)) {
ReallocType realloc_type = isReMalloc(ptr_address, size);
if (realloc_type == ReallocType::INCREASE) {
FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
free((void**)(&void_ptr), is_host);
return malloc(size, is_set_zero, is_host);
}
#if !defined(CUDA_MEMORY_POOL_DISABLED)
else if (realloc_type == ReallocType::DECREASE) {
FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
free((void**)(&void_ptr), is_host);
return malloc(size, is_set_zero, is_host);
}
#endif
else {
FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
if (is_set_zero) {
memSet(void_ptr, 0, size);
}
return void_ptr;
}
}
else {
FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
return malloc(size, is_set_zero, is_host);
}
}
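// Illustrative usage (a sketch, not part of the original source): reMalloc tracks every
// buffer it hands out and only reallocates when the requested size grows (or, when CUDA
// memory pools are available, when it shrinks); an equally sized request reuses the
// original pointer.
//
//   float* buf = nullptr;
//   buf = (float*)allocator->reMalloc(buf, 1024);  // unknown pointer: plain malloc
//   buf = (float*)allocator->reMalloc(buf, 1024);  // same size: pointer is reused
//   buf = (float*)allocator->reMalloc(buf, 4096);  // larger: freed and reallocated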
protected:
virtual bool isExist(void* address) const = 0;
virtual ReallocType isReMalloc(void* address, size_t size) const = 0;
void* getAddress(void* ptr) const
{
return ptr;
}
};
template<AllocatorType AllocType_>
class Allocator;
template<>
class Allocator<AllocatorType::CUDA>: public IAllocator {
private:
const int device_id_;
cudaStream_t stream_ = 0; // initialize as default stream
std::unordered_map<void*, size_t>* pointer_mapping_;
bool isExist(void* address) const
{
return pointer_mapping_->count(address) > 0;
}
ReallocType isReMalloc(void* address, size_t size) const
{
FT_CHECK(isExist(address));
if (pointer_mapping_->at(address) < size) {
return ReallocType::INCREASE;
}
else if (pointer_mapping_->at(address) == size) {
return ReallocType::REUSE;
}
else {
return ReallocType::DECREASE;
}
}
public:
Allocator(int device_id): device_id_(device_id)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
pointer_mapping_ = new std::unordered_map<void*, size_t>();
#if defined(CUDA_MEMORY_POOL_DISABLED)
FT_LOG_WARNING(
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
#else
int device_count = 1;
check_cuda_error(cudaGetDeviceCount(&device_count));
cudaMemPool_t mempool;
check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, device_id));
cudaMemAccessDesc desc = {};
int peer_access_available = 0;
for (int i = 0; i < device_count; i++) {
if (i == device_id) {
continue;
}
check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
if (!peer_access_available) {
FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+ " is not available.");
continue;
}
desc.location.type = cudaMemLocationTypeDevice;
desc.location.id = i;
desc.flags = cudaMemAccessFlagsProtReadWrite;
check_cuda_error(cudaMemPoolSetAccess(mempool, &desc, 1));
}
// set memory pool threshold to avoid shrinking the pool
uint64_t setVal = UINT64_MAX;
check_cuda_error(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &setVal));
#endif
}
virtual ~Allocator()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
while (!pointer_mapping_->empty()) {
free((void**)(&pointer_mapping_->begin()->first));
}
delete pointer_mapping_;
}
void setStream(cudaStream_t stream)
{
stream_ = stream;
}
cudaStream_t returnStream()
{
return stream_;
};
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (size == 0) {
return nullptr;
}
void* ptr = nullptr;
int o_device = 0;
check_cuda_error(getSetDevice(device_id_, &o_device));
if (is_host) {
check_cuda_error(cudaMallocHost(&ptr, (size_t)(ceil(size / 32.)) * 32));
}
else {
#if defined(CUDA_MEMORY_POOL_DISABLED)
check_cuda_error(cudaMalloc(&ptr, (size_t)(ceil(size / 32.)) * 32));
#else
check_cuda_error(cudaMallocAsync(&ptr, (size_t)(ceil(size / 32.)) * 32, stream_));
#endif
}
if (is_set_zero) {
check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
}
check_cuda_error(getSetDevice(o_device));
FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
pointer_mapping_->insert({getAddress(ptr), size});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
if (*ptr != nullptr) {
int o_device = 0;
if (pointer_mapping_->count(address)) {
FT_LOG_DEBUG("Free buffer %p", address);
check_cuda_error(getSetDevice(device_id_, &o_device));
if (is_host) {
check_cuda_error(cudaFreeHost(*ptr));
}
else {
#if defined(CUDA_MEMORY_POOL_DISABLED)
check_cuda_error(cudaFree(*ptr));
#else
check_cuda_error(cudaFreeAsync(*ptr, stream_));
cudaStreamSynchronize(stream_);
#endif
}
check_cuda_error(getSetDevice(o_device));
pointer_mapping_->erase(address);
}
else {
FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
}
}
*ptr = nullptr;
return;
}
void memSet(void* ptr, const int val, const size_t size)
{
check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_));
}
};
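// Illustrative usage (a sketch, not part of the original source): the CUDA allocator is
// bound to one device; with CUDA >= 11.2 allocations are stream-ordered via the device's
// default memory pool, otherwise plain cudaMalloc/cudaFree are used.
//
//   Allocator<AllocatorType::CUDA> allocator(/*device_id=*/0);
//   allocator.setStream(stream);
//   void* d_buf = allocator.malloc(1 << 20);  // zero-initialized by default
//   allocator.free(&d_buf);                   // releases the buffer and resets d_buf to nullptr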
#ifdef GOOGLE_CUDA
using namespace tensorflow;
template<>
class Allocator<AllocatorType::TF>: public IAllocator {
OpKernelContext* context_;
std::unordered_map<void*, tensorflow::Tensor>* pointer_mapping_;
cudaStream_t stream_;
bool isExist(void* address) const
{
return pointer_mapping_->count(address) > 0;
}
ReallocType isReMalloc(void* address, size_t size) const
{
FT_CHECK(isExist(address));
size_t current_buffer_size = 1;
for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) {
current_buffer_size *= pointer_mapping_->at(address).dim_size(i);
}
FT_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
if (current_buffer_size < size) {
return ReallocType::INCREASE;
}
else if (current_buffer_size == size) {
return ReallocType::REUSE;
}
else {
return ReallocType::DECREASE;
}
}
public:
Allocator(OpKernelContext* context, cudaStream_t stream): context_(context), stream_(stream)
{
pointer_mapping_ = new std::unordered_map<void*, tensorflow::Tensor>();
}
void setStream(cudaStream_t stream)
{
stream_ = stream;
}
cudaStream_t returnStream()
{
return stream_;
};
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
tensorflow::Tensor buf;
long long int buf_size = ((long long int)ceil(size / 32.) * 32);
tensorflow::Status status;
if (is_host) {
tensorflow::AllocatorAttributes pinned_allocator;
pinned_allocator.set_on_host(true);
pinned_allocator.set_gpu_compatible(true);
status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf, pinned_allocator);
}
else {
status = context_->allocate_temp(DT_UINT8, TensorShape{buf_size}, &buf);
}
if (status != tensorflow::Status::OK()) {
throw std::runtime_error("TF error: context->allocate_temp failed");
}
auto flat = buf.flat<uint8>();
void* ptr = (void*)flat.data();
if (is_set_zero) {
cudaMemsetAsync(ptr, 0, buf_size, stream_);
}
pointer_mapping_->insert({getAddress(ptr), buf});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
pointer_mapping_->erase(address);
*ptr = nullptr;
return;
}
virtual ~Allocator()
{
while (!pointer_mapping_->empty()) {
void* ptr = pointer_mapping_->begin()->second.flat<uint8>().data();
free((void**)(&ptr));
}
pointer_mapping_->clear();
delete pointer_mapping_;
}
void memSet(void* ptr, const int val, const size_t size)
{
check_cuda_error(cudaMemsetAsync(ptr, val, size, stream_));
}
};
#endif
#ifdef TORCH_CUDA
template<>
class Allocator<AllocatorType::TH>: public IAllocator {
std::unordered_map<void*, torch::Tensor>* pointer_mapping_;
bool isExist(void* address) const
{
return pointer_mapping_->count(address) > 0;
}
ReallocType isReMalloc(void* address, size_t size) const
{
FT_CHECK(isExist(address));
size_t current_buffer_size = 1;
for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) {
current_buffer_size *= pointer_mapping_->at(address).size(i);
}
FT_LOG_DEBUG(
"current_buffer_size: %ld, original buffer: %p, new buffer size: %ld", current_buffer_size, address, size);
if (current_buffer_size < size) {
return ReallocType::INCREASE;
}
else if (current_buffer_size == size) {
return ReallocType::REUSE;
}
else {
return ReallocType::DECREASE;
}
}
public:
Allocator()
{
pointer_mapping_ = new std::unordered_map<void*, torch::Tensor>();
}
void setStream(cudaStream_t stream)
{
// nothing to do here;
}
cudaStream_t returnStream()
{
// nothing to do here;
return 0;
};
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
int64_t buf_size = static_cast<int64_t>(ceil(size / 32.)) * 32;
torch::Tensor buf;
if (is_host) {
buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCPU).pinned_memory(true));
}
else {
buf = torch::empty({buf_size}, torch::dtype(torch::kUInt8).device(torch::kCUDA));
}
void* ptr = buf.data_ptr();
if (is_set_zero) {
cudaMemset(ptr, 0, buf_size);
}
FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
pointer_mapping_->insert({getAddress(ptr), buf});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
pointer_mapping_->erase(address);
*ptr = nullptr;
return;
}
virtual ~Allocator()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
while (!pointer_mapping_->empty()) {
void* ptr = pointer_mapping_->begin()->second.data_ptr();
free((void**)(&ptr));
}
pointer_mapping_->clear();
delete pointer_mapping_;
}
void memSet(void* ptr, const int val, const size_t size)
{
check_cuda_error(cudaMemset(ptr, val, size));
}
};
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasLt.h"
#include "cuda_utils.h"
#include "math.h"
#include "stdio.h"
#include "stdlib.h"
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cudnn.h>
namespace fastertransformer {
template<typename T>
void conv2d(T* output,
const T* input,
const T* kernel,
const int batch,
const int h,
const int w,
const int in_channels,
const int out_channels,
const int kernel_size,
const int stride,
cudnnHandle_t& cudnn_handle)
{
cudnnDataType_t dataType;
cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
float alpha = 1.0f;
float beta = 0.0f;
if (std::is_same<T, half>::value) {
dataType = CUDNN_DATA_HALF;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
dataType = CUDNN_DATA_BFLOAT16;
}
#endif
else {
dataType = CUDNN_DATA_FLOAT;
}
cudnnTensorDescriptor_t input_descriptor_;
cudnnTensorDescriptor_t output_descriptor_;
cudnnFilterDescriptor_t kernel_descriptor_;
cudnnConvolutionDescriptor_t convolution_descriptor_;
cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_DIRECT;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
// cudnnConvolutionFwdAlgo_t convolution_algorithm_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor_));
checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor_,
/*format=*/CUDNN_TENSOR_NCHW,
/*dataType=*/dataType,
/*batch_size=*/batch,
/*channels=*/in_channels,
/*image_height=*/h,
/*image_width=*/w));
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor_));
checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor_,
/*format=*/CUDNN_TENSOR_NHWC,
/*dataType=*/dataType,
/*batch_size=*/batch,
/*channels=*/out_channels,
/*image_height=*/h / stride,
/*image_width=*/w / stride));
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor_));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor_,
/*dataType=*/dataType,
/*format=*/CUDNN_TENSOR_NCHW,
/*out_channels=*/out_channels,
/*in_channels=*/in_channels,
/*kernel_height=*/kernel_size,
/*kernel_width=*/kernel_size));
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor_));
checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor_,
/*pad_height=*/0,
/*pad_width=*/0,
/*vertical_stride=*/stride,
/*horizontal_stride=*/stride,
/*dilation_height=*/1,
/*dilation_width=*/1,
/*mode=*//*CUDNN_CONVOLUTION,*/ CUDNN_CROSS_CORRELATION,
/*computeType=*/computeType));
/*checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnn_handle,
input_descriptor_,
kernel_descriptor_,
convolution_descriptor_,
output_descriptor_,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0,//memoryLimitInBytes
&convolution_algorithm_));*/
checkCUDNN(cudnnConvolutionForward(cudnn_handle,
&alpha,
input_descriptor_,
input,
kernel_descriptor_,
kernel,
convolution_descriptor_,
convolution_algorithm_,
nullptr,
0,
&beta,
output_descriptor_,
output));
checkCUDNN(cudnnDestroyTensorDescriptor(input_descriptor_));
checkCUDNN(cudnnDestroyTensorDescriptor(output_descriptor_));
checkCUDNN(cudnnDestroyFilterDescriptor(kernel_descriptor_));
checkCUDNN(cudnnDestroyConvolutionDescriptor(convolution_descriptor_));
}
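// Illustrative usage (a sketch, not part of the original source; the cuDNN handle is
// assumed to be created elsewhere): the input is described as NCHW, the output as NHWC,
// there is no padding, and the same stride is used in both dimensions, so the output
// spatial size is (h / stride, w / stride).
//
//   conv2d<half>(d_out, d_in, d_kernel,
//                /*batch=*/8, /*h=*/224, /*w=*/224,
//                /*in_channels=*/3, /*out_channels=*/64,
//                /*kernel_size=*/16, /*stride=*/16, cudnn_handle);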
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "stdio.h"
#include "stdlib.h"
// keep the host-side conversion consistent with FasterTransformer's rounding
int8_t float_to_int8_rn_host(float x)
{
int8_t res;
int32_t tmp;
if (x >= 0) {
tmp = int(x + 0.5);
tmp = tmp > 127 ? 127 : tmp;
res = int8_t(tmp);
}
else {
tmp = int(x - 0.5);
tmp = tmp < -127 ? -127 : tmp;
res = int8_t(tmp);
}
return res;
}
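// Illustrative behaviour (derived from the function above): values are rounded to the
// nearest integer and saturated to the symmetric range [-127, 127].
//
//   float_to_int8_rn_host(3.4f);     // ->   3
//   float_to_int8_rn_host(3.6f);     // ->   4
//   float_to_int8_rn_host(200.0f);   // ->  127
//   float_to_int8_rn_host(-200.0f);  // -> -127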
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasAlgoMap.h"
namespace fastertransformer {
cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename):
config_filename_(filename), sp_config_filename_(sp_config_filename)
{
loadGemmConfig();
loadSpGemmConfig();
}
cublasAlgoMap::cublasAlgoMap(const cublasAlgoMap& algo_map):
config_filename_(algo_map.config_filename_),
sp_config_filename_(algo_map.sp_config_filename_),
algo_map_(algo_map.algo_map_),
sp_algo_map_(algo_map.sp_algo_map_)
{
}
cublasAlgoMap::~cublasAlgoMap()
{
algo_map_.clear();
}
void cublasAlgoMap::loadGemmConfig()
{
FILE* fd;
fd = fopen(config_filename_.c_str(), "r");
if (fd == NULL) {
std::cout << "[WARNING] " << config_filename_ << " is not found; using default GEMM algo" << std::endl;
return;
}
int batchCount2, m2, n2, k2, algoId, customOption, tile, splitK_val;
int batch_size, seq_len, head_num, size_per_head, dataType;
int swizzle, reductionScheme, workspaceSize, stages;
int inner_shapeId, cluster_shapeId, mma_shapeId, cga_shapeId, sche_mode;
float exec_time;
char tmp[1024];
if (!fgets(tmp, 1024, fd)) {
printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__);
exit(-1);
}
while (fscanf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"%d %d "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"%d %d %d "
#endif
"%f\n",
&batch_size,
&seq_len,
&head_num,
&size_per_head,
&dataType,
&batchCount2,
&n2,
&m2,
&k2,
&algoId,
&customOption,
&tile,
&splitK_val,
&swizzle,
&reductionScheme,
&workspaceSize,
&stages,
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
&inner_shapeId,
&cluster_shapeId,
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
&mma_shapeId,
&cga_shapeId,
&sche_mode,
#endif
&exec_time)
!= EOF) {
if (dataType != FLOAT_DATATYPE && dataType != HALF_DATATYPE && dataType != BFLOAT16_DATATYPE
&& dataType != INT8_DATATYPE && dataType != FP8_DATATYPE) {
printf("[WARNING][readAlgoFromConfig] wrong dataType %d!\n", dataType);
continue;
}
cublasAlgoConfig_t markStr{batchCount2, m2, n2, k2, static_cast<CublasDataType>(dataType)};
// workspaceSize should be zero
if (algo_map_.find(markStr) == algo_map_.end()) {
algo_map_[markStr].algoId = algoId;
algo_map_[markStr].customOption = customOption;
algo_map_[markStr].tile = tile;
algo_map_[markStr].splitK_val = splitK_val;
algo_map_[markStr].swizzle = swizzle;
algo_map_[markStr].reductionScheme = reductionScheme;
algo_map_[markStr].workspaceSize = workspaceSize;
algo_map_[markStr].stages = stages;
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
algo_map_[markStr].inner_shapeId = (uint16_t)inner_shapeId;
algo_map_[markStr].cluster_shapeId = (uint16_t)cluster_shapeId;
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
algo_map_[markStr].mma_shapeId = (uint16_t)mma_shapeId;
algo_map_[markStr].cga_shapeId = (uint16_t)cga_shapeId;
algo_map_[markStr].sche_mode = (uint16_t)sche_mode;
#endif
algo_map_[markStr].exec_time = exec_time;
}
}
fclose(fd);
}
bool cublasAlgoMap::isExist(
const int batch_count, const int m, const int n, const int k, const CublasDataType data_type)
{
cublasAlgoConfig_t mark{batch_count, n, m, k, data_type};
return algo_map_.find(mark) != algo_map_.end();
}
cublasLtMatmulAlgo_info
cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type)
{
cublasAlgoConfig_t mark{batch_count, n, m, k, data_type};
if (algo_map_.find(mark) != algo_map_.end()) {
return algo_map_[mark];
}
else {
cublasLtMatmulAlgo_info tmp_algo;
tmp_algo.algoId =
static_cast<int>(data_type == FLOAT_DATATYPE ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP);
tmp_algo.customOption = -1;
tmp_algo.tile = -1;
tmp_algo.splitK_val = -1;
tmp_algo.swizzle = -1;
tmp_algo.reductionScheme = -1;
tmp_algo.workspaceSize = -1;
tmp_algo.stages = -1;
tmp_algo.exec_time = -1.0f;
return tmp_algo;
}
}
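// Note (illustrative, derived from the code above): when no profiled entry exists for a
// (batch_count, m, n, k, data_type) tuple, getAlgo() returns a record whose fields are
// all -1 except algoId, which falls back to the cuBLAS default algorithm. Callers such as
// cublasFP8MMWrapper treat stages == -1 as "no tuned algo found".
//
//   cublasLtMatmulAlgo_info info = algo_map.getAlgo(1, m, n, k, HALF_DATATYPE);
//   bool has_tuned_algo = algo_map.isExist(1, m, n, k, HALF_DATATYPE) && info.stages != -1;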
void cublasAlgoMap::loadSpGemmConfig()
{
if (sp_config_filename_.empty()) {
return;
}
FILE* fd = fopen(sp_config_filename_.c_str(), "r");
if (fd == NULL) {
printf("[WARNING] %s is not found; using SPGEMM algo id 0\n", sp_config_filename_.c_str());
return;
}
sp_algo_map_.clear();
int batch_size, seq_len, head_num, size_per_head, data_type;
int batchCount, m, n, k, algoId;
float exec_time;
char tmp[1024];
if (!fgets(tmp, 1024, fd)) {
printf("[ERROR] fgets fail at %s:%d \n", __FILE__, __LINE__);
exit(-1);
}
while (fscanf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %f\n",
&batch_size,
&seq_len,
&head_num,
&size_per_head,
&data_type,
&batchCount,
&m,
&n,
&k,
&algoId,
&exec_time)
!= EOF) {
char mark[256];
sprintf(mark, "%d_%d_%d_%d", batchCount, m, n, k);
std::string markStr(mark);
sp_algo_map_[markStr] = algoId;
}
fclose(fd);
}
int cublasAlgoMap::getSpAlgo(const int batch_count, const int m, const int n, const int k)
{
char mark[256];
sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k);
if (sp_algo_map_.find(mark) != sp_algo_map_.end()) {
return sp_algo_map_[mark];
}
else {
// no profiled entry (e.g. the remove-padding case); fall back to algo 0 for simplicity
return 0;
}
}
bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n, const int k)
{
// cusparseLt cannot be used unless m, n and k are all multiples of 8.
if (m % 8 != 0 || n % 8 != 0 || k % 8 != 0) {
return false;
}
char mark[256];
sprintf(mark, "%d_%d_%d_%d", batch_count, m, n, k);
if (sp_algo_map_.find(mark) != sp_algo_map_.end()) {
return sp_algo_map_[mark] != -1;
}
else {
// no profiled GEMM case; default to using sparse GEMM
return true;
}
}
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#pragma once
namespace fastertransformer {
#define GEMM_NUM 6
#define GEMM_CONFIG "gemm_config.in"
#define IGEMM_CONFIG "igemm_config.in"
#define SPGEMM_CONFIG "spgemm_config.in"
#define SPIGEMM_CONFIG "spigemm_config.in"
typedef struct {
int algoId, customOption, tile, splitK_val;
int swizzle, reductionScheme, workspaceSize;
// only used in cublasLt >= 11.0
int stages;
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
uint16_t inner_shapeId, cluster_shapeId;
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
uint16_t mma_shapeId, cga_shapeId, sche_mode;
#endif
float exec_time;
} cublasLtMatmulAlgo_info;
/* Structure to store information about different run trials */
typedef struct {
cublasLtMatmulAlgo_t algo;
cublasStatus_t status;
float time;
size_t workspaceSize; // actual memory workspace needed
cublasMath_t mathMode;
cublasLtReductionScheme_t reductionScheme;
int customOption;
float wavesCount;
} customMatmulPerf_t;
struct cublasAlgoConfig_t {
int batch_count;
int m;
int n;
int k;
CublasDataType data_type;
bool operator==(cublasAlgoConfig_t const& config) const
{
return (batch_count == config.batch_count) && (m == config.m) && (n == config.n) && (k == config.k)
&& (data_type == config.data_type);
}
};
class cublasAlgoConfig_hasher {
public:
std::size_t operator()(cublasAlgoConfig_t const& config) const
{
return config.batch_count * 98317ull ^ config.m * 49157ull ^ config.n * 24593ull ^ config.k * 196613ull
^ static_cast<int>(config.data_type) * 6151ull;
}
};
class cublasAlgoMap {
private:
std::unordered_map<cublasAlgoConfig_t, cublasLtMatmulAlgo_info, cublasAlgoConfig_hasher> algo_map_;
std::string config_filename_;
std::string sp_config_filename_;
std::map<std::string, int> sp_algo_map_;
public:
cublasAlgoMap(){};
explicit cublasAlgoMap(const std::string filename, const std::string sp_config_filename = "");
cublasAlgoMap(const cublasAlgoMap& map);
~cublasAlgoMap();
void loadGemmConfig();
void loadSpGemmConfig();
int getSpAlgo(const int batch_count, const int m, const int n, const int k);
bool isUseSparse(const int batch_count, const int m, const int n, const int k);
bool isExist(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
cublasLtMatmulAlgo_info
getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
};
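// Illustrative usage (a sketch, not part of the original source): the map is usually
// constructed from a profiled GEMM config file; when the file is absent it logs a
// warning and falls back to cuBLAS default algorithms.
//
//   cublasAlgoMap algo_map(GEMM_CONFIG);  // "gemm_config.in"
//   if (algo_map.isExist(1, m, n, k, HALF_DATATYPE)) {
//       cublasLtMatmulAlgo_info info = algo_map.getAlgo(1, m, n, k, HALF_DATATYPE);
//   }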
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasFP8MMWrapper.h"
#include "cuda_utils.h"
namespace fastertransformer {
#define CUBLAS_WORKSPACE_1MB 1048576
cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
cublasVersionCheck();
if (allocator_ != nullptr) {
cublas_workspace_qgemm_ = allocator_->reMalloc(cublas_workspace_qgemm_, CUBLAS_WORKSPACE_1MB, true);
}
}
cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
cublasVersionCheck();
if (allocator_ != nullptr) {
cublas_workspace_qgemm_ = allocator_->reMalloc(cublas_workspace_qgemm_, CUBLAS_WORKSPACE_1MB, true);
}
}
cublasFP8MMWrapper::~cublasFP8MMWrapper()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_qgemm_));
}
}
cublasFP8MMWrapper::cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper):
cublasMMWrapper(wrapper.cublas_handle_,
wrapper.cublaslt_handle_,
wrapper.stream_,
wrapper.cublas_algo_map_,
wrapper.mu_,
wrapper.allocator_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cublasVersionCheck();
}
void cublasFP8MMWrapper::cublasVersionCheck()
{
cublasGetProperty(MAJOR_VERSION, &version_major_);
cublasGetProperty(MINOR_VERSION, &version_minor_);
cublasGetProperty(PATCH_LEVEL, &version_patch_);
size_t cublasVersion = (version_major_ * 10000 + version_minor_ * 100 + version_patch_);
#if defined(FP8_MHA) || !defined(FP8_GEMM_OUTPUT_QUANT_DISABLE)
FT_CHECK_WITH_INFO((version_major_ > 11) || (version_major_ == 11 && version_minor_ == 11 && version_patch_ >= 4),
"FP8 MHA needs d-scale, which is only supported after cublas 11.11.4 !");
#endif
}
void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale)
{
Gemm(res,
batchCount,
m,
n,
k,
strideA,
strideB,
strideD,
alpha,
beta,
input,
kernel,
input_scale,
kernel_scale,
(cudaStream_t)0);
}
void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
cudaStream_t stream,
bool fastAccum)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto dType = CUDA_R_16BF;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// const auto epilogueAuxType = CUDA_R_16BF;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0 && fastAccum) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
bool findAlgo = cublas_algo_map_->isExist(batchCount, n, m, k, FP8_DATATYPE);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batchCount, n, m, k, FP8_DATATYPE);
if (info.stages == -1) {
findAlgo = false;
}
cublasLtMatmulAlgo_t algo;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = false;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, aType, bType, dType, dType, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &(info.cluster_shapeId), sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
nullptr, // Cptr, not used here
Ddesc,
res,
Ddesc,
(findAlgo ? (&algo) : NULL),
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale)
{
Gemm(res,
batchCount,
m,
n,
k,
strideA,
strideB,
strideD,
alpha,
beta,
input,
kernel,
input_scale,
kernel_scale,
output_scale,
0);
}
void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale,
cudaStream_t stream,
bool fastAccum)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const void* devDscalePtr = (const void*)output_scale;
FT_CHECK(cublas_workspace_ != nullptr);
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto cType = CUDA_R_16BF;
const auto dType = CUDA_R_8F_E4M3;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Cdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0 && fastAccum) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
// check_cuda_error(cublasLtMatmulDescSetAttribute(
// matmulDesc, CUBLASLT_MATMUL_DESC_C_SCALE_POINTER, &devDscalePtr, sizeof(devDscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &devDscalePtr, sizeof(devDscalePtr)));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, cType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
bool findAlgo = cublas_algo_map_->isExist(batchCount, n, m, k, FP8_DATATYPE);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batchCount, n, m, k, FP8_DATATYPE);
if (info.stages == -1) {
findAlgo = false;
}
cublasLtMatmulAlgo_t algo;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = false;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, aType, bType, cType, dType, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &(info.cluster_shapeId), sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
nullptr, // Cptr, not used here
Cdesc,
res,
Ddesc,
(findAlgo ? (&algo) : NULL),
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Cdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Cdesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
template<bool RELU, bool GELU>
void cublasFP8MMWrapper::Conv1x1Gemm(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
size_t workspace_size = 0;
// get workspace size
qgmmaLauncher.getWorkSpaceSize<RELU, GELU>(n, workspace_size);
if (workspace_size > CUBLAS_WORKSPACE_1MB) {
throw std::runtime_error("Need to rellocate workspace for qgemm. It is not supported");
// cublas_workspace_qgemm_ = allocator_->reMalloc(cublas_workspace_qgemm_, workspace_size);
}
qgmmaLauncher.invokeQgmma1x1<RELU, GELU>(
res, m, n, k, input, kernel, bias, input_scale, kernel_scale, output_scale, cublas_workspace_qgemm_, stream);
sync_check_cuda_error();
mu_->unlock();
}
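// Note (illustrative, derived from the code above): Conv1x1Gemm dispatches to the fused
// FP8 qgmma 1x1 kernel; the RELU/GELU template flags select the fused epilogue, and the
// kernel workspace must fit into the 1 MB buffer preallocated in the constructor,
// otherwise a std::runtime_error is thrown. A hypothetical call with a GELU epilogue:
//
//   fp8_wrapper->Conv1x1Gemm<false, true>(d_out_fp8, m, n, k,
//                                         d_input_fp8, d_kernel_fp8, d_bias_bf16,
//                                         input_scale, kernel_scale, output_scale, stream);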
template void cublasFP8MMWrapper::Conv1x1Gemm<true, false>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Conv1x1Gemm<true, true>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Conv1x1Gemm<false, false>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Conv1x1Gemm<false, true>(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template<bool RELU, bool GELU>
void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const void* devDscalePtr = (const void*)output_scale;
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto dType = CUDA_R_16BF;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// const auto epilogueAuxType = CUDA_R_16BF;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
if (RELU == true) {
epi = CUBLASLT_EPILOGUE_RELU_BIAS;
}
else if (GELU == true) {
epi = CUBLASLT_EPILOGUE_GELU_BIAS;
}
// cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
const int requestedAlgoCount = 1;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
int returnedAlgoCount = -1;
check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &wsSizeBytes, sizeof(wsSizeBytes)));
check_cuda_error(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle_,
matmulDesc,
Adesc,
Bdesc,
Ddesc,
Ddesc,
preference,
requestedAlgoCount,
&heuristicResult,
&returnedAlgoCount));
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
res,
Ddesc,
res,
Ddesc,
&heuristicResult.algo,
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
template<bool RELU, bool GELU>
void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
const void* devBscalePtr = (const void*)input_scale;
const void* devDscalePtr = (const void*)output_scale;
const size_t wsSizeBytes = CUBLAS_WORKSPACE_SIZE;
const auto aType = CUDA_R_8F_E4M3;
const auto bType = CUDA_R_8F_E4M3;
const auto cType = CUDA_R_16BF;
const auto dType = CUDA_R_8F_E4M3;
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
// const auto epilogueAuxType = CUDA_R_16BF;
const cublasOperation_t tA = CUBLAS_OP_T;
const cublasOperation_t tB = CUBLAS_OP_N;
//------- init, desc & tensors
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t Adesc;
cublasLtMatrixLayout_t Bdesc;
cublasLtMatrixLayout_t Cdesc;
cublasLtMatrixLayout_t Ddesc;
{
check_cuda_error(cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA)));
check_cuda_error(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &tB, sizeof(tB)));
if (version_major_ >= 11 && version_minor_ >= 11 && version_patch_ > 0) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode))));
}
// TODO: Check whether we need to set these attributes
// TODO: comment them for compiler first
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &devAscalePtr, sizeof(devAscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &devBscalePtr, sizeof(devBscalePtr)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
matmulDesc, CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, &devDscalePtr, sizeof(devDscalePtr)));
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_GELU_BIAS;
// cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
}
{
const int64_t lda = k;
const int64_t ldb = k;
const int64_t ldd = n;
// create matrix descriptors; the defaults are fine here, so no extra attributes are set
check_cuda_error(
cublasLtMatrixLayoutCreate(&Adesc, aType, tA == CUBLAS_OP_N ? n : k, tA == CUBLAS_OP_N ? k : n, lda));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
}
check_cuda_error(
cublasLtMatrixLayoutCreate(&Bdesc, bType, tB == CUBLAS_OP_N ? k : m, tB == CUBLAS_OP_N ? m : k, ldb));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, cType, n, m, ldd));
// TODO(Hongbinl): not sure whether this implementation makes sense
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
check_cuda_error(cublasLtMatrixLayoutCreate(&Ddesc, dType, n, m, ldd));
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
}
const int requestedAlgoCount = 1;
cublasLtMatmulHeuristicResult_t heuristicResult;
cublasLtMatmulPreference_t preference;
int returnedAlgoCount = -1;
check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &wsSizeBytes, sizeof(wsSizeBytes)));
#if (CUBLAS_VERSION) <= 12000
uint32_t pointer_mode_mask = 0;
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask)));
#endif
check_cuda_error(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle_,
matmulDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
preference,
requestedAlgoCount,
&heuristicResult,
&returnedAlgoCount));
{
cublasStatus_t status = cublasLtMatmul(cublaslt_handle_,
matmulDesc,
alpha,
kernel,
Adesc,
input,
Bdesc,
beta,
res,
Cdesc,
res,
Ddesc,
&heuristicResult.algo,
cublas_workspace_,
wsSizeBytes,
stream);
check_cuda_error(status);
}
if (Ddesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Ddesc));
}
if (Bdesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Bdesc));
}
if (Adesc) {
check_cuda_error(cublasLtMatrixLayoutDestroy(Adesc));
}
if (matmulDesc) {
check_cuda_error(cublasLtMatmulDescDestroy(matmulDesc));
}
mu_->unlock();
}
template void cublasFP8MMWrapper::Gemm_Bias_Act<false, true>(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<false, true>(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<true, false>(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<true, false>(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template void cublasFP8MMWrapper::Gemm_Bias_Act<false, false>(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t strideA,
int64_t strideB,
int64_t strideD,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "3rdparty/fp8_qgmma_1x1/fp8_qgmma_1x1_utils.h"
#include "cuda_utils.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <mutex>
#include <string>
#pragma once
namespace fastertransformer {
class cublasFP8MMWrapper: public cublasMMWrapper {
public:
cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
cublasFP8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
virtual ~cublasFP8MMWrapper();
cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper);
virtual void cublasVersionCheck() override;
void Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale);
void Gemm(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
cudaStream_t stream,
bool fastAccum = true);
void Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale);
void Gemm(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const float* output_scale,
cudaStream_t stream,
bool fastAccum = true);
template<bool RELU, bool GELU>
void Conv1x1Gemm(__nv_fp8_e4m3* res,
int m,
int n,
int k,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const __nv_bfloat16* bias,
const float input_scale,
const float kernel_scale,
const float output_scale,
cudaStream_t stream);
template<bool RELU, bool GELU>
void Gemm_Bias_Act(__nv_bfloat16* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
template<bool RELU, bool GELU>
void Gemm_Bias_Act(__nv_fp8_e4m3* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float* alpha,
const float* beta,
const __nv_fp8_e4m3* input,
const __nv_fp8_e4m3* kernel,
const float* input_scale,
const float* kernel_scale,
const __nv_bfloat16* bias,
const float* output_scale,
cudaStream_t stream);
private:
int version_major_, version_minor_, version_patch_;
fastertransformer::qgmma1x1Launcher qgmmaLauncher;
void* cublas_workspace_qgemm_ = nullptr;
};
} // namespace fastertransformer
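/*
 * Illustrative usage sketch for the FP8 GEMM + bias + GELU path declared above.
 * The wrapper instance (fp8_wrapper), device buffers, scale pointers and the stream are
 * all assumed to be created elsewhere; the buffer shapes and whether alpha/beta live on
 * host or device follow the wrapper's convention and are assumptions here.
 *
 *   float alpha = 1.0f, beta = 0.0f;
 *   // non-batched GEMM: batchCount == 1, strides unused (0)
 *   fp8_wrapper.Gemm_Bias_Act<false, true>(d_out_bf16,        // [m, n] output
 *                                          1, m, n, k,
 *                                          0, 0, 0,
 *                                          &alpha, &beta,
 *                                          d_act_fp8,          // [m, k] activation
 *                                          d_weight_fp8,       // [n, k] weight
 *                                          d_input_scale,
 *                                          d_kernel_scale,
 *                                          d_bias_bf16,        // per-output-feature bias (assumed length n)
 *                                          d_output_scale,
 *                                          stream);
 */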
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasINT8MMWrapper.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4):
cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, nullptr),
use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4)
{
}
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4):
cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, nullptr),
use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4)
{
}
#ifdef SPARSITY_ENABLED
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4):
cublasMMWrapper(nullptr, cublaslt_handle, cusparselt_handle, stream, cublas_algo_map, mu, nullptr),
use_ORDER_COL32_2R_4R4_(use_ORDER_COL32_2R_4R4)
{
}
#endif
cublasINT8MMWrapper::~cublasINT8MMWrapper()
{
mu_ = nullptr;
}
cublasINT8MMWrapper::cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper):
#ifdef SPARSITY_ENABLED
cublasMMWrapper(nullptr,
wrapper.cublaslt_handle_,
wrapper.cusparselt_handle_,
wrapper.stream_,
wrapper.cublas_algo_map_,
wrapper.mu_,
wrapper.allocator_),
#else
cublasMMWrapper(
nullptr, wrapper.cublaslt_handle_, wrapper.stream_, wrapper.cublas_algo_map_, wrapper.mu_, wrapper.allocator_),
#endif
use_ORDER_COL32_2R_4R4_(wrapper.use_ORDER_COL32_2R_4R4_)
{
}
// for int8 cublasLtMM with algo
// ATransform should be m*k, CUBLASLT_ORDER_COL32
// kernel should be n*k, CUBLASLT_ORDER_COL4_4R2_8C or CUBLASLT_ORDER_COL32_2R_4R4
// res is m*n, CUBLASLT_ORDER_COL32
void cublasINT8MMWrapper::Gemm(int* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const int8_t* ATransform,
const int8_t* kernel)
{
mu_->lock();
cublasOperation_t opTranspose = CUBLAS_OP_T;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t AtransformDesc = NULL;
cublasLtMatrixLayout_t BtransformDesc = NULL;
cublasLtMatrixLayout_t CtransformDesc = NULL;
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4_) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4_) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
// create matmulDesc
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmulDesc, computeType, CUDA_R_32I);
#else
cublasLtMatmulDescCreate(&matmulDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform);
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_32I, m, n, ldcTransform);
cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (batchCount > 1) {
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
}
int alphaI = 1;
int betaI = 0;
// get algo
cublasLtMatmulAlgo_t algo;
int findAlgo = 0;
if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
// printf("find algo %s\n", markStr.c_str());
findAlgo = 1;
cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
cublasLtMatmulAlgoInit(cublaslt_handle_,
computeType,
CUDA_R_32I,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_32I,
CUDA_R_32I,
tmp_info.algoId,
&algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
}
else {
findAlgo = 1;
int algoId;
if (use_ORDER_COL32_2R_4R4_) {
algoId = 7;
}
else {
algoId = 6;
}
int swizzle = 0;
int customOption = 0;
int tile = 20;
int splitK_val = 0;
int reductionScheme = 0;
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, CUDA_R_32I, CUDA_R_8I, CUDA_R_8I, CUDA_R_32I, CUDA_R_32I, algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
int stages;
if (use_ORDER_COL32_2R_4R4_) {
stages = 15;
}
else {
stages = 13;
}
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
}
cublasLtMatmul(cublaslt_handle_,
matmulDesc,
&alphaI,
ATransform,
AtransformDesc,
kernel,
BtransformDesc,
&betaI,
res,
CtransformDesc,
res,
CtransformDesc,
(findAlgo == 1 ? (&algo) : NULL),
NULL,
0,
stream_);
cublasLtMatmulDescDestroy(matmulDesc);
cublasLtMatrixLayoutDestroy(AtransformDesc);
cublasLtMatrixLayoutDestroy(BtransformDesc);
cublasLtMatrixLayoutDestroy(CtransformDesc);
sync_check_cuda_error();
mu_->unlock();
}
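/*
 * Worked example of the leading dimensions computed above, for m = 128, n = 100:
 *   ldaTransform = 32 * m                      = 4096
 *   ldcTransform = 32 * m                      = 4096
 *   ldbTransform = 32 * ((n + 31) / 32) * 32   = 32 * 4 * 32 = 4096   (CUBLASLT_ORDER_COL32_2R_4R4)
 *   ldbTransform = 32 * ((n + 7) / 8) * 8      = 32 * 13 * 8 = 3328   (CUBLASLT_ORDER_COL4_4R2_8C)
 * i.e. A and the int32 result stay in 32-column COL32 tiles, while the kernel is padded
 * to 32x32 or 32x8 tiles depending on use_ORDER_COL32_2R_4R4_.
 */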
// for int8 IO cublasLtMM with algo
// ATransform should be m*k CUBLASLT_ORDER_COL32
// kernel should be n*k CUBLASLT_ORDER_COL4_4R2_8C
// res is m*n CUBLASLT_ORDER_COL32
void cublasINT8MMWrapper::Gemm(int8_t* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float alpha,
const int8_t* ATransform,
const int8_t* kernel)
{
mu_->lock();
cublasOperation_t opTranspose = CUBLAS_OP_T;
// int8 gemm does not support CUBLAS_POINTER_MODE_DEVICE
// cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
cudaDataType_t scaleType = CUDA_R_32F;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasLtMatmulDesc_t matmulDesc;
cublasLtMatrixLayout_t AtransformDesc = NULL;
cublasLtMatrixLayout_t BtransformDesc = NULL;
cublasLtMatrixLayout_t CtransformDesc = NULL;
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4_) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4_) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
// create matmulDesc
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmulDesc, computeType, scaleType);
#else
cublasLtMatmulDescCreate(&matmulDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scaleType, sizeof(scaleType));
// cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode,
// sizeof(cublasLtPointerMode_t));
cublasLtMatrixLayoutCreate(&AtransformDesc, CUDA_R_8I, m, k, ldaTransform);
cublasLtMatrixLayoutSetAttribute(AtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
cublasLtMatrixLayoutCreate(&BtransformDesc, CUDA_R_8I, n, k, ldbTransform);
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
cublasLtMatrixLayoutCreate(&CtransformDesc, CUDA_R_8I, m, n, ldcTransform);
cublasLtMatrixLayoutSetAttribute(CtransformDesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (batchCount > 1) {
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
AtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
BtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(
CtransformDesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
}
// get algo
cublasLtMatmulAlgo_t algo;
int findAlgo = 0;
if (cublas_algo_map_->isExist(batchCount, m, n, k, INT8_DATATYPE)) {
findAlgo = 1;
cublasLtMatmulAlgo_info tmp_info = cublas_algo_map_->getAlgo(batchCount, m, n, k, INT8_DATATYPE);
cublasLtMatmulAlgoInit(cublaslt_handle_,
computeType,
CUDA_R_32F,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_8I,
tmp_info.algoId,
&algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(tmp_info.customOption), sizeof(tmp_info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tmp_info.tile), sizeof(tmp_info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(tmp_info.splitK_val), sizeof(tmp_info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(tmp_info.swizzle), sizeof(tmp_info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(tmp_info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(tmp_info.stages), sizeof(tmp_info.stages));
#endif
}
else {
findAlgo = 1;
int algoId;
if (use_ORDER_COL32_2R_4R4_) {
algoId = 7;
}
else {
algoId = 6;
}
int swizzle = 0;
int customOption = 0;
int tile = 20;
int splitK_val = 0;
int reductionScheme = 0;
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, CUDA_R_32F, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, CUDA_R_8I, algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(customOption), sizeof(customOption));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(splitK_val), sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
int stages;
if (use_ORDER_COL32_2R_4R4_) {
stages = 15;
}
else {
stages = 13;
}
cublasLtMatmulAlgoConfigSetAttribute(&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages));
#endif
}
float beta = 0.0f;
cublasLtMatmul(cublaslt_handle_,
matmulDesc,
&alpha,
ATransform,
AtransformDesc,
kernel,
BtransformDesc,
&beta,
res,
CtransformDesc,
res,
CtransformDesc,
(findAlgo == 1 ? (&algo) : NULL),
NULL,
0,
stream_);
cublasLtMatmulDescDestroy(matmulDesc);
cublasLtMatrixLayoutDestroy(AtransformDesc);
cublasLtMatrixLayoutDestroy(BtransformDesc);
cublasLtMatrixLayoutDestroy(CtransformDesc);
sync_check_cuda_error();
mu_->unlock();
}
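/*
 * Note: unlike the int32-output overload above (which uses a fixed alpha of 1), this
 * int8-output path takes a float alpha. In typical int8 inference pipelines (an
 * assumption, not shown in this file) that scalar folds the activation/weight
 * quantization scales and the output requantization scale into a single multiplier, e.g.
 *   alpha = scale_A * scale_B / scale_C;
 * so the int32 accumulator is rescaled directly into the int8 result.
 */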
template<typename T>
int cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight<T>* attention_weights)
{
int fusedINT8QKV_type = 0;
const int8_t* Q_weight = (const int8_t*)(attention_weights->query_weight.kernel);
const int8_t* K_weight = (const int8_t*)(attention_weights->key_weight.kernel);
const int8_t* V_weight = (const int8_t*)(attention_weights->value_weight.kernel);
// Q/K/V weights are stored as DataType_ and are contiguous in memory
if ((attention_weights->query_weight.kernel + n * k == attention_weights->key_weight.kernel)
&& (attention_weights->key_weight.kernel + n * k == attention_weights->value_weight.kernel)) {
fusedINT8QKV_type = 1;
}
// Q/K/V weights are stored as int8 and are contiguous in memory
else if ((Q_weight + n * k == K_weight) && (K_weight + n * k == V_weight)) {
fusedINT8QKV_type = 2;
}
return fusedINT8QKV_type;
}
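/*
 * Summary of the return value above:
 *   0 - the Q/K/V weights are not contiguous; the QKV GEMM cannot be fused.
 *   1 - the three weights are laid out back-to-back when viewed as T (DataType_),
 *       i.e. query + n*k == key and key + n*k == value.
 *   2 - the three weights are back-to-back when viewed as int8.
 * Either non-zero value lets a caller issue one fused [3n, k] GEMM instead of three
 * separate n x k GEMMs (an assumption about how callers use this helper).
 */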
bool cublasINT8MMWrapper::getUseOrderCol322R4R4()
{
return use_ORDER_COL32_2R_4R4_;
}
template int
cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight<float>* attention_weights);
template int
cublasINT8MMWrapper::getFusedINT8QKVType(const int k, const int n, const AttentionWeight<half>* attention_weights);
#ifdef SPARSITY_ENABLED
// A is sparse weight [m,k], non transposed row major
// B is activation input [k, n], non transposed col major
void cublasINT8MMWrapper::SpGemm(
const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C)
{
cudaDataType_t Atype = CUDA_R_8I;
cudaDataType_t Btype = CUDA_R_8I;
cudaDataType_t Ctype = CUDA_R_8I;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I;
cusparseOrder_t col_order = CUSPARSE_ORDER_COL;
cusparseOrder_t row_order = CUSPARSE_ORDER_ROW;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
auto num_A_rows = m;
auto num_A_cols = k;
auto num_B_rows = k;
auto num_B_cols = n;
auto num_C_rows = m;
auto num_C_cols = n;
unsigned alignment = 16;
auto lda = num_A_cols;
auto ldb = num_B_rows;
auto ldc = num_C_rows;
float _beta(0.0f);
char mark[256];
sprintf(mark, "%d_%d_%d_%d", 1, m, n, k);
if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) {
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
else {
// initializing MatDesc takes a lot of time
cusparseLtMatDescriptor_t matA, matB, matC;
sp_mat_A_desc_map_[mark] = matA;
sp_mat_B_desc_map_[mark] = matB;
sp_mat_C_desc_map_[mark] = matC;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&sp_mat_A_desc_map_[mark],
num_A_rows,
num_A_cols,
lda,
alignment,
Atype,
row_order,
CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, num_B_cols, ldb, alignment, Btype, col_order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype, col_order))
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
mu_->lock();
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols);
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size))
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream_};
CHECK_CUSPARSE(
cusparseLtMatmul(&cusparselt_handle_, &plan, &alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
sync_check_cuda_error();
mu_->unlock();
}
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_utils.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <mutex>
#include <string>
#pragma once
namespace fastertransformer {
class cublasINT8MMWrapper: public cublasMMWrapper {
private:
bool use_ORDER_COL32_2R_4R4_;
public:
cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4);
cublasINT8MMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4);
#ifdef SPARSITY_ENABLED
cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle_,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
bool use_ORDER_COL32_2R_4R4);
#endif
~cublasINT8MMWrapper();
cublasINT8MMWrapper(const cublasINT8MMWrapper& wrapper);
void Gemm(int* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const int8_t* ATransform,
const int8_t* kernel);
void Gemm(int8_t* res,
int batchCount,
int m,
int n,
int k,
int64_t stridea,
int64_t strideb,
int64_t stridec,
const float alpha,
const int8_t* ATransform,
const int8_t* kernel);
template<typename T>
int getFusedINT8QKVType(const int k, const int n, const AttentionWeight<T>* attention_weights);
bool getUseOrderCol322R4R4();
#ifdef SPARSITY_ENABLED
void SpGemm(const int m, const int n, const int k, const float alpha, const void* A, const void* B, void* C);
#endif
};
} // namespace fastertransformer
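/*
 * Illustrative construction sketch, assuming the cublasLt handle, stream, algo map and
 * mutex are owned by the surrounding runtime (as in the shared model instance):
 *
 *   cublasLtHandle_t lt_handle;
 *   cublasLtCreate(&lt_handle);
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   fastertransformer::cublasAlgoMap* algo_map = ...;  // created elsewhere from a gemm config file
 *   std::mutex mu;
 *   fastertransformer::cublasINT8MMWrapper int8_wrapper(
 *       lt_handle, stream, algo_map, &mu, true);       // true => CUBLASLT_ORDER_COL32_2R_4R4 for B
 */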
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublasMMWrapper.h"
#include "cuda_utils.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublas_handle_(cublas_handle),
cublaslt_handle_(cublaslt_handle),
stream_(stream),
cublas_algo_map_(cublas_algo_map),
mu_(mu),
allocator_(allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
}
#ifdef SPARSITY_ENABLED
cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
std::mutex* mu,
IAllocator* allocator):
cublas_handle_(cublas_handle),
cublaslt_handle_(cublaslt_handle),
cusparselt_handle_(cusparselt_handle),
stream_(stream),
cublas_algo_map_(cublas_algo_map),
mu_(mu),
allocator_(allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
}
#endif
cublasMMWrapper::~cublasMMWrapper()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_));
allocator_ = nullptr;
}
}
cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
cublas_handle_(wrapper.cublas_handle_),
cublaslt_handle_(wrapper.cublaslt_handle_),
#ifdef SPARSITY_ENABLED
cusparselt_handle_(wrapper.cusparselt_handle_),
#endif
stream_(wrapper.stream_),
cublas_algo_map_(wrapper.cublas_algo_map_),
mu_(wrapper.mu_),
allocator_(wrapper.allocator_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
}
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* alpha,
const void* A,
cudaDataType_t Atype,
int lda,
const void* B,
cudaDataType_t Btype,
int ldb,
const void* beta,
void* C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType,
cublasGemmAlgo_t algo)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype,
lda,
B,
Btype,
ldb,
beta,
C,
Ctype,
ldc,
computeType,
algo));
sync_check_cuda_error();
mu_->unlock();
}
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f);
}
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc,
float f_alpha,
float f_beta)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
half h_alpha = (half)(f_alpha);
half h_beta = (half)(f_beta);
mu_->lock();
// TODO: default cublas libs
int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
bool using_cublasLt = (Atype_ == CUDA_R_16F) ? true : false;
int batch_count = 1;
// fp32 uses cublas by default; fp16 uses cublasLt by default
const void* alpha = is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void*>(&f_beta);
int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(Atype_));
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
if (findAlgo) {
if (info.stages != -1) {
using_cublasLt = true;
}
else {
using_cublasLt = false;
}
}
if (using_cublasLt) {
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaDataType_t scaleType;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (is_fp16_computeType) {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
scaleType = CUDA_R_16F;
}
else {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
scaleType = CUDA_R_32F;
}
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workSpace = cublas_workspace_;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&(info.reductionScheme),
sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
&(info.cluster_shapeId),
sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
}
}
cublasLtMatmul(cublaslt_handle_,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
C,
Cdesc,
(findAlgo == 1 ? (&algo) : NULL),
workSpace,
workspaceSize,
stream_);
cublasLtMatmulDescDestroy(operationDesc);
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
sync_check_cuda_error();
}
else {
int cublasAlgo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype_,
lda,
B,
Btype_,
ldb,
beta,
C,
Ctype_,
ldc,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo)));
sync_check_cuda_error();
}
mu_->unlock();
}
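/*
 * Note on the dispatch above: FP16 inputs default to the cublasLt path and everything
 * else to cublasGemmEx, but a tuned entry in the algo map overrides this; info.stages != -1
 * selects cublasLt with the recorded algo configuration, while stages == -1 falls back to
 * cublasGemmEx with the recorded algoId. Entries are assumed to come from an offline GEMM
 * profiling sweep; the sweep itself is not part of this file.
 */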
void cublasMMWrapper::setFP32GemmConfig()
{
Atype_ = CUDA_R_32F;
Btype_ = CUDA_R_32F;
Ctype_ = CUDA_R_32F;
computeType_ = CUDA_R_32F;
}
void cublasMMWrapper::setFP16GemmConfig()
{
Atype_ = CUDA_R_16F;
Btype_ = CUDA_R_16F;
Ctype_ = CUDA_R_16F;
computeType_ = CUDA_R_32F;
}
#ifdef ENABLE_BF16
void cublasMMWrapper::setBF16GemmConfig()
{
Atype_ = CUDA_R_16BF;
Btype_ = CUDA_R_16BF;
Ctype_ = CUDA_R_16BF;
computeType_ = CUDA_R_32F;
}
#endif
void cublasMMWrapper::setGemmConfig(cudaDataType_t aType,
cudaDataType_t bType,
cudaDataType_t cType,
cudaDataType_t computeType)
{
Atype_ = aType;
Btype_ = bType;
Ctype_ = cType;
computeType_ = computeType;
}
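/*
 * Illustrative usage sketch: the data-type setters above configure all subsequent Gemm()
 * calls on this wrapper. Buffers, leading dimensions and the wrapper itself are assumed
 * to exist elsewhere.
 *
 *   wrapper.setFP16GemmConfig();            // A/B/C in FP16, FP32 compute type
 *   // C[m, n] = A * B in cuBLAS's column-major convention
 *   wrapper.Gemm(CUBLAS_OP_N, CUBLAS_OP_N,
 *                m, n, k,
 *                d_A, lda,
 *                d_B, ldb,
 *                d_C, ldc);                 // alpha = 1.0f, beta = 0.0f
 */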
CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type)
{
if (data_type == CUDA_R_16F) {
return HALF_DATATYPE;
}
else if (data_type == CUDA_R_32F) {
return FLOAT_DATATYPE;
}
#ifdef ENABLE_BF16
else if (data_type == CUDA_R_16BF) {
return BFLOAT16_DATATYPE;
}
#endif
return FLOAT_DATATYPE;
}
#if (CUDART_VERSION >= 11000)
// input, weight, output are row-major
// only works for cublas 11.x
void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t Atype, Btype, Ctype;
cublasComputeType_t computeType;
cudaDataType_t scaleType;
float alpha_float = 1.0f;
float beta_float = 0.0f;
half alpha_half = half(1.0f);
half beta_half = half(0.0f);
void * alpha, *beta;
// int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
if (Atype_ == CUDA_R_32F) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
Atype = CUDA_R_32F;
Btype = CUDA_R_32F;
Ctype = CUDA_R_32F;
scaleType = CUDA_R_32F;
alpha = &alpha_float;
beta = &beta_float;
}
else if (Atype_ == CUDA_R_16BF) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
Atype = CUDA_R_16BF;
Btype = CUDA_R_16BF;
Ctype = CUDA_R_16BF;
scaleType = CUDA_R_32F;
alpha = &alpha_float;
beta = &beta_float;
}
else {
computeType = CUBLAS_COMPUTE_16F;
Atype = CUDA_R_16F;
Btype = CUDA_R_16F;
Ctype = CUDA_R_16F;
scaleType = CUDA_R_16F;
alpha = &alpha_half;
beta = &beta_half;
}
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc);
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
check_cuda_error(cublasLtMatmul(
cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
cublasLtMatmulDescDestroy(operationDesc);
}
#endif
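/*
 * Note on the bias variant above: it runs the same cublasLt matmul but attaches
 * CUBLASLT_EPILOGUE_BIAS, so the bias vector is added inside the GEMM kernel instead of
 * in a separate elementwise launch. Per the cublasLt contract the bias has one element
 * per row of the output matrix as cublasLt sees it (length m for the m x n layout created
 * above). FP32 and BF16 inputs take the TF32 fast path (CUBLAS_COMPUTE_32F_FAST_TF32);
 * FP16 accumulates in FP16.
 */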
void cublasMMWrapper::setStream(cudaStream_t stream)
{
stream_ = stream;
}
void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const int64_t strideA,
const void* B,
const int ldb,
const int64_t strideB,
void* C,
const int ldc,
const int64_t strideC,
const int batch_count,
const float f_alpha,
const float f_beta)
{
half h_alpha = (half)f_alpha;
half h_beta = (half)f_beta;
mu_->lock();
int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
const void* alpha =
is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<const void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<const void*>(&f_beta);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype_,
lda,
strideA,
B,
Btype_,
ldb,
strideB,
beta,
C,
Ctype_,
ldc,
strideC,
batch_count,
computeType_,
static_cast<cublasGemmAlgo_t>(info.algoId)));
mu_->unlock();
}
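/*
 * Worked example for the strided batched call above: to multiply batch_count independent
 * m x k by k x n problems stored back-to-back (in cuBLAS's column-major view), the strides
 * are simply the per-matrix element counts. Buffers are assumed to be allocated elsewhere.
 *
 *   int64_t strideA = (int64_t)m * k;
 *   int64_t strideB = (int64_t)k * n;
 *   int64_t strideC = (int64_t)m * n;
 *   wrapper.stridedBatchedGemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
 *                              d_A, lda, strideA,
 *                              d_B, ldb, strideB,
 *                              d_C, ldc, strideC,
 *                              batch_count, 1.0f, 0.0f);
 */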
void cublasMMWrapper::stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const float f_alpha,
const void* A,
cudaDataType_t AType,
const int lda,
const int64_t strideA,
const void* B,
cudaDataType_t BType,
const int ldb,
const int64_t strideB,
const float f_beta,
void* C,
cudaDataType_t CType,
const int ldc,
const int64_t strideC,
const int batch_count,
cudaDataType_t computeType)
{
half h_alpha = (half)f_alpha;
half h_beta = (half)f_beta;
mu_->lock();
int is_fp16_computeType = computeType == CUDA_R_16F ? 1 : 0;
const void* alpha =
is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<const void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<const void*>(&f_beta);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
AType,
lda,
strideA,
B,
BType,
ldb,
strideB,
beta,
C,
CType,
ldc,
strideC,
batch_count,
computeType,
static_cast<cublasGemmAlgo_t>(info.algoId)));
mu_->unlock();
}
void cublasMMWrapper::batchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* const* A,
const int lda,
const void* const* B,
const int ldb,
void* const* C,
const int ldc,
const int batch_count)
{
float f_alpha = static_cast<float>(1.0f);
float f_beta = static_cast<float>(0.0f);
half h_alpha = (half)1.0f;
half h_beta = (half)0.0f;
mu_->lock();
int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
const void* alpha = is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void*>(&f_beta);
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
check_cuda_error(cublasGemmBatchedEx(cublas_handle_,
transa,
transb,
m,
n,
k,
alpha,
A,
Atype_,
lda,
B,
Btype_,
ldb,
beta,
C,
Ctype_,
ldc,
batch_count,
computeType_,
static_cast<cublasGemmAlgo_t>(info.algoId)));
mu_->unlock();
}
bool cublasMMWrapper::isFuseBatchGemm(const int batch_count, const int m, const int k, const int n)
{
CublasDataType data_type = getCublasDataType(Atype_);
if (cublas_algo_map_->isExist(batch_count, m, k, n, data_type) == false
|| cublas_algo_map_->isExist(1, m, k, n, data_type) == false) {
return false;
}
else {
return cublas_algo_map_->getAlgo(batch_count, m, k, n, data_type).exec_time
< 3 * cublas_algo_map_->getAlgo(1, m, k, n, data_type).exec_time;
}
}
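/*
 * Note: the heuristic above fuses the batch only when the algo map's recorded timing for
 * the batched shape is cheaper than roughly three single GEMMs of the same shape (the
 * factor 3 is hard-coded above). Illustrative numbers: if the batched shape ran in 2.1 ms
 * and the single-GEMM shape in 0.9 ms, then 2.1 < 3 * 0.9 = 2.7 and isFuseBatchGemm()
 * returns true; at 3.0 ms it would return false.
 */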
#ifdef SPARSITY_ENABLED
void cublasMMWrapper::SpGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const void* B,
void* C)
{
if (Atype_ != CUDA_R_16F || Btype_ != CUDA_R_16F || Ctype_ != CUDA_R_16F) {
throw std::runtime_error("\n[FT][ERROR] sparse GEMM only supports FP16 data type now.");
}
static bool not_printed_fp32_accumulation_warning = true;
if (computeType_ != CUDA_R_16F && not_printed_fp32_accumulation_warning) {
printf("[FT][WARNING] cublasMMWrapper sets to FP32 compute type, "
"but sparse gemm will use FP16 compute type since cusparselt "
"supports FP16 accumulation only.\n");
not_printed_fp32_accumulation_warning = false;
}
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = (transa == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
cusparseOperation_t opB = (transb == CUBLAS_OP_N) ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F;
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
bool is_rowmajor = (order == CUSPARSE_ORDER_ROW);
bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE);
bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE);
auto num_A_rows = (isA_transposed) ? k : m;
auto num_A_cols = (isA_transposed) ? m : k;
auto num_B_rows = (isB_transposed) ? n : k;
auto num_B_cols = (isB_transposed) ? k : n;
auto num_C_rows = m;
auto num_C_cols = n;
unsigned alignment = 16;
auto lda = (is_rowmajor) ? num_A_cols : num_A_rows;
auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows;
auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows;
float _alpha(1.0f);
float _beta(0.0f);
char mark[256];
sprintf(mark, "%d_%d_%d_%d", 1, m, n, k);
if (sp_mat_A_desc_map_.find(mark) != sp_mat_A_desc_map_.end()) {
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
else {
// initializing MatDesc takes a lot of time
cusparseLtMatDescriptor_t matA, matB, matC;
sp_mat_A_desc_map_[mark] = matA;
sp_mat_B_desc_map_[mark] = matB;
sp_mat_C_desc_map_[mark] = matC;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&sp_mat_A_desc_map_[mark],
num_A_rows,
num_A_cols,
lda,
alignment,
Atype_,
order,
CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_B_desc_map_[mark], num_B_rows, num_B_cols, ldb, alignment, Btype_, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &sp_mat_C_desc_map_[mark], num_C_rows, num_C_cols, ldc, alignment, Ctype_, order))
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&sp_mat_A_desc_map_[mark],
&sp_mat_B_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
&sp_mat_C_desc_map_[mark],
compute_type))
}
mu_->lock();
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
int alg = cublas_algo_map_->getSpAlgo(1, num_A_rows, num_B_cols, num_A_cols);
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size))
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream_};
CHECK_CUSPARSE(
cusparseLtMatmul(&cusparselt_handle_, &plan, &_alpha, A, B, &_beta, C, C, d_workspace, streams, num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
sync_check_cuda_error();
mu_->unlock();
}
size_t cublasMMWrapper::getSparseMatrixSize(int m, int k)
{
// Get a compressed matrix size of shape (m, k) used in cusparselt.
auto Atype_ = CUDA_R_16F;
cusparseOrder_t order = CUSPARSE_ORDER_COL;
unsigned alignment = 16;
int num_A_rows = m;
int num_A_cols = k;
int lda = num_A_rows;
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&matA,
num_A_rows,
num_A_cols,
lda,
alignment,
Atype_,
order,
CUSPARSELT_SPARSITY_50_PERCENT));
size_t compressed_size = 0;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &matA, &compressed_size));
return compressed_size;
}
void cublasMMWrapper::compressMatrix(const void* input, void* output, const int m, const int k)
{
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseLtMatDescriptor_t matA;
unsigned alignment = 16;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&cusparselt_handle_, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &matA, true, opA, input, output, stream_))
sync_check_cuda_error();
}
bool cublasMMWrapper::isUseSparse(const int batch_count, const int m, const int n, const int k)
{
return cublas_algo_map_->isUseSparse(batch_count, m, n, k);
}
#endif
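/*
 * Illustrative sketch of the FP16 sparse path above, only meaningful when
 * SPARSITY_ENABLED is defined. The weight must first be compressed into cusparseLt's
 * 2:4 structured-sparse format; device allocation is assumed to happen elsewhere.
 *
 *   size_t compressed_bytes = wrapper.getSparseMatrixSize(m, k);
 *   void*  d_compressed     = nullptr;   // allocate compressed_bytes on the device
 *   wrapper.compressMatrix(d_weight_fp16, d_compressed, m, k);
 *   // C[m, n] = compressed_A[m, k] * B[k, n], FP16 in/out
 *   wrapper.SpGemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, d_compressed, d_B, d_C);
 */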
std::pair<bool, cublasLtMatmulAlgo_t> cublasMMWrapper::findBestAlgo(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
cudaStream_t stream)
{
#if (CUBLAS_VERSION) <= 11601
FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
return {false, cublasLtMatmulAlgo_t{}};
#else
size_t returnSize;
int32_t pointer_mode;
cublasLtMatmulDescGetAttribute(
computeDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode), &returnSize);
std::vector<cublasLtMatmulHeuristicResult_t> heuristics(200);
cublasLtMatmulPreference_t preference;
check_cuda_error(cublasLtMatmulPreferenceCreate(&preference));
check_cuda_error(cublasLtMatmulPreferenceInit(preference));
uint64_t workspace_size = CUBLAS_WORKSPACE_SIZE;
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size)));
#if (CUBLAS_VERSION) <= 12000
uint32_t pointer_mode_mask = 0;
check_cuda_error(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &pointer_mode_mask, sizeof(pointer_mode_mask)));
#endif
int return_count = 0;
auto ret = cublasLtMatmulAlgoGetHeuristic(lightHandle,
computeDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
preference,
heuristics.size(),
heuristics.data(),
&return_count);
heuristics.resize(return_count);
std::map<int, std::vector<float>> algo_results;
for (const auto& heuristic : heuristics) {
cublasLtMatmulAlgo_t algo = heuristic.algo;
int32_t algo_id;
cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize);
cudaEvent_t start_event, stop_event;
cudaEventCreate(&start_event);
cudaEventCreate(&stop_event);
float my_alpha = 1.0f;
float my_beta = 0.0f;
for (int i = 0; i < 11; i++) {
float duration_ms;
cudaEventRecord(start_event, stream);
check_cuda_error(cublasLtMatmul(lightHandle,
computeDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
cublas_workspace_,
CUBLAS_WORKSPACE_SIZE,
stream));
cudaEventRecord(stop_event, stream);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&duration_ms, start_event, stop_event);
algo_results[algo_id].push_back(duration_ms);
}
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
std::sort(algo_results[algo_id].begin(), algo_results[algo_id].end());
}
cublasLtMatmulHeuristicResult_t result;
float best_time = INFINITY;
for (const auto& heuristic : heuristics) {
cublasLtMatmulAlgo_t algo = heuristic.algo;
int32_t algo_id;
cublasLtMatmulAlgoConfigGetAttribute(&algo, CUBLASLT_ALGO_CONFIG_ID, &algo_id, sizeof(algo_id), &returnSize);
const auto& results = algo_results[algo_id];
if (results.size() > 0 && results[5] < best_time) {
best_time = results[5];
result = heuristic;
}
}
return {best_time != INFINITY, result.algo};
#endif
}
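/*
 * Note: findBestAlgo() times every heuristic candidate 11 times, sorts the per-algo
 * timings and compares the medians (results[5]), so a single noisy launch does not pick
 * the wrong algorithm. The chosen algo is then cached by cublasLtMatmulWrapper() below,
 * keyed on the compute descriptor and the four matrix layouts.
 */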
cublasMMWrapper::MatrixLayout cublasMMWrapper::createMatrixLayout(cublasLtMatrixLayout_t Mdesc)
{
size_t returnSize;
MatrixLayout m_layout;
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_TYPE, &std::get<0>(m_layout), sizeof(std::get<0>(m_layout)), &returnSize);
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &std::get<1>(m_layout), sizeof(std::get<1>(m_layout)), &returnSize);
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_ROWS, &std::get<2>(m_layout), sizeof(std::get<2>(m_layout)), &returnSize);
cublasLtMatrixLayoutGetAttribute(
Mdesc, CUBLASLT_MATRIX_LAYOUT_COLS, &std::get<3>(m_layout), sizeof(std::get<3>(m_layout)), &returnSize);
return m_layout;
}
cublasStatus_t cublasMMWrapper::cublasLtMatmulWrapper(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t* algo,
void* workspace,
size_t workspaceSizeInBytes,
cudaStream_t stream)
{
cache_idx_t cache_idx{
computeDesc,
{createMatrixLayout(Adesc), createMatrixLayout(Bdesc), createMatrixLayout(Cdesc), createMatrixLayout(Ddesc)}};
cublasLtMatmulAlgo_t algo_value;
bool found_algo = false;
if (algo == nullptr) {
if (algo_cache.find(cache_idx) == algo_cache.end()) {
auto result =
findBestAlgo(lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, D, Ddesc, stream);
if (result.first) {
algo_cache[cache_idx] = result.second;
algo_value = result.second;
found_algo = true;
}
}
else {
algo_value = algo_cache[cache_idx];
found_algo = true;
}
}
return cublasLtMatmul(lightHandle,
computeDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
found_algo ? &algo_value : algo,
workspace,
workspaceSizeInBytes,
stream);
}
void cublasMMWrapper::_Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
void* C,
const int ldc,
const void* alpha,
const int mode,
const bool per_column_scaling)
{
/* mode:
* - 0: int8 * int8 -> int32 -> int8
* - 1: int8 * int8 -> int32 -> int32
*/
#if (CUBLAS_VERSION) <= 11601
FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
#else
mu_->lock();
const auto op_a = CUBLAS_OP_T;
const auto op_b = CUBLAS_OP_N;
const auto dataType = CUDA_R_8I;
const auto resultType = mode == 0 ? CUDA_R_8I : CUDA_R_32I;
const auto computeType = CUBLAS_COMPUTE_32I;
const auto scaleType = mode == 0 ? CUDA_R_32F : CUDA_R_32I;
const int batch_count = 1;
const void* beta;
int findAlgo = cublas_algo_map_->isExist(batch_count, m, n, k, getCublasDataType(dataType));
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(dataType));
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// --------------------------------------
// Create descriptors for the original matrices
check_cuda_error(cublasLtMatrixLayoutCreate(&Adesc, dataType, k, m, lda));
check_cuda_error(cublasLtMatrixLayoutCreate(&Bdesc, dataType, k, n, ldb));
check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, resultType, m, n, ldc));
check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType));
auto pointer_mode = CUBLASLT_POINTER_MODE_HOST;
if (mode == 0) {
pointer_mode =
per_column_scaling ? CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST : CUBLASLT_POINTER_MODE_DEVICE;
}
check_cuda_error(
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &op_a, sizeof(cublasOperation_t)));
check_cuda_error(
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &op_b, sizeof(cublasOperation_t)));
check_cuda_error(
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSC, &op_b, sizeof(cublasOperation_t)));
check_cuda_error(cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, sizeof(pointer_mode)));
const int32_t int_one = 1;
const int32_t int_zero = 0;
const float float_zero = 0;
if (mode == 0) {
beta = per_column_scaling ? &float_zero : NULL;
}
else {
alpha = &int_one;
beta = &int_zero;
}
cublasLtMatmulAlgo_t algo;
void* workSpace = cublas_workspace_;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
sync_check_cuda_error();
auto ret = cublasLtMatmulWrapper(cublaslt_handle_,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
C,
Cdesc,
NULL,
workSpace,
workspaceSize,
stream_);
check_cuda_error(ret);
sync_check_cuda_error();
cublasLtMatmulDescDestroy(operationDesc);
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
sync_check_cuda_error();
mu_->unlock();
#endif
}
void cublasMMWrapper::Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int8_t* C,
const int ldc,
const float* alpha,
const bool per_column_scaling)
{
return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, alpha, 0, per_column_scaling);
}
void cublasMMWrapper::Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int32_t* C,
const int ldc)
{
return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false);
}
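/*
 * Illustrative usage sketch for the two public Int8Gemm overloads above. Device buffers
 * and the scale data are assumed to be prepared elsewhere; the required length of the
 * per-channel scale vector follows cublasLt's ALPHA_DEVICE_VECTOR rule.
 *
 *   // mode 0: int8 x int8 -> int32 accumulate -> int8 output, scaled by a device-side alpha
 *   wrapper.Int8Gemm(m, n, k, d_A_int8, lda, d_B_int8, ldb,
 *                    d_C_int8, ldc, d_alpha_scales, true);   // true => per-column scaling
 *
 *   // mode 1: int8 x int8 -> plain int32 output (alpha = 1, beta = 0)
 *   wrapper.Int8Gemm(m, n, k, d_A_int8, lda, d_B_int8, ldb, d_C_int32, ldc);
 */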
} // namespace fastertransformer