[feat] refactored extension module (#5298)

* [feat] refactored extension module * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish

[feat] refactored extension module (#5298)
* [feat] refactored extension module * polish * polish * polish * polish * polish * polish * polish * polish * polish * polish
7cfed5f0 · Frank Lee · GitHub · d7f8db8e · 7cfed5f0 · 7cfed5f0
Unverified Commit 7cfed5f0 authored Jan 25, 2024 by Frank Lee Committed by GitHub Jan 25, 2024
20 changed files
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -140,7 +140,7 @@ jobs:
      - name: Install Colossal-AI
        run: |
-          CUDA_EXT=1 pip install -v -e .
+          BUILD_EXT=1 pip install -v -e .
          pip install -r requirements/requirements-test.txt
      - name: Store Colossal-AI Cache

--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -55,7 +55,7 @@ jobs:
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
-          CUDA_EXT=1 pip install -v -e .
+          BUILD_EXT=1 pip install -v -e .
          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
          pip install -r requirements/requirements-test.txt

--- a/MANIFEST.in
+++ b/MANIFEST.in
 include *.txt README.md
 recursive-include requirements *.txt
 recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
-recursive-include op_builder *.py
+recursive-include extensions *.py *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
--- a/colossalai/accelerator/base_accelerator.py
+++ b/colossalai/accelerator/base_accelerator.py
@@ -48,6 +48,12 @@ class BaseAccelerator(ABC):
    # =======================
    # device APIs
    # =======================
+    @abstractmethod
+    def get_version(self) -> str:
+        """
+        Return the version of the accelerator which torch is built against.
+        """
    @abstractmethod
    def get_current_device(self) -> torch.device:
        """
@@ -66,6 +72,7 @@ class BaseAccelerator(ABC):
        Bind the current process to a device.
        """
+    @abstractmethod
    def get_device_name(self, device: Union[torch.device, int]) -> str:
        """
        Return the name of the device.

--- a/colossalai/accelerator/cpu_accelerator.py
+++ b/colossalai/accelerator/cpu_accelerator.py
@@ -24,6 +24,12 @@ class CpuAccelerator(BaseAccelerator):
    # =======================
    # device APIs
    # =======================
+    def get_version(self) -> str:
+        """
+        Return the version of the accelerator which torch is built against.
+        """
+        return ""
    def get_current_device(self) -> torch.device:
        """
        Return the current device.

--- a/colossalai/accelerator/cuda_accelerator.py
+++ b/colossalai/accelerator/cuda_accelerator.py
@@ -21,6 +21,12 @@ class CudaAccelerator(BaseAccelerator):
    # =======================
    # device APIs
    # =======================
+    def get_version(self) -> str:
+        """
+        Return the version of the accelerator which torch is built against.
+        """
+        return torch.version.cuda
    def get_current_device(self) -> torch.device:
        """
        Return the current device.

--- a/colossalai/accelerator/npu_accelerator.py
+++ b/colossalai/accelerator/npu_accelerator.py
@@ -27,6 +27,12 @@ class NpuAccelerator(BaseAccelerator):
    # =======================
    # device APIs
    # =======================
+    def get_version(self) -> str:
+        """
+        Return the version of the accelerator which torch is built against.
+        """
+        return torch.version.npu
    def get_current_device(self) -> torch.device:
        """
        Return the current device.

--- a/colossalai/kernel/__init__.py
+++ b/colossalai/kernel/__init__.py
-from .cpu_adam_loader import CPUAdamLoader
-from .cuda_native import FusedScaleMaskSoftmax, LayerNorm, MultiHeadAttention
-from .extensions.flash_attention import AttnMaskType
-from .flash_attention_loader import ColoAttention, FlashAttentionLoader
-__all__ = [
-    "LayerNorm",
-    "FusedScaleMaskSoftmax",
-    "MultiHeadAttention",
-    "CPUAdamLoader",
-    "FlashAttentionLoader",
-    "ColoAttention",
-    "AttnMaskType",
-]
--- a/colossalai/kernel/base_kernel_loader.py
+++ b/colossalai/kernel/base_kernel_loader.py
-from abc import ABC, abstractmethod
-from typing import Dict, List
-from .extensions.base_extension import BaseExtension
-class BaseKernelLoader(ABC):
-    """
-    Usage:
-        kernel_loader = KernelLoader()
-        kernel = kernel_loader.load()
-    """
-    def __init__(self, extension_map: Dict[str, BaseExtension], supported_device: List[str]):
-        self._extension_map = extension_map
-        self._supported_device = supported_device
-    def run_checks(self):
-        # run supported device check and other possible checks
-        pass
-    @abstractmethod
-    def fetch_kernel(self):
-        pass
-    def load(self):
-        self.run_checks()
-        return self.fetch_kernel()
--- a/colossalai/kernel/cpu_adam_loader.py
+++ b/colossalai/kernel/cpu_adam_loader.py
-import platform
-from collections import OrderedDict
-from .base_kernel_loader import BaseKernelLoader
-from .extensions.cpu_adam import ArmCPUAdamExtension, X86CPUAdamExtension
-class CPUAdamLoader(BaseKernelLoader):
-    """
-    CPU Adam Loader
-    Usage:
-        # init
-        cpu_adam = CPUAdamLoader().load()
-        cpu_adam_op = cpu_adam.CPUAdamOptimizer(
-            alpha, beta1, beta2, epsilon, weight_decay, adamw_mode,
-        )
-        ...
-        # optim step
-        cpu_adam_op.step(
-            step, lr, beta1, beta2, epsilon, weight_decay, bias_correction,
-            params, grads, exp_avg, exp_avg_sq, loss_scale,
-        )
-    Args:
-        func CPUAdamOptimizer:
-            alpha (float): learning rate. Default to 1e-3.
-            beta1 (float): coefficients used for computing running averages of gradient. Default to 0.9.
-            beta2 (float): coefficients used for computing running averages of its square. Default to 0.99.
-            epsilon (float): term added to the denominator to improve numerical stability. Default to 1e-8.
-            weight_decay (float): weight decay (L2 penalty). Default to 0.
-            adamw_mode (bool): whether to use the adamw. Default to True.
-        func step:
-            step (int): current step.
-            lr (float): learning rate.
-            beta1 (float): coefficients used for computing running averages of gradient.
-            beta2 (float): coefficients used for computing running averages of its square.
-            epsilon (float): term added to the denominator to improve numerical stability.
-            weight_decay (float): weight decay (L2 penalty).
-            bias_correction (bool): whether to use bias correction.
-            params (torch.Tensor): parameter.
-            grads (torch.Tensor): gradient.
-            exp_avg (torch.Tensor): exp average.
-            exp_avg_sq (torch.Tensor): exp average square.
-            loss_scale (float): loss scale value.
-    """
-    def __init__(self):
-        super().__init__(
-            extension_map=OrderedDict(
-                arm=ArmCPUAdamExtension,
-                x86=X86CPUAdamExtension,
-            ),
-            supported_device=["cpu"],
-        )
-    def fetch_kernel(self):
-        if platform.machine() == "x86_64":
-            kernel = self._extension_map["x86"]().fetch()
-        elif platform.machine() in ["aarch64", "aarch64_be", "armv8b", "armv8l"]:
-            kernel = self._extension_map["arm"]().fetch()
-        else:
-            raise Exception("not supported")
-        return kernel
--- a/colossalai/kernel/cuda_native/csrc/gptq/column_remap.cu
+++ b/colossalai/kernel/cuda_native/csrc/gptq/column_remap.cu
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#include "column_remap.cuh"
-#include "util.cuh"
-const int SHUF_BLOCKSIZE_X = 256;
-const int SHUF_BLOCKSIZE_Y = 16;
-__global__ void column_remap_kernel
-(
-    const half* __restrict__ x,
-    half* __restrict__ x_new,
-    const int x_width,
-    const int x_height,
-    const uint32_t* x_map
-)
-{
-    int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
-    int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y;
-    if (x_column >= x_width) return;
-    //if (x_row >= x_height) return;
-    int x_stride = x_width;
-    int x_idx = x_row * x_stride + x_column;
-    int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height);
-    int x_idx_end = x_row_end * x_stride + x_column;
-    int s_column = x_map[x_column];
-    int s_idx = x_row * x_stride + s_column;
-    while (x_idx < x_idx_end)
-    {
-        x_new[x_idx] = x[s_idx];
-        x_idx += x_stride;
-        s_idx += x_stride;
-    }
-}
-// Remap columns in x to correspond to sequential group index before matmul
-//
-// perform x -> seq_x such that seq_x @ seq_w == x @ w
-void column_remap_cuda
-(
-    const half* x,
-    half* x_new,
-    const int x_height,
-    const int x_width,
-    const uint32_t* x_map
-)
-{
-    dim3 threads(SHUF_BLOCKSIZE_X, 1, 1);
-    dim3 blocks
-    (
-        (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X,
-        (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y,
-        1
-    );
-    column_remap_kernel<<<blocks, threads>>>(x, x_new, x_width, x_height, x_map);
-}
--- a/colossalai/kernel/cuda_native/csrc/gptq/column_remap.cuh
+++ b/colossalai/kernel/cuda_native/csrc/gptq/column_remap.cuh
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#ifndef _column_remap_cuh
-#define _column_remap_cuh
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-void column_remap_cuda
-(
-    const half* x,
-    half* x_new,
-    const int x_height,
-    const int x_width,
-    const uint32_t* x_map
-);
-#endif
--- a/colossalai/kernel/cuda_native/csrc/gptq/cu_compat.cuh
+++ b/colossalai/kernel/cuda_native/csrc/gptq/cu_compat.cuh
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#ifndef _cuda_compat_cuh
-#define _cuda_compat_cuh
-// atomicAdd for half types, to support CC < 7.x
-__device__ __forceinline__ void atomicAdd_half(half* address, half val)
-{
-    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-    do
-    {
-        assumed = old;
-        __half_raw hsum;
-        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        half tmpres = __hadd(hsum, val);
-        hsum = __half_raw(tmpres);
-        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
-        old = atomicCAS(address_as_ui, assumed, old);
-    }
-    while (assumed != old);
-}
-// atomicAdd for half2 types
-__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
-{
-    unsigned int* address_as_ui = (unsigned int*)address;
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-    do
-    {
-        assumed = old;
-        half2 old_val = *((half2*)&old);
-        half2 new_val = __hadd2(old_val, val);
-        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
-    }
-    while (assumed != old);
-}
-//
-#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
-#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
-#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
-#endif
-#endif
-#endif
-#endif
--- a/colossalai/kernel/cuda_native/csrc/gptq/cuda_buffers.cu
+++ b/colossalai/kernel/cuda_native/csrc/gptq/cuda_buffers.cu
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#define _cuda_buffers_cu
-#include "cuda_buffers.cuh"
-CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
-// __constant__ half2 q4_table[16][256];
-// half2 q4_table_host[16][256];
-// bool q4_table_init = false;
-CudaBuffers::CudaBuffers
-(
-    int _device,
-    int _temp_state_size,
-    half* _temp_state,
-    half* _temp_dq
-) :
-    device(_device),
-    temp_state_size(_temp_state_size),
-    temp_state(_temp_state),
-    temp_dq(_temp_dq)
-{
-    cudaSetDevice(_device);
-    cudaStreamCreate(&alt_stream_1);
-    cudaStreamCreate(&alt_stream_2);
-    cudaStreamCreate(&alt_stream_3);
-    cudaEventCreate(&alt_stream_1_done);
-    cudaEventCreate(&alt_stream_2_done);
-    cudaEventCreate(&alt_stream_3_done);
-}
-CudaBuffers::~CudaBuffers()
-{
-    cudaStreamDestroy(alt_stream_1);
-    cudaStreamDestroy(alt_stream_2);
-    cudaStreamDestroy(alt_stream_3);
-    cudaEventDestroy(alt_stream_1_done);
-    cudaEventDestroy(alt_stream_2_done);
-    cudaEventDestroy(alt_stream_3_done);
-}
-CudaBuffers* get_buffers(const int device_index)
-{
-    return g_buffers[device_index];
-}
-void prepare_buffers_cuda
-(
-    int _device,
-    int _temp_state_size,
-    half* _temp_state,
-    half* _temp_dq
-)
-{
-    CudaBuffers* buffers = new CudaBuffers
-    (
-        _device,
-        _temp_state_size,
-        _temp_state,
-        _temp_dq
-    );
-    g_buffers[_device] = buffers;
-}
-void cleanup_buffers_cuda()
-{
-    for (int i = 0; i < CUDA_MAX_DEVICES; i++)
-    {
-        if (!g_buffers[i]) continue;
-        delete g_buffers[i];
-        g_buffers[i] = NULL;
-    }
-}
--- a/colossalai/kernel/cuda_native/csrc/gptq/cuda_buffers.cuh
+++ b/colossalai/kernel/cuda_native/csrc/gptq/cuda_buffers.cuh
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#ifndef _cuda_buffers_cuh
-#define _cuda_buffers_cuh
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-const int CUDA_MAX_DEVICES = 16;
-// #ifndef _cuda_buffers_cu
-// extern __constant__ half2 q4_table[16][256];
-// #endif
-class CudaBuffers
-{
-public:
-    int device;
-    half* temp_state;           // [max_hidden_rows * intermediate_size]
-    int temp_state_size;
-    half* temp_dq;              // size of largest quant tensor * 8
-    cudaStream_t alt_stream_1;
-    cudaStream_t alt_stream_2;
-    cudaStream_t alt_stream_3;
-    cudaEvent_t alt_stream_1_done;
-    cudaEvent_t alt_stream_2_done;
-    cudaEvent_t alt_stream_3_done;
-    CudaBuffers
-    (
-        int _device,
-        int _temp_state_size,
-        half* _temp_state,
-        half* _temp_dq
-    );
-    ~CudaBuffers();
-};
-CudaBuffers* get_buffers(const int device_index);
-void prepare_buffers_cuda
-(
-    int _device,
-    int _temp_state_size,
-    half* _temp_state,
-    half* _temp_dq
-);
-void cleanup_buffers_cuda();
-#endif
--- a/colossalai/kernel/cuda_native/csrc/gptq/hip_compat.cuh
+++ b/colossalai/kernel/cuda_native/csrc/gptq/hip_compat.cuh
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#ifndef _hip_compat_cuh
-#define _hip_compat_cuh
-// Workaround for a bug in hipamd, backported from upstream.
-__device__ __forceinline__ __half __compat_hrcp(__half x) {
-    return __half_raw{
-        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
-}
-__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-    return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
-        static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
-}
-#define hrcp __compat_hrcp
-#define h2rcp __compat_h2rcp
-// Workaround for hipify_python using rocblas instead of hipblas.
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int                m,
-                                                               int                n,
-                                                               int                k,
-                                                               const half*        alpha,
-                                                               const half*        AP,
-                                                               int                lda,
-                                                               const half*        BP,
-                                                               int                ldb,
-                                                               const half*        beta,
-                                                               half*              CP,
-                                                               int                ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define rocblas_handle hipblasHandle_t
-#define rocblas_operation_none HIPBLAS_OP_N
-#define rocblas_get_stream hipblasGetStream
-#define rocblas_set_stream hipblasSetStream
-#define rocblas_hgemm __compat_hipblasHgemm
-#endif
--- a/colossalai/kernel/cuda_native/csrc/gptq/linear_gptq.cpp
+++ b/colossalai/kernel/cuda_native/csrc/gptq/linear_gptq.cpp
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#include <torch/extension.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include "util.cuh"
-#include "tuning.h"
-#include "cuda_buffers.cuh"
-#include "q4_matrix.cuh"
-#include "q4_matmul.cuh"
-#include "column_remap.cuh"
-// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
-// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
-// exceptions, CUDA functions return with a cudaError_t which we can parse and dump to the console.
-void check_cuda(cudaError_t ret)
-{
-    switch (ret)
-    {
-        case cudaSuccess:
-            break;
-        case cudaUnspecified:
-            printf(" **** Unspecified error\n");
-            TORCH_CHECK(false, "CUDA error");
-            break;
-        default:
-            printf(" **** CUDA error\n"); \
-            printf(" **** %s\n", cudaGetErrorString(ret)); \
-            TORCH_CHECK(false, "CUDA error"); \
-            break;
-    }
-}
-// Some decluttering macros
-#define STRINGIFY_(__x) #__x
-#define STRINGIFY(__x) STRINGIFY_(__x)
-#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))
-#define TORCH_CHECK_BUFFER_SIZE(__buffer, __minimum_size) TORCH_CHECK((__buffer).numel() >= __minimum_size, #__buffer " is too small")
-#define TORCH_CHECK_DEVICE_INDEX(__index) \
-do { \
-    TORCH_CHECK(__index >= 0, "no device index"); \
-    TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
-} while(0)
-#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
-do { \
-    TORCH_CHECK_DTYPE(__w, kInt); \
-    TORCH_CHECK_DTYPE(__w_scales, kHalf); \
-    TORCH_CHECK_DTYPE(__w_zeros, kInt); \
-    TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
-    TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
-    TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
-    TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
-} while(0)
-int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
-{
-    int groupsize = w.size(0) * 8 / w_zeros.size(0);
-    TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]")
-    return groupsize;
-}
-// Tuning parameters
-ExLlamaTuning tuningParams;
-void set_tuning_params
-(
-    int matmul_recons_thd,
-    bool matmul_fused_remap,
-    bool matmul_no_half2
-)
-{
-    tuningParams.matmul_recons_thd = matmul_recons_thd;
-    tuningParams.matmul_fused_remap = matmul_fused_remap;
-    tuningParams.matmul_no_half2 = matmul_no_half2;
-}
-// Release all unmanaged objects allocated by the extension
-void cleanup()
-{
-    cleanup_buffers_cuda();
-    g_q4_free_matrices();
-}
-// Prepare buffers for forward pass
-void prepare_buffers
-(
-    torch::Device device,
-    torch::Tensor temp_state,
-    torch::Tensor temp_dq
-)
-{
-    int device_index = device.index();
-    TORCH_CHECK_DEVICE_INDEX(device_index);
-    const at::cuda::OptionalCUDAGuard device_guard(device);
-    prepare_buffers_cuda
-    (
-        device_index,
-        // buffer size used for sanity checks
-        temp_state.numel(),
-        (half*) temp_state.data_ptr(),
-        (half*) temp_dq.data_ptr()
-    );
-}
-// Create Q4Matrix, return handle
-uintptr_t make_q4
-(
-    torch::Tensor qweight,
-    torch::Tensor qzeros,
-    torch::Tensor scales,
-    torch::Tensor g_idx,
-    int device
-)
-{
-    TORCH_CHECK_DTYPE(qweight, kInt);
-    TORCH_CHECK_DTYPE(qzeros, kInt);
-    TORCH_CHECK_DTYPE(scales, kHalf);
-    TORCH_CHECK_DTYPE_OPT(g_idx, kInt);
-    TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8);
-    TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1);
-    TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1);
-    int width = qweight.size(1);
-    int height = qweight.size(0) * 8;
-    int groups = qzeros.size(0);
-    Q4Matrix* m = new Q4Matrix
-    (
-        height,
-        width,
-        groups,
-        (uint32_t*) qweight.data_ptr(),
-        (uint32_t*) qzeros.data_ptr(),
-        (half*) scales.data_ptr(),
-        g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(),
-        device
-    );
-    g_q4_keep_matrix(m);
-    return reinterpret_cast<uintptr_t> (m);
-}
-// Matmul half @ quant -> half
-void q4_matmul
-(
-    torch::Tensor x,
-    uintptr_t w,
-    torch::Tensor out
-)
-{
-    Q4Matrix* wm = reinterpret_cast<Q4Matrix*> (w);
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(out, kHalf);
-    TORCH_CHECK_SHAPES(x, 0, out, 0, 1);
-    TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes")
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    int x_height = x.size(0);
-    if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
-    {
-        q4_matmul_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr()
-        );
-    }
-    else
-    {
-        q4_matmul_recons_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr(),
-            at::cuda::getCurrentCUDABlasHandle()
-        );
-    }
-}
-// Remap columns in half tensor
-void column_remap
-(
-    torch::Tensor x,
-    torch::Tensor x_new,
-    torch::Tensor x_map
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(x_new, kHalf);
-    TORCH_CHECK_DTYPE(x_map, kInt);
-    TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1);
-    int height = x.size(0);
-    int width = x.size(1);
-    TORCH_CHECK_BUFFER_SIZE(x_new, height * width);
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-    column_remap_cuda
-    (
-        (half*) x.data_ptr(),
-        (half*) x_new.data_ptr(),
-        height,
-        width,
-        (uint32_t*) x_map.data_ptr()
-    );
-}
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("set_tuning_params", &set_tuning_params, "set_tuning_params");
-    m.def("prepare_buffers", &prepare_buffers, "prepare_buffers");
-    m.def("cleanup", &cleanup, "cleanup");
-    m.def("make_q4", &make_q4, "make_q4");
-    m.def("q4_matmul", &q4_matmul, "q4_matmul");
-}
--- a/colossalai/kernel/cuda_native/csrc/gptq/matrix.cuh
+++ b/colossalai/kernel/cuda_native/csrc/gptq/matrix.cuh
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#ifndef _matrix_cuh
-#define _matrix_cuh
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-class MatrixView_half
-{
-public:
-    const half* data;
-    const int height;
-    const int width;
-    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
-    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
-};
-class MatrixView_half_rw
-{
-public:
-    half* data;
-    const int height;
-    const int width;
-    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
-    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
-    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
-};
-class MatrixView_q4_row
-{
-public:
-    const uint32_t* data;
-    const int height;
-    const int width;
-    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-    __device__ __forceinline__ int item(int row, int column) const
-    {
-        int shift = (column & 0x07) * 4;
-        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
-    }
-};
-class MatrixView_q4_column
-{
-public:
-    const uint32_t* data;
-    const int height;
-    const int width;
-    __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-    __device__ __forceinline__ int item(int row, int column) const
-    {
-        int shift = (row & 0x07) * 4;
-        return (data[row / 8 * width + column] >> shift) & 0x0f;
-    }
-    __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
-    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
-};
-// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu
-// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale
-__device__ __forceinline__ half2 dot_product_8
-(
-    const half2 acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half2 v_scale_2,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count
-)
-{
-    const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column);
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half2 result = acc;
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-        half2 v_01 = __halves2half2(v_0, v_1);
-        half2 v_23 = __halves2half2(v_2, v_3);
-        half2 v_45 = __halves2half2(v_4, v_5);
-        half2 v_67 = __halves2half2(v_6, v_7);
-//         half2 v_01 = q4_table[v_zero - 1][(v_read      ) & 0xff]; // (constant memory is too slow apparently)
-//         half2 v_23 = q4_table[v_zero - 1][(v_read >>  8) & 0xff];
-//         half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff];
-//         half2 v_67 = q4_table[v_zero - 1][(v_read >> 24)       ];
-        half2 tmp = __hmul2(*h_ptr++, v_01);
-        tmp = __hfma2(*h_ptr++, v_23, tmp);
-        tmp = __hfma2(*h_ptr++, v_45, tmp);
-        tmp = __hfma2(*h_ptr++, v_67, tmp);
-        result = __hfma2(v_scale_2, tmp, result);
-    }
-    return result;
-}
-__device__ __forceinline__ half dot_product_8_h
-(
-    const half acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half v_scale,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count
-)
-{
-    const half* h_ptr = h_.item_ptr(h_row, h_column);
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half result = acc;
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-        half tmp = __hmul(*h_ptr++, v_0);
-        tmp = __hfma(*h_ptr++, v_1, tmp);
-        tmp = __hfma(*h_ptr++, v_2, tmp);
-        tmp = __hfma(*h_ptr++, v_3, tmp);
-        tmp = __hfma(*h_ptr++, v_4, tmp);
-        tmp = __hfma(*h_ptr++, v_5, tmp);
-        tmp = __hfma(*h_ptr++, v_6, tmp);
-        tmp = __hfma(*h_ptr++, v_7, tmp);
-        result = __hfma(v_scale, tmp, result);
-    }
-    return result;
-}
-// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map
-__device__ __forceinline__ half2 dot_product_8_x_map
-(
-    const half2 acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half2 v_scale_2,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count,
-    const uint32_t* x_map
-)
-{
-    const half* h_ptr = h_.item_ptr(h_row, 0);
-    const uint32_t* x_map_ptr = x_map + h_column;
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half2 result = acc;
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-        half2 v_01 = __halves2half2(v_0, v_1);
-        half2 v_23 = __halves2half2(v_2, v_3);
-        half2 v_45 = __halves2half2(v_4, v_5);
-        half2 v_67 = __halves2half2(v_6, v_7);
-        half h_0 = h_ptr[*x_map_ptr++];
-        half h_1 = h_ptr[*x_map_ptr++];
-        half h_2 = h_ptr[*x_map_ptr++];
-        half h_3 = h_ptr[*x_map_ptr++];
-        half h_4 = h_ptr[*x_map_ptr++];
-        half h_5 = h_ptr[*x_map_ptr++];
-        half h_6 = h_ptr[*x_map_ptr++];
-        half h_7 = h_ptr[*x_map_ptr++];
-        half2 h_01 = __halves2half2(h_0, h_1);
-        half2 h_23 = __halves2half2(h_2, h_3);
-        half2 h_45 = __halves2half2(h_4, h_5);
-        half2 h_67 = __halves2half2(h_6, h_7);
-        half2 tmp = __hmul2(h_01, v_01);
-        tmp = __hfma2(h_23, v_23, tmp);
-        tmp = __hfma2(h_45, v_45, tmp);
-        tmp = __hfma2(h_67, v_67, tmp);
-        result = __hfma2(v_scale_2, tmp, result);
-    }
-    return result;
-}
-__device__ __forceinline__ half dot_product_8_x_map_h
-(
-    const half acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half v_scale,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count,
-    const uint32_t* x_map
-)
-{
-    const half* h_ptr = h_.item_ptr(h_row, 0);
-    const uint32_t* x_map_ptr = x_map + h_column;
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half result = acc;
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-        half tmp = __hmul(h_ptr[*x_map_ptr++], v_0);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp);
-        result = __hfma(v_scale, tmp, result);
-    }
-    return result;
-}
-#endif
--- a/colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cu
+++ b/colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cu
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#include "q4_matmul.cuh"
-#include "column_remap.cuh"
-#include "util.cuh"
-#include "matrix.cuh"
-#include "cu_compat.cuh"
-#include "cuda_buffers.cuh"
-#if defined(USE_ROCM)
-#include "hip_compat.cuh"
-#endif
-const int THREADS_X = 32;       // Block size and thread count along columns in w and out
-const int THREADS_Y = 1;        // Block size and thread count along rows in x and out
-typedef void (*fp_q4_matmul_kernel)
-(
-    const half*,
-    const uint32_t*,
-    half*,
-    const half*,
-    const uint32_t*,
-    const int,
-    const int,
-    const int,
-    const int,
-    const int,
-    const uint32_t*,
-    bool
-);
-template<bool use_half2, bool use_groupsize, bool use_x_map>
-__global__ void q4_matmul_kernel
-(
-    const half* __restrict__ x,
-    const uint32_t* __restrict__ w,
-    half* __restrict__ out,
-    const half* __restrict__ w_scales,
-    const uint32_t* __restrict__ w_zeros,
-    const int height,
-    const int dim,
-    const int width,
-    const int groupsize,
-    const int block_size_z,
-    const uint32_t* __restrict__ x_map,
-    bool no_zero
-)
-{
-    // Start of block
-    int x_column = block_size_z * blockIdx.z;
-    int x_column_end = min(dim, block_size_z * (blockIdx.z + 1));
-    int w_column = THREADS_X * blockIdx.x + threadIdx.x;
-    int x_row = THREADS_Y * blockIdx.y + threadIdx.y;
-    int iterations = (x_column_end - x_column) / 8;
-    // Views
-    MatrixView_half x_(x, height, dim);
-    MatrixView_half w_scales_(w_scales, dim / groupsize, width);
-    MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width);
-    MatrixView_q4_column w_(w, dim, width);
-    MatrixView_half_rw out_(out, height, width);
-    // Zero output
-    if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0)
-    {
-        *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0;
-        __syncthreads();
-    }
-    // Loop over part of x row (and w column)
-    half2 acc = {};
-    half acc_h = {};
-    if constexpr (use_groupsize)
-    {
-        // For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this
-        // could be slightly faster
-        for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize)
-        {
-            if constexpr (use_half2)
-            {
-                half2 w_scale = w_scales_.item_half2half2(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
-                if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
-                else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
-            }
-            else
-            {
-                half w_scale = w_scales_.item(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
-                if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
-                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
-            }
-        }
-    }
-    else
-    {
-        // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache
-        for (int k = x_column; k < x_column + iterations * 8; k += 8)
-        {
-            if constexpr (use_half2)
-            {
-                int group = k / groupsize;
-                half2 w_scale = w_scales_.item_half2half2(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
-                if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
-                else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
-            }
-            else
-            {
-                int group = k / groupsize;
-                half w_scale = w_scales_.item(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) + 1;
-                if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
-                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
-            }
-        }
-    }
-    // Add to block result
-    if constexpr (use_half2)
-    {
-        half result = __hadd(__low2half(acc), __high2half(acc));
-        atomicAdd(out_.item_ptr(x_row, w_column), result);
-    }
-    else
-    {
-        atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
-    }
-}
-fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
-{
-    // <bool use_half2, bool use_groupsize, bool use_x_map>
-    if (tuningParams->matmul_no_half2) {
-        if (block_size_z % groupsize == 0) {
-            if (x_map) return q4_matmul_kernel<false, true,  true >;
-            else       return q4_matmul_kernel<false, true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<false, false, true >;
-            else       return q4_matmul_kernel<false, false, false>;
-        }
-    } else {
-        if (block_size_z % groupsize == 0)
-        {
-            if (x_map) return q4_matmul_kernel<true,  true,  true >;
-            else       return q4_matmul_kernel<true,  true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<true,  false, true >;
-            else       return q4_matmul_kernel<true,  false, false>;
-        }
-    }
-};
-// Compute y = x @ w
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero,
-    cudaStream_t alt_stream
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-    cudaSetDevice(w->device);
-    uint32_t* x_map = w->cuda_x_map;
-    const half* x_mapped = x;
-    if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
-    {
-        CudaBuffers* buffers = get_buffers(w->device);
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-        x_map = NULL;
-    }
-    int block_size_z;
-    if (w->width == 4096) block_size_z = 384;           // 7B
-    else if (w->width == 11008) block_size_z = 256;
-    else if (w->width == 5120) block_size_z = 384;      // 13B
-    else if (w->width == 13824) block_size_z = 256;
-    else if (w->width == 6656) block_size_z = 256;      // 33B
-    else if (w->width == 17920) block_size_z = 128;
-    else block_size_z = 256;
-    //if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));
-    dim3 threads(THREADS_X, THREADS_Y, 1);
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height + threads.y - 1) / threads.y,
-        (dim + block_size_z - 1) / block_size_z
-    );
-    fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);
-    kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
-}
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-    cudaSetDevice(w->device);
-    CudaBuffers* buffers = get_buffers(w->device);
-    const half* x_mapped = x;
-    if (w->cuda_x_map)
-    {
-        TORCH_CHECK(buffers->temp_state_size >= x_height * dim, "temp_state buffer is too small");
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-    }
-    w->reconstruct(buffers->temp_dq);
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
-    const float alpha = 1.0f;
-    const float beta = no_zero ? 1.0f : 0.0f;
-    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-                  x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-#else
-    const half alpha = __float2half(1.0f);
-    const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
-    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-#endif
-}
--- a/colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cuh
+++ b/colossalai/kernel/cuda_native/csrc/gptq/q4_matmul.cuh
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include <ATen/cuda/CUDAContext.h>
-#include "q4_matrix.cuh"
-#include "tuning.h"
-// Workaround for hipify_python using rocblas instead of hipblas.
-#if defined(USE_ROCM)
-#include <hipblas/hipblas.h>
-#define rocblas_handle hipblasHandle_t
-#endif
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    cudaStream_t alt_stream = NULL
-);
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero = false
-);
-#endif