The code in this directory is mainly adapted from @qwopqwop200's [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda), which is itself based on [gptq](https://github.com/IST-DASLab/gptq).
logger.info(f"Ignoring unknown parameter in the quantization configuration: {key}.")
if checkpoint_format_auto_inferred:
    logger.info(
        f"`checkpoint_format` is missing from the quantization configuration and is "
        f"automatically inferred as {normalized[CHECKPOINT_FORMAT_FIELD]}."
    )
# TODO: Remove this and use accelerate.utils.modeling.load_checkpoint_in_model instead, once https://github.com/huggingface/accelerate/pull/2588 is merged and accelerate 0.29 is released.
f"Some weights of the model checkpoint at {checkpoint} were not used when"
f" initializing {model.__class__.__name__}: {unexpected_keys}. This may or may not be an issue - make sure that the checkpoint does not have unnecessary parameters, or that the model definition correctly corresponds to the checkpoint."
"The method exllama_set_max_input_length should be called only when using the exllama backend **with act-order**."
)
uses_exllama = False
for name, submodule in model.named_modules():
    if isinstance(submodule, ExllamaQuantLinear):
        uses_exllama = True

if not uses_exllama:
    raise ValueError(
        f"The function exllama_set_max_input_length was called, but the model (instance of {model.__class__.__name__}) does not use the exllama backend for GPTQ. Another implementation is used (exllamav2, cuda, cuda-old, triton), so the call to exllama_set_max_input_length is unnecessary. Please remove the call to exllama_set_max_input_length or use the exllama v1 backend."
    )
# Otherwise, convert the model to Marlin format first and cache locally.
else:
# Loading the GPTQ checkpoint to do the conversion.
# TODO: Avoid loading the model with wrong QuantLinear, and directly use
# Marlin ones. The repacking can be done directly on the safetensors, just
# as for AWQ checkpoints.
load_checkpoint_in_model(
model,
dtype=torch_dtype,  # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292
checkpoint=current_model_save_name,
device_map=device_map,
offload_state_dict=True,
offload_buffers=True,
)
# Convert model to marlin, repacking weights into Marlin format.
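The repacking itself is not shown in this excerpt. Below is a minimal, generic sketch of the surrounding module-swap pattern; the `convert` callback standing in for the actual Marlin repacking routine is an assumption:

```python
import torch.nn as nn

def swap_modules(model: nn.Module, source_cls: type, convert) -> nn.Module:
    """Illustrative sketch: replace every `source_cls` submodule with `convert(submodule)`."""
    # Collect targets first so the module tree is not mutated while we walk it.
    targets = [
        (name, module)
        for name, module in model.named_modules()
        if isinstance(module, source_cls)
    ]
    for name, module in targets:
        # Resolve the parent module so the child can be replaced in place.
        parent = model.get_submodule(name.rsplit(".", 1)[0]) if "." in name else model
        setattr(parent, name.rsplit(".", 1)[-1], convert(module))
    return model
```

In the real conversion, `convert` would repack each layer's `qweight` and `scales` tensors into the Marlin tile layout, e.g. `swap_modules(model, QuantLinear, marlin_repack)` with the project's actual repacking routine.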