Commit da900c3b authored by yangql

Initial commit
from abc import abstractmethod
from logging import getLogger
import torch.nn as nn
from .triton_utils.mixin import TritonModuleMixin
logger = getLogger(__name__)
class FusedBaseModule(nn.Module, TritonModuleMixin):
@classmethod
@abstractmethod
def inject_to_model(cls, *args, **kwargs):
raise NotImplementedError()
class FusedBaseAttentionModule(FusedBaseModule):
@classmethod
@abstractmethod
def inject_to_model(
cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, trainable=False, **kwargs
):
raise NotImplementedError()
@classmethod
def warmup(cls, model, transpose=False, seqlen=2048):
pass
class FusedBaseMLPModule(FusedBaseModule):
@classmethod
@abstractmethod
def inject_to_model(cls, model, use_triton=False, **kwargs):
raise NotImplementedError()
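# --- Fused GPT-J attention ----------------------------------------------------
# Replaces transformers' GPTJAttention with a module that uses a single fused
# qkv_proj QuantLinear and reimplements GPT-J's rotary position embedding.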
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers.models.gptj.modeling_gptj import GPTJAttention
from ..utils.import_utils import compare_pytorch_version, dynamically_import_QuantLinear
from ._fused_base import FusedBaseAttentionModule
def fixed_pos_embedding(x, seq_dim=1, seq_len=None):
dim = x.shape[-1]
if seq_len is None:
seq_len = x.shape[seq_dim]
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2) / dim))
sinusoid_inp = (
torch.einsum("i , j -> i j", torch.arange(seq_len, dtype=torch.float), inv_freq).to(x.device).float()
)
return torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)
def rotate_every_two(x):
x1 = x[:, :, :, ::2]
x2 = x[:, :, :, 1::2]
x = torch.stack((-x2, x1), dim=-1)
return x.flatten(-2) # in einsum notation: rearrange(x, '... d j -> ... (d j)')
def duplicate_interleave(m):
"""
A simple version of `torch.repeat_interleave` for duplicating a matrix while interleaving the copy.
"""
dim0 = m.shape[0]
m = m.view(-1, 1) # flatten the matrix
m = m.repeat(1, 2) # repeat all elements into the 2nd dimension
m = m.view(dim0, -1) # reshape into a matrix, interleaving the copy
return m
def apply_rotary_pos_emb(x, sincos, offset=0):
sin, cos = (duplicate_interleave(t)[None, offset : x.shape[1] + offset, None, :] for t in sincos)
# einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2)
return (x * cos) + (rotate_every_two(x) * sin)
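# Rotary helpers, in brief: fixed_pos_embedding builds sin/cos tables of shape
# (seq_len, dim // 2); duplicate_interleave repeats each element twice along the
# last dimension to reach (seq_len, dim); apply_rotary_pos_emb then combines
# x * cos + rotate_every_two(x) * sin. A minimal illustration with hypothetical
# shapes (not part of the original module):
#
#   q = torch.randn(1, 10, 16, 64)                     # (batch, seq, heads, head_dim)
#   sincos = fixed_pos_embedding(q, seq_dim=1)         # two (10, 32) tables
#   q_rot = apply_rotary_pos_emb(q, sincos, offset=0)  # same shape as q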
class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
def __init__(self, config):
super().__init__()
max_positions = config.max_position_embeddings
self.register_buffer(
"bias",
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
1, 1, max_positions, max_positions
),
)
self.register_buffer("masked_bias", torch.tensor(-1e9))
self.attn_dropout = nn.Dropout(config.attn_pdrop)
self.attn_dropout_p = config.attn_pdrop
self.resid_dropout = nn.Dropout(config.resid_pdrop)
self.embed_dim = config.hidden_size
self.num_attention_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_attention_heads
if self.head_dim * self.num_attention_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
f" `num_attention_heads`: {self.num_attention_heads})."
)
self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
self.rotary_dim = config.rotary_dim
def _split_heads(self, qkv):
"""
Splits hidden dim into attn_head_size and num_attention_heads
"""
new_shape = qkv.size()[:-1] + (3, self.num_attention_heads, self.head_dim)
qkv = qkv.view(new_shape) # (batch, seq_length, 3, head, head_features)
query = qkv[:, :, 0]
key = qkv[:, :, 1]
value = qkv[:, :, 2]
return query, key, value
def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
"""
Merges attn_head_size dim and num_attn_heads dim into hidden dim
"""
if len(tensor.shape) == 5:
tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
elif len(tensor.shape) == 4:
tensor = tensor.permute(0, 2, 1, 3).contiguous()
else:
raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
return tensor.view(new_shape)
def _attn(
self,
query,
key,
value,
attention_mask=None,
head_mask=None,
):
# compute causal mask from causal mask buffer
query_length, key_length = query.size(-2), key.size(-2)
causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
# Keep the attention weights computation in fp32 to avoid overflow issues
query = query.to(torch.float32)
key = key.to(torch.float32)
attn_weights = torch.matmul(query, key.transpose(-1, -2))
mask_value = torch.finfo(attn_weights.dtype).min
# Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
# Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)
attn_weights = torch.where(causal_mask, attn_weights, mask_value)
attn_weights = attn_weights / self.scale_attn
if attention_mask is not None:
# Apply the attention mask
attn_weights = attn_weights + attention_mask
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
attn_weights = attn_weights.to(value.dtype)
attn_weights = self.attn_dropout(attn_weights)
# Mask heads if we want to
if head_mask is not None:
attn_weights = attn_weights * head_mask
attn_output = torch.matmul(attn_weights, value)
return attn_output, attn_weights
def forward(
self,
hidden_states: torch.FloatTensor,
layer_past: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
) -> Union[
Tuple[torch.Tensor, Tuple[torch.Tensor]],
Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
]:
query, key, value = self._split_heads(self.qkv_proj(hidden_states))
seq_len = key.shape[1]
offset = 0
if layer_past is not None:
offset = layer_past[0].shape[-2]
seq_len += offset
if self.rotary_dim is not None:
k_rot = key[:, :, :, : self.rotary_dim]
k_pass = key[:, :, :, self.rotary_dim :]
q_rot = query[:, :, :, : self.rotary_dim]
q_pass = query[:, :, :, self.rotary_dim :]
sincos = fixed_pos_embedding(k_rot, 1, seq_len=seq_len)
k_rot = apply_rotary_pos_emb(k_rot, sincos, offset=offset)
q_rot = apply_rotary_pos_emb(q_rot, sincos, offset=offset)
key = torch.cat([k_rot, k_pass], dim=-1)
query = torch.cat([q_rot, q_pass], dim=-1)
else:
sincos = fixed_pos_embedding(key, 1, seq_len=seq_len)
key = apply_rotary_pos_emb(key, sincos, offset=offset)
query = apply_rotary_pos_emb(query, sincos, offset=offset)
key = key.permute(0, 2, 1, 3)
query = query.permute(0, 2, 1, 3)
value = value.permute(0, 2, 1, 3)
is_causal = layer_past is None
if layer_past is not None:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
if use_cache is True:
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
present = (key, value)
else:
present = None
# compute self-attention: V x Softmax(QK^T)
if compare_pytorch_version("v2.0.0", op="ge"):
attn_output = F.scaled_dot_product_attention(
query,
key,
value,
attn_mask=None if is_causal else attention_mask,
dropout_p=self.attn_dropout_p,
is_causal=is_causal,
)
attn_weights = None
else:
attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)
outputs = (attn_output, present)
if output_attentions:
outputs += (attn_weights,)
return outputs # a, present, (attentions)
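    # Note: on torch >= 2.0 the forward pass above uses F.scaled_dot_product_attention, in
    # which case attention weights are never materialized and output_attentions yields None.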
@classmethod
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
bits: int = 4,
disable_exllama=True,
disable_exllamav2=False,
**kwargs,
):
config = model.config
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
)
for name, m in model.named_modules():
if not isinstance(m, GPTJAttention):
continue
attn = cls(config).to(device=next(m.buffers()).device)
q_proj = m.q_proj
k_proj = m.k_proj
v_proj = m.v_proj
qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1)
qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1)
scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1)
if QuantLinear.QUANT_TYPE == "exllama":
if desc_act:
# See fused_llama_attn.py comment
raise ValueError(
"Exllama kernel does not support query/key/value fusion with act-order. Please either use inject_fused_attention=False or disable_exllama=True."
)
else:
g_idx = None
else:
g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0)
bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None
qlinear_args = (
q_proj.bits,
q_proj.group_size,
q_proj.infeatures,
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qlinear_kwargs["weight_dtype"] = q_proj.scales.dtype
qkv_proj = QuantLinear(*qlinear_args, **qlinear_kwargs)
qkv_proj.qweight = qweights
qkv_proj.qzeros = qzeros
qkv_proj.scales = scales
qkv_proj.g_idx = g_idx
qkv_proj.bias = bias
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
attn.qkv_proj = qkv_proj
attn.out_proj = m.out_proj
setattr(parent, child_name, attn)
del m
__all__ = ["FusedGPTJAttentionForQuantizedModel"]
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers.models.llama.modeling_llama import (
LlamaAttention,
apply_rotary_pos_emb,
)
from ..utils.import_utils import compare_pytorch_version, dynamically_import_QuantLinear
from ._fused_base import FusedBaseAttentionModule
class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
hidden_size,
num_heads,
qkv_proj,
o_proj,
rotary_emb,
layer_idx,
):
super().__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_dim = hidden_size // num_heads
self.layer_idx = layer_idx
if self.head_dim * num_heads != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {num_heads})."
)
self.qkv_proj = qkv_proj
self.o_proj = o_proj
self.rotary_emb = rotary_emb
def _shape(self, tensor, seq_len, bsz):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states,
past_key_value=None,
attention_mask=None,
position_ids=None,
output_attentions=False,
use_cache=False,
**kwargs,
):
"""Input shape: Batch x Time x Channel"""
bsz, q_len, _ = hidden_states.size()
qkv_states = self.qkv_proj(hidden_states)
query_states, key_states, value_states = torch.split(qkv_states, self.hidden_size, dim=2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index. Please open an issue in AutoGPTQ if you hit this."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
# [bsz, nh, t, hd]
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if use_cache:
# Since qkv_proj is fused, query_states etc will hold a reference to the original qkv_states tensor
# which can cause excessive memory usage by the cache. `contiguous` is a convenient way to workaround this.
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
if compare_pytorch_version("v2.0.0", op="ge"):
attn_output = F.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attention_mask,
is_causal=attention_mask is None and q_len > 1,
)
attn_weights = None
else:
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
@classmethod
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
bits: int = 4,
disable_exllama=True,
disable_exllamav2=False,
**kwargs,
):
"""
Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
"""
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
)
for name, m in model.named_modules():
if not isinstance(m, LlamaAttention):
continue
q_proj = m.q_proj
k_proj = m.k_proj
v_proj = m.v_proj
qweights = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=1)
qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=1)
scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=1)
if QuantLinear.QUANT_TYPE == "exllama":
if desc_act:
# TODO: support it. The issue lies maybe in the line:
# int groups = qzeros.size(0);
# in exllama_ext.cpp
raise ValueError(
"Exllama kernel does not support query/key/value fusion with act-order. Please either use inject_fused_attention=False or disable_exllama=True."
)
else:
g_idx = None
else:
g_idx = torch.cat([q_proj.g_idx, k_proj.g_idx, v_proj.g_idx], dim=0)
bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None
qlinear_args = (
q_proj.bits,
q_proj.group_size,
q_proj.infeatures,
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qlinear_kwargs["weight_dtype"] = q_proj.scales.dtype
qkv_layer = QuantLinear(*qlinear_args, **qlinear_kwargs)
qkv_layer.qweight = qweights
qkv_layer.qzeros = qzeros
qkv_layer.scales = scales
qkv_layer.g_idx = g_idx
qkv_layer.bias = bias
# Introduced in Transformers 4.36
layer_idx = None
if hasattr(m, "layer_idx"):
layer_idx = m.layer_idx
attn = cls(
m.hidden_size,
m.num_heads,
qkv_layer,
m.o_proj,
m.rotary_emb,
layer_idx=layer_idx,
)
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, attn)
__all__ = ["FusedLlamaAttentionForQuantizedModel"]
import math
from logging import getLogger
import torch
from transformers.models.llama.modeling_llama import LlamaMLP
from ..utils.import_utils import TRITON_AVAILABLE
from ._fused_base import FusedBaseMLPModule
logger = getLogger(__name__)
if TRITON_AVAILABLE:
import triton
import triton.language as tl
from .triton_utils import custom_autotune
from .triton_utils.kernels import silu
@custom_autotune.autotune(
configs=[
triton.Config(
{
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
),
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8,
},
num_stages=2,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
triton.Config(
{
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
},
num_stages=4,
num_warps=4,
), # 3090
],
key=["M", "N", "K"],
nearest_power_of_two=True,
prune_configs_by={
"early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
"perf_model": None,
"top_k": None,
},
)
@triton.jit
def quant_fused_matmul_248_kernel(
a_ptr,
c_ptr,
b1_ptr,
scales1_ptr,
zeros1_ptr,
g1_ptr,
b2_ptr,
scales2_ptr,
zeros2_ptr,
g2_ptr,
M,
N,
K,
bits,
maxq,
stride_am,
stride_ak,
stride_bk,
stride_bn,
stride_cm,
stride_cn,
stride_scales,
stride_zeros,
BLOCK_SIZE_M: tl.constexpr,
BLOCK_SIZE_N: tl.constexpr,
BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr,
):
"""
Computes: C = silu(A * B1) * (A * B2)
A is of shape (M, K) float16
        B1 and B2 are of shape (K // (32 // bits), N) int32 (packed quantized weights)
        C is of shape (M, N) float16
        scales1 and scales2 are of shape (n_groups, N) float16
        zeros1 and zeros2 are of shape (n_groups, N // (32 // bits)) int32 (packed zero points)
"""
infearure_per_bits = 32 // bits
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
a_mask = offs_am[:, None] < M
# b_ptrs is set up such that it repeats elements along the K axis 8 times
b1_ptrs = b1_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
b2_ptrs = b2_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
g1_ptrs = g1_ptr + offs_k
g2_ptrs = g2_ptr + offs_k
# shifter is used to extract the N bits of each element in the 32-bit word from B
scales1_ptrs = scales1_ptr + offs_bn[None, :]
scales2_ptrs = scales2_ptr + offs_bn[None, :]
zeros1_ptrs = zeros1_ptr + (offs_bn[None, :] // infearure_per_bits)
zeros2_ptrs = zeros2_ptr + (offs_bn[None, :] // infearure_per_bits)
shifter = (offs_k % infearure_per_bits) * bits
zeros_shifter = (offs_bn % infearure_per_bits) * bits
accumulator1 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
accumulator2 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, num_pid_k):
g1_idx = tl.load(g1_ptrs)
g2_idx = tl.load(g2_ptrs)
# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
scales1 = tl.load(scales1_ptrs + g1_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
scales2 = tl.load(scales2_ptrs + g2_idx[:, None] * stride_scales)
zeros1 = tl.load(zeros1_ptrs + g1_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros1 = (zeros1 >> zeros_shifter[None, :]) & maxq
zeros1 = zeros1 + 1
zeros2 = tl.load(zeros2_ptrs + g2_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros2 = (zeros2 >> zeros_shifter[None, :]) & maxq
zeros2 = zeros2 + 1
a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
b1 = tl.load(b1_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
b2 = tl.load(b2_ptrs)
# Now we need to unpack b (which is N-bit values) into 32-bit values
b1 = (b1 >> shifter[:, None]) & maxq # Extract the N-bit values
b1 = (b1 - zeros1) * scales1 # Scale and shift
accumulator1 += tl.dot(a, b1)
b2 = (b2 >> shifter[:, None]) & maxq
b2 = (b2 - zeros2) * scales2
accumulator2 += tl.dot(a, b2)
a_ptrs += BLOCK_SIZE_K
b1_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
b2_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
g1_ptrs += BLOCK_SIZE_K
g2_ptrs += BLOCK_SIZE_K
accumulator1 = silu(accumulator1)
c = accumulator1 * accumulator2
c = c.to(tl.float16)
c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask)
else:
quant_fused_matmul_248_kernel = None
class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
def __init__(
self,
gate_proj,
down_proj,
up_proj,
):
super().__init__()
self.infeatures = gate_proj.infeatures
self.intermediate_size = gate_proj.outfeatures
self.outfeatures = down_proj.outfeatures
self.bits = gate_proj.bits
self.maxq = gate_proj.maxq
self.gate_proj = gate_proj
self.up_proj = up_proj
self.down_proj = down_proj
def forward(self, x):
return self.down_proj(self.triton_llama_mlp(x))
def triton_llama_mlp(self, x):
with torch.cuda.device(x.device):
out_shape = x.shape[:-1] + (self.intermediate_size,)
x = x.reshape(-1, x.shape[-1])
M, K = x.shape
N = self.intermediate_size
c = torch.empty((M, N), device=x.device, dtype=torch.float16)
grid = lambda META: ( # noqa: E731
triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
)
quant_fused_matmul_248_kernel[grid](
x,
c,
self.gate_proj.qweight,
self.gate_proj.scales,
self.gate_proj.qzeros,
self.gate_proj.g_idx,
self.up_proj.qweight,
self.up_proj.scales,
self.up_proj.qzeros,
self.up_proj.g_idx,
M,
N,
K,
self.bits,
self.maxq,
x.stride(0),
x.stride(1),
self.gate_proj.qweight.stride(0),
self.gate_proj.qweight.stride(1),
c.stride(0),
c.stride(1),
self.gate_proj.scales.stride(0),
self.gate_proj.qzeros.stride(0),
)
c = c.reshape(out_shape)
return c
@classmethod
def inject_to_model(cls, model, use_triton=False, **kwargs):
if not use_triton:
logger.warning(
f"Skipping module injection for {cls.__name__} as currently not supported with use_triton=False."
)
return
elif not TRITON_AVAILABLE:
logger.warning(
f"Skipping module injection for {cls.__name__} as Triton is not available. Please check your installation."
)
return
for name, m in model.named_modules():
if not isinstance(m, LlamaMLP):
continue
mlp = cls(m.gate_proj, m.down_proj, m.up_proj)
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, mlp)
@classmethod
def warmup(cls, model, transpose=False, seqlen=2048):
from tqdm import tqdm
kn_values = {}
for _, m in model.named_modules():
if not isinstance(m, cls):
continue
k = m.infeatures
n = m.intermediate_size
if (k, n) not in kn_values:
kn_values[(k, n)] = m
logger.info(f"Found {len(kn_values)} unique fused mlp KN values.")
logger.info("Warming up autotune cache ...")
with torch.no_grad():
for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
m = 2**m
for (k, n), (modules) in kn_values.items():
a = torch.randn(m, k, dtype=torch.float16, device=model.device)
modules.triton_llama_mlp(a)
del kn_values
__all__ = ["FusedLlamaMLPForQuantizedModel"]
import torch.nn as nn
class GeneralQuantLinear(nn.Linear):
def __init__(self, quant_linear_module):
super().__init__(
in_features=quant_linear_module.infeatures,
out_features=quant_linear_module.outfeatures,
bias=True,
)
self.infeatures = quant_linear_module.infeatures
self.outfeatures = quant_linear_module.outfeatures
self.bits = quant_linear_module.bits
self.group_size = quant_linear_module.group_size
self.maxq = quant_linear_module.maxq
self.weight.requires_grad = False
self.weight.data = quant_linear_module.qweight
self.register_buffer("qweight", quant_linear_module.qweight)
self.bias.data = quant_linear_module.bias
self.qweight.requires_grad = False
self.bias.requires_grad = False
self.register_buffer("qzeros", quant_linear_module.qzeros)
self.register_buffer("scales", quant_linear_module.scales)
self.register_buffer("g_idx", quant_linear_module.g_idx)
if hasattr(quant_linear_module, "wf"):
self.wf = quant_linear_module.wf
if hasattr(quant_linear_module, "kernel_switch_threshold"):
self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold
if hasattr(quant_linear_module, "autogptq_cuda_available"):
self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available
self.trainable = quant_linear_module.trainable
self.forward = quant_linear_module.forward
@classmethod
def inject_to_model(cls, model, target_module_type):
for name, m in model.named_modules():
if not isinstance(m, target_module_type):
continue
new_m = cls(m)
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.get_submodule(parent_name)
else:
parent_name = ""
parent = model
child_name = name
setattr(parent, child_name, new_m)
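# --- QuantLinear, QUANT_TYPE = "cuda" -------------------------------------------
# GPTQ linear layer backed by the autogptq_cuda extension kernels, with a
# pure-PyTorch dequantize-then-matmul fallback when the kernels cannot be used.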
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda_64
import autogptq_cuda_256
_autogptq_cuda_available = True
except ImportError:
logger.warning("CUDA extension not installed.")
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
class QuantLinear(nn.Module):
QUANT_TYPE = "cuda"
def __init__(
self,
bits,
group_size,
infeatures,
outfeatures,
bias,
kernel_switch_threshold=128,
trainable=False,
weight_dtype=torch.float16,
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.maxq = 2**self.bits - 1
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=weight_dtype,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
else:
self.bias = None
        # When the CUDA kernel cannot be used (extension missing, non-CUDA input, or batch
        # size at or above kernel_switch_threshold), the matmul is performed by unpacking
        # the weights and using torch.matmul; wf holds the bit offsets for that unpacking.
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
elif self.bits == 3:
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)
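        # The wf tables above are the per-element bit offsets used when unpacking values from
        # the packed int32 words in the torch.matmul fallback: multiples of `bits` for the
        # 2/4/8-bit case, and a 3-row pattern covering 32 values per three words for 3-bit.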
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def post_init(self):
pass
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().to(dtype=linear.weight.dtype)
if linear.bias is not None:
self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
:, None
]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
elif self.bits == 3:
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i))
i += 10
qweight[row] |= intweight[i] << 30
row += 1
qweight[row] |= (intweight[i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
i += 10
qweight[row] |= intweight[i] << 31
row += 1
qweight[row] |= (intweight[i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
i += 10
row += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
elif self.bits == 3:
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
i += 10
qzeros[:, col] |= zeros[:, i] << 30
col += 1
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
i += 10
qzeros[:, col] |= zeros[:, i] << 31
col += 1
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
i += 10
col += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
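    # Packing layout produced by pack(): each int32 of qweight stores 32 // bits consecutive
    # quantized values along the input dimension, and qzeros stores (zero_point - 1) values
    # packed the same way along the output dimension; the 3-bit case straddles 32-bit word
    # boundaries, hence the extra shift/carry handling above.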
def forward(self, x: torch.Tensor):
out_shape = x.shape[:-1] + (self.outfeatures,)
x = x.reshape(-1, x.shape[-1])
x_dtype = x.dtype
if (
x.device.type == "cuda"
and self.autogptq_cuda_available
and (self.kernel_switch_threshold == 0 or x.shape[0] < self.kernel_switch_threshold)
):
out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
if self.bits == 2:
self.autogptq_cuda.vecquant2matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
elif self.bits == 3:
self.autogptq_cuda.vecquant3matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
elif self.bits == 4:
self.autogptq_cuda.vecquant4matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
elif self.bits == 8:
self.autogptq_cuda.vecquant8matmul(
x.float(),
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.g_idx,
)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
if self.bits in [2, 4, 8]:
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
).to(torch.int16 if self.bits == 8 else torch.int8)
zeros = torch.bitwise_and(zeros, (2**self.bits) - 1)
zeros = zeros + 1
zeros = zeros.reshape(self.scales.shape)
weight = torch.bitwise_right_shift(
torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
self.wf.unsqueeze(-1),
).to(torch.int16 if self.bits == 8 else torch.int8)
weight = torch.bitwise_and(weight, (2**self.bits) - 1)
elif self.bits == 3:
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
-1, -1, -1, 12
)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
zeros = zeros & 0x7
zeros = torch.cat(
[zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
dim=2,
)
zeros = zeros + 1
zeros = zeros.reshape(self.scales.shape)
weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
-1, -1, 12, -1
)
weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
weight = weight & 0x7
weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
num_itr = self.g_idx.shape[0] // x.shape[-1]
if num_itr == 1:
weights = self.scales[self.g_idx.long()] * (weight - zeros[self.g_idx.long()])
else:
num_dim = self.g_idx.shape[0] // num_itr
weights = []
for i in range(num_itr):
scale_i = self.scales[:, i * num_dim : (i + 1) * num_dim]
weight_i = weight[:, i * num_dim : (i + 1) * num_dim]
zeros_i = zeros[:, i * num_dim : (i + 1) * num_dim]
g_idx_i = self.g_idx[i * num_dim : (i + 1) * num_dim]
weights.append(scale_i[g_idx_i.long()] * (weight_i - zeros_i[g_idx_i.long()]))
weights = torch.cat(weights, dim=1)
out = torch.matmul(x, weights)
out = out.to(x_dtype)
out = out.reshape(out_shape)
out = out + self.bias if self.bias is not None else out
return out
__all__ = ["QuantLinear"]
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda_64
import autogptq_cuda_256
_autogptq_cuda_available = True
except ImportError:
logger.warning("CUDA extension not installed.")
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
class QuantLinear(nn.Module):
QUANT_TYPE = "cuda-old"
def __init__(
self,
bits,
group_size,
infeatures,
outfeatures,
bias,
use_cuda_fp16=True,
kernel_switch_threshold=128,
trainable=False,
weight_dtype=torch.float16,
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.maxq = 2**self.bits - 1
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=weight_dtype,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=weight_dtype))
else:
self.bias = None
self.half_indim = self.infeatures // 2
self.use_cuda_fp16 = use_cuda_fp16 if bits != 8 else False
        # When the CUDA kernel cannot be used (extension missing, non-CUDA input, or batch
        # size at or above kernel_switch_threshold), the matmul is performed by unpacking
        # the weights and using torch.matmul; wf holds the bit offsets for that unpacking.
if self.bits in [2, 4, 8]:
self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0)
elif self.bits == 3:
self.wf = torch.tensor(
[
[0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0],
[0, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31],
[0, 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0],
],
dtype=torch.int32,
).reshape(1, 3, 12)
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def post_init(self):
pass
def pack(self, linear, scales, zeros, g_idx):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().to(dtype=linear.weight.dtype)
if linear.bias is not None:
self.bias = linear.bias.clone().to(dtype=linear.weight.dtype)
intweight = []
for idx in range(self.infeatures):
g_idx = idx // self.group_size
intweight.append(torch.round((W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]).to(torch.int)[:, None])
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
elif self.bits == 3:
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i))
i += 10
qweight[row] |= intweight[i] << 30
row += 1
qweight[row] |= (intweight[i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 1)
i += 10
qweight[row] |= intweight[i] << 31
row += 1
qweight[row] |= (intweight[i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qweight[row] |= intweight[j] << (3 * (j - i) + 2)
i += 10
row += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [2, 4, 8]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
elif self.bits == 3:
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i))
i += 10
qzeros[:, col] |= zeros[:, i] << 30
col += 1
qzeros[:, col] |= (zeros[:, i] >> 2) & 1
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 1)
i += 10
qzeros[:, col] |= zeros[:, i] << 31
col += 1
qzeros[:, col] |= (zeros[:, i] >> 1) & 0x3
i += 1
for j in range(i, i + 10):
qzeros[:, col] |= zeros[:, j] << (3 * (j - i) + 2)
i += 10
col += 1
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x):
x_dtype = x.dtype
out_shape = x.shape[:-1] + (self.outfeatures,)
x = x.reshape(-1, x.shape[-1])
if (
x.device.type == "cuda"
and self.autogptq_cuda_available is True
and (self.kernel_switch_threshold is False or x.shape[0] < self.kernel_switch_threshold)
):
out = torch.zeros(x.shape[0], out_shape[-1], dtype=torch.float, device=x.device)
if self.use_cuda_fp16:
if x_dtype != torch.float16:
logger.warning_once(
f"The cuda-old kernel for GPTQ with use_cuda_fp16=True requires a float16 input activation, while {x_dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
if self.bits == 2:
self.autogptq_cuda.vecquant2matmul_faster_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
self.half_indim,
)
elif self.bits == 3:
self.autogptq_cuda.vecquant3matmul_faster_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
self.half_indim,
)
elif self.bits == 4:
self.autogptq_cuda.vecquant4matmul_faster_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
self.half_indim,
)
else:
raise NotImplementedError("Only 2,3,4 bits are supported.")
else:
x = x.to(torch.float32) # This is required for autocast compatibility.
if self.bits == 2:
self.autogptq_cuda.vecquant2matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
elif self.bits == 3:
self.autogptq_cuda.vecquant3matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
elif self.bits == 4:
self.autogptq_cuda.vecquant4matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
elif self.bits == 8:
self.autogptq_cuda.vecquant8matmul_old(
x,
self.qweight,
out,
self.scales.float(),
self.qzeros,
self.group_size,
)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:
if self.wf.device != self.qzeros.device:
self.wf = self.wf.to(self.qzeros.device)
if self.bits in [2, 4, 8]:
zeros = torch.bitwise_right_shift(
torch.unsqueeze(self.qzeros, 2).expand(-1, -1, 32 // self.bits),
self.wf.unsqueeze(0),
).to(torch.int16 if self.bits == 8 else torch.int8)
zeros = zeros + 1
zeros = torch.bitwise_and(
zeros, (2**self.bits) - 1
) # NOTE: It appears that casting here after the `zeros = zeros + 1` is important.
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
scales = self.scales
scales = scales.reshape(-1, 1, scales.shape[-1])
weight = torch.bitwise_right_shift(
torch.unsqueeze(self.qweight, 1).expand(-1, 32 // self.bits, -1),
self.wf.unsqueeze(-1),
).to(torch.int16 if self.bits == 8 else torch.int8)
weight = torch.bitwise_and(weight, (2**self.bits) - 1)
weight = weight.reshape(-1, self.group_size, weight.shape[2])
elif self.bits == 3:
zeros = self.qzeros.reshape(self.qzeros.shape[0], self.qzeros.shape[1] // 3, 3, 1).expand(
-1, -1, -1, 12
)
zeros = zeros >> self.wf.unsqueeze(0)
zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
zeros = zeros & 0x7
zeros = torch.cat(
[zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
dim=2,
)
zeros = zeros + 1
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
scales = self.scales
scales = scales.reshape(-1, 1, scales.shape[-1])
weight = self.qweight.reshape(self.qweight.shape[0] // 3, 3, 1, self.qweight.shape[1]).expand(
-1, -1, 12, -1
)
weight = (weight >> self.wf.unsqueeze(-1)) & 0x7
weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
weight = weight & 0x7
weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
weight = weight.reshape(-1, self.group_size, weight.shape[2])
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
weight = scales * (weight - zeros)
weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
out = torch.matmul(x, weight)
out = out.to(dtype=x_dtype).reshape(
out_shape
        ) # A cast is needed here as, for some reason, vecquant2matmul_faster_old still allocates a float32 output.
out = out + self.bias if self.bias is not None else out
return out
__all__ = ["QuantLinear"]
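# --- QuantLinear, QUANT_TYPE = "exllama" ----------------------------------------
# 4-bit-only linear layer backed by the exllama_kernels extension
# (make_q4 / q4_matmul).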
# Adapted from turboderp exllama: https://github.com/turboderp/exllama
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
from exllama_kernels import make_q4, q4_matmul
except ImportError as e:
exllama_import_exception = e
def error_raiser_exllama(*args, **kwargs):
raise ValueError(
f"Trying to use the exllama backend, but could not import the C++/CUDA dependencies with the following error: {exllama_import_exception}"
)
make_q4 = error_raiser_exllama
q4_matmul = error_raiser_exllama
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")
def ext_make_q4(qweight, qzeros, scales, g_idx, device):
"""Construct Q4Matrix, return handle"""
return make_q4(qweight, qzeros, scales, g_idx if g_idx is not None else none_tensor, device)
def ext_q4_matmul(x, q4, q4_width):
"""Matrix multiplication, returns x @ q4"""
outshape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
q4_matmul(x, q4, output)
return output.view(outshape)
class QuantLinear(nn.Module):
QUANT_TYPE = "exllama"
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
super().__init__()
if bits != 4:
raise ValueError(
f"Exllama kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
if trainable:
raise NotImplementedError("Exllama kernel does not support training.")
self.padding = -outfeatures % 32
self.outfeatures = outfeatures + self.padding
outfeatures = self.outfeatures
self.infeatures = infeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.trainable = trainable
self.maxq = 2**self.bits - 1
assert infeatures % 32 == 0
assert infeatures % self.group_size == 0
assert outfeatures % 32 == 0
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=torch.float16,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
else:
self.bias = None
def post_init(self):
assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None
self.width = self.qweight.shape[1]
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
self.q4 = ext_make_q4(
self.qweight,
self.qzeros,
self.scales,
self.g_idx.to("cpu") if self._use_act_order else None,
self.qweight.device.index,
)
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().half()
if linear.bias is not None:
self.bias = linear.bias.clone().half()
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
:, None
]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x):
if x.dtype != torch.float16:
logger.warning_once(
f"The exllama kernel for GPTQ requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
x = x.half()
out = ext_q4_matmul(x, self.q4, self.width)
if self.bias is not None:
out.add_(self.bias)
return out
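# --- QuantLinear, QUANT_TYPE = "exllamav2" --------------------------------------
# 4-bit-only linear layer backed by the exllamav2_kernels extension, together
# with ExLlamaV2DeviceTensors, which manages the per-device scratch buffer.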
# Adapted from turboderp exllama: https://github.com/turboderp/exllamav2
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
from exllamav2_kernels import gemm_half_q_half, make_q_matrix
except ImportError as e:
exllama_v2_import_exception = e
def error_raiser_exllama(*args, **kwargs):
raise ValueError(
f"Trying to use the exllama v2 backend, but could not import the C++/CUDA dependencies with the following error: {exllama_v2_import_exception}"
)
make_q_matrix = error_raiser_exllama
gemm_half_q_half = error_raiser_exllama
# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
none_tensor = torch.empty((1, 1), device="meta")
def _torch_device(idx):
if idx == -1:
return "cpu"
return f"cuda:{idx}"
def ext_gemm_half_q_half(x, q_handle, q4_width, force_cuda):
"""Matrix multiplication, returns x @ q4"""
output_shape = x.shape[:-1] + (q4_width,)
x = x.view(-1, x.shape[-1])
output = torch.empty((x.shape[0], q4_width), dtype=torch.half, device=x.device)
gemm_half_q_half(x, q_handle, output, force_cuda)
return output.view(output_shape)
def ext_make_q_matrix(w: dict, temp_dq, key: str = None):
"""
Create Q matrix
"""
# EXL2
    # This path does not work at the moment because the tensors are not the same.
if "q_weight" in w:
w["q_scale_max"] /= 256
w["q_perm"] = w["q_perm"].short()
w["q_invperm"] = w["q_invperm"].short()
return make_q_matrix(
w["q_weight"],
w["q_perm"],
w["q_invperm"],
w["q_scale"],
w["q_scale_max"],
w["q_groups"],
none_tensor,
none_tensor,
none_tensor,
temp_dq,
)
# GPTQ
elif "qweight" in w:
if w["scales"].dtype == torch.float:
w["scales"] = w["scales"].half()
# GPTQ with g_idx (act_order)
if "g_idx" in w and not (w["g_idx"] == 0).all().item():
w["q_perm"] = torch.empty(
(w["qweight"].shape[0] * 8,),
dtype=torch.short,
device=w["qweight"].device,
)
w["q_invperm"] = torch.empty_like(w["q_perm"])
# make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
return make_q_matrix(
w["qweight"],
w["q_perm"],
w["q_invperm"],
none_tensor,
none_tensor,
none_tensor,
w["qzeros"],
w["scales"],
w["g_idx"].cpu(),
temp_dq,
)
# GPTQ without g_idx
else:
return make_q_matrix(
w["qweight"],
none_tensor,
none_tensor,
none_tensor,
none_tensor,
none_tensor,
w["qzeros"],
w["scales"],
none_tensor,
temp_dq,
)
class QuantLinear(nn.Module):
QUANT_TYPE = "exllamav2"
"""Linear layer implementation with per-group 4-bit quantization of the weights"""
def __init__(self, bits, group_size, infeatures, outfeatures, bias, trainable=False, **kwargs):
super().__init__()
if bits != 4:
raise ValueError(
f"Exllamav2 kernel supports only bits=4, requested bits={bits}. Something is wrong in the model initialization."
)
if trainable:
raise NotImplementedError("Exllamav2 kernel does not support training.")
self.q_handle = None
self.q_tensors = None
self.padding = -outfeatures % 32
self.outfeatures = outfeatures + self.padding
outfeatures = self.outfeatures
self.infeatures = infeatures
self.bits = bits
self.group_size = group_size if group_size != -1 else infeatures
self.trainable = trainable
self.maxq = 2**self.bits - 1
assert infeatures % 32 == 0
assert infeatures % self.group_size == 0
assert outfeatures % 32 == 0
        # The tensors need to be registered as buffers, otherwise they cannot be loaded easily using transformers.
self.register_buffer(
"qweight",
torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32),
)
self.register_buffer(
"qzeros",
torch.zeros(
(
math.ceil(infeatures / self.group_size),
outfeatures // 32 * self.bits,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales",
torch.zeros(
(math.ceil(infeatures / self.group_size), outfeatures),
dtype=torch.float16,
),
)
self.register_buffer(
"g_idx",
torch.tensor([i // self.group_size for i in range(infeatures)], dtype=torch.int32),
)
if bias:
self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16))
else:
self.bias = None
def post_init(self, temp_dq):
assert self.qweight.device.type == "cuda"
assert self.qweight.device.index is not None
self.q_tensors = {
"qweight": self.qweight,
"qzeros": self.qzeros,
"scales": self.scales,
"g_idx": self.g_idx,
}
temp_dq = temp_dq.get_scratch_slice(self.temp_dq_size())
self.q_handle = ext_make_q_matrix(self.q_tensors, temp_dq)
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
self.scales = scales.clone().half()
if linear.bias is not None:
self.bias = linear.bias.clone().half()
intweight = []
for idx in range(self.infeatures):
intweight.append(
torch.round((W[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[
:, None
]
)
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qweight[row] |= intweight[j] << (self.bits * (j - i))
i += 32 // self.bits
row += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qweight = qweight.astype(np.int32)
self.qweight = torch.from_numpy(qweight)
zeros -= 1
zeros = zeros.numpy().astype(np.uint32)
qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
i = 0
col = 0
while col < qzeros.shape[1]:
if self.bits in [4]:
for j in range(i, i + (32 // self.bits)):
qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
i += 32 // self.bits
col += 1
else:
raise NotImplementedError("Only 4 bits are supported.")
qzeros = qzeros.astype(np.int32)
self.qzeros = torch.from_numpy(qzeros)
def forward(self, x, force_cuda=False):
if x.dtype != torch.float16:
logger.warning_once(
f"The exllama v2 kernel for GPTQ requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
)
x = x.half()
output = ext_gemm_half_q_half(x, self.q_handle, self.outfeatures, force_cuda)
if self.bias is not None:
output.add_(self.bias)
return output
def temp_dq_size(self):
return self.infeatures * self.outfeatures * 2 + 128
def temp_fwd_size(self, max_input_len, max_batch_size):
return self.outfeatures * max_input_len * max_batch_size * 4 + 128
def scratch_space_fixed(self, max_input_len=2048, max_batch_size=8):
return self.temp_dq_size() + self.temp_fwd_size(max_input_len, max_batch_size)
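    # Scratch sizing: temp_dq_size() covers the dequantized fp16 weight buffer
    # (infeatures * outfeatures * 2 bytes) and temp_fwd_size() the forward workspace for the
    # given maximum input length and batch size; get_scratch_slice() below rounds every
    # request up to a multiple of 128 bytes.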
class ExLlamaV2DeviceTensors:
device_idx: int
scratch_bytes: int
scratch_idx: int
    scratch: torch.Tensor = None
def __init__(self, device_idx, scratch_bytes):
self.device_idx = device_idx
self.scratch_bytes = scratch_bytes
def prepare(self):
self.scratch = torch.empty(
(self.scratch_bytes // 2,),
dtype=torch.half,
device=_torch_device(self.device_idx),
)
def get_scratch_slice(self, size_bytes):
if self.scratch is None:
self.prepare()
size_bytes = ((size_bytes + 127) // 128) * 128
size_half = size_bytes // 2
scratch_slice = self.scratch.narrow(0, 0, size_half)
return scratch_slice