Commit da900c3b authored by yangql

Initial commit
import copy
import logging
import os
from os.path import isdir, join
from typing import Dict, List, Optional, Union
import accelerate
import torch
import torch.nn as nn
import transformers
from accelerate.hooks import remove_hook_from_module
from safetensors import safe_open
from safetensors.torch import load_file as safe_load
from safetensors.torch import save_file as safe_save
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel
from transformers.modeling_utils import no_init_weights
from transformers.utils.generic import ContextManagers
from transformers.utils.hub import (
CommitOperationAdd,
PushToHubMixin,
create_commit,
create_repo,
)
from ..nn_modules._fused_base import FusedBaseAttentionModule, FusedBaseMLPModule
from ..nn_modules.qlinear import GeneralQuantLinear
from ..quantization import GPTQ, BaseQuantizeConfig
from ..quantization.config import (
CHECKPOINT_FORMAT,
CHECKPOINT_FORMAT_FIELD,
QUANT_METHOD_FIELD,
QUANTIZE_BLACK_LIST,
)
from ..utils.accelerate_utils import load_checkpoint_in_model
from ..utils.data_utils import collate_data
from ..utils.import_utils import (
AUTOGPTQ_CUDA_AVAILABLE,
EXLLAMA_KERNELS_AVAILABLE,
EXLLAMAV2_KERNELS_AVAILABLE,
MARLIN_AVAILABLE,
QIGEN_AVAILABLE,
TRITON_AVAILABLE,
dynamically_import_QuantLinear,
)
from ..utils.marlin_utils import (
_validate_marlin_compatibility,
_validate_marlin_device_support,
prepare_model_for_marlin_load,
)
from ._const import CPU, CUDA_0, SUPPORTED_MODELS
from ._utils import (
autogptq_post_init,
find_layers,
get_checkpoints,
get_device,
get_module_by_name_prefix,
get_module_by_name_suffix,
make_quant,
make_sure_no_tensor_in_meta_device,
move_to_device,
pack_from_tensors,
pack_model,
preprocess_checkpoint_qigen,
simple_dispatch_model,
unpack_awq,
)
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.propagate = False
logger.addHandler(handler)
logger.setLevel(logging.INFO)
def nested_move_to_device(v, device):
if isinstance(v, torch.Tensor):
return move_to_device(v, device)
elif isinstance(v, (list, tuple)):
return type(v)([nested_move_to_device(e, device) for e in v])
else:
return v
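# The class attributes below (layer_type, layers_block_name, outside_layer_modules,
# inside_layer_modules, ...) are placeholders; each architecture listed in SUPPORTED_MODELS
# is expected to override them in its own BaseGPTQForCausalLM subclass.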
class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
layer_type: str = None
layers_block_name: str = None
outside_layer_modules: List[str] = None
inside_layer_modules: List[List[str]] = None
lm_head_name: str = "lm_head"
fused_attn_module_type: Optional[FusedBaseAttentionModule] = None
fused_mlp_module_type: Optional[FusedBaseMLPModule] = None
def __init__(
self,
model: PreTrainedModel,
quantized: bool,
quantize_config: BaseQuantizeConfig,
is_triton_backend: bool = False,
injected_fused_attention: bool = False,
injected_fused_mlp: bool = False,
trainable: bool = False,
):
super().__init__()
self.model = model
self.model_type = self.model.config.model_type
self._quantized = quantized
self.quantize_config = quantize_config
self.config = self.model.config
self.is_triton_backend = is_triton_backend
self.injected_fused_attention = injected_fused_attention
self.injected_fused_mlp = injected_fused_mlp
self.trainable = trainable
@property
def quantized(self):
return self._quantized
@property
def hf_device_map(self):
return getattr(self.model, "hf_device_map", None)
def _prepare_examples_for_quantization(
self,
examples: List[Dict[str, Union[List[int], torch.LongTensor]]],
batch_size: int = 1,
):
def _convert_tensor_to_list(tensor):
if isinstance(tensor, torch.Tensor):
if len(tensor.shape) == 1:
tensor = tensor.unsqueeze(0)
tensor = tensor.long()
return tensor.cpu().numpy().tolist()
return [tensor]
new_examples = []
for example in examples:
input_ids = _convert_tensor_to_list(example["input_ids"])
attention_mask = _convert_tensor_to_list(example["attention_mask"])
if "labels" in example:
labels = _convert_tensor_to_list(example["labels"])
elif "label" in example:
labels = _convert_tensor_to_list(example["label"])
elif "label_ids" in example:
labels = _convert_tensor_to_list(example["label_ids"])
else:
labels = copy.deepcopy(input_ids)
new_examples.append(
{
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels,
}
)
pad_token_id = self.config.pad_token_id
if not pad_token_id:
pad_token_id = self.config.eos_token_id
new_examples = [
collate_data(new_examples[start : start + batch_size], pad_token_id)
for start in range(0, len(new_examples), batch_size)
]
for new_example in new_examples:
del new_example["labels"]
return new_examples
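# Usage sketch (illustrative only, model id is a placeholder): each element of the
# `examples` list passed to quantize() should provide "input_ids" and "attention_mask"
# (and optionally labels), either as tensors or as lists of ints, e.g. built with a
# Hugging Face tokenizer:
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
#     examples = [tokenizer("auto-gptq is an easy-to-use quantization library", return_tensors="pt")]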
@torch.inference_mode()
def quantize(
self,
examples: List[Dict[str, Union[List[int], torch.LongTensor]]],
batch_size: int = 1,
use_triton: bool = False,
use_cuda_fp16: bool = True,
autotune_warmup_after_quantized: bool = False,
cache_examples_on_gpu: bool = True,
):
if self.quantized:
raise EnvironmentError("can't execute quantize because the model is quantized.")
if self.quantize_config.quant_method in QUANTIZE_BLACK_LIST:
raise ValueError(f"Unsupported quantization operation for quant method: {self.quantize_config.quant_method}")
if use_triton and not TRITON_AVAILABLE:
logger.warning("triton is not installed, reset use_triton to False")
use_triton = False
device_map = self.hf_device_map
if device_map:
for name, device in device_map.items():
if device == "cpu":
logger.info(f"truly offloading {name} to cpu with hook.")
module = get_module_by_name_suffix(self.model, name)
remove_hook_from_module(module, recurse=True)
accelerate.cpu_offload_with_hook(module, CUDA_0)
layer_inputs = []
attention_masks = []
position_ids = []
layer_input_kwargs = []
layer_outputs = []
examples = self._prepare_examples_for_quantization(examples, batch_size)
forward_pass_use_cache = self.model.config.use_cache
self.model.config.use_cache = False
num_batches = len(examples)
layers = get_module_by_name_prefix(self.model, self.layers_block_name)
cur_layer_device = get_device(layers[0])
data_device = cur_layer_device if cache_examples_on_gpu else CPU
def store_input_hook(_, args, kwargs):
# Positional arguments.
layer_input = []
for inp in args:
layer_input.append(move_to_device(inp, data_device))
layer_inputs.append(layer_input)
# Keyword arguments.
if kwargs["attention_mask"] is not None:
attention_masks.append(kwargs["attention_mask"].to(data_device))
else:
attention_masks.append(None)
pos_ids = kwargs.get("position_ids", None)
if pos_ids is not None:
position_ids.append(move_to_device(pos_ids, data_device))
one_kwargs = {}
for k, v in kwargs.items():  # make sure other arguments are also captured
if k not in ["hidden_states", "attention_mask", "position_ids"]:
one_kwargs[k] = nested_move_to_device(v, data_device)
layer_input_kwargs.append(one_kwargs)
raise ValueError
force_layer_back_to_cpu = False
if get_device(layers[0]) == CPU:
layers[0] = layers[0].to(CUDA_0)
force_layer_back_to_cpu = True
ori_outside_layer_module_devices = {}
for module_name in self.outside_layer_modules:
module = get_module_by_name_prefix(self.model, module_name)
if module is None:
continue
ori_outside_layer_module_devices[module_name] = get_device(module)
if module is not None:
move_to_device(module, cur_layer_device)
# TODO: make this optional, backporting https://github.com/huggingface/optimum/blob/main/optimum/gptq/quantizer.py
handle = layers[0].register_forward_pre_hook(store_input_hook, with_kwargs=True)
for example in examples:
for k, v in example.items():
if len(v.shape) == 1:
v = v.unsqueeze(0)
example[k] = move_to_device(v, cur_layer_device)
try:
self.model(**example)
except ValueError:
pass
handle.remove()
move_to_device(layers[0], CPU if force_layer_back_to_cpu else cur_layer_device)
for module_name in self.outside_layer_modules:
module = get_module_by_name_prefix(self.model, module_name)
if module is not None:
move_to_device(module, ori_outside_layer_module_devices[module_name])
torch.cuda.empty_cache()
inside_layer_modules = self.inside_layer_modules
if not self.quantize_config.true_sequential:
inside_layer_modules = [sum(inside_layer_modules, [])]
quantizers = {}
for i in range(len(layers)):
logger.info(f"Start quantizing layer {i + 1}/{len(layers)}")
layer = layers[i]
force_layer_back_to_cpu = False
if get_device(layer) == CPU:
move_to_device(layer, CUDA_0)
force_layer_back_to_cpu = True
cur_layer_device = get_device(layer)
full = find_layers(layer)
for names in inside_layer_modules:
subset = {n: full[n] for n in names if n in full}
gptq = {}
for name in subset:
gptq[name] = GPTQ(subset[name])
gptq[name].quantizer.configure(
self.quantize_config.bits,
perchannel=True,
sym=self.quantize_config.sym,
mse=False,
)
def add_batch(name):
def tmp(_, inp, out):
# gptq is mutable.
gptq[name].add_batch(inp[0].data, out.data) # noqa: F821
return tmp
handles = []
for name in subset:
handles.append(subset[name].register_forward_hook(add_batch(name)))
for j in range(num_batches):
layer_input = []
for k, layer_inp in enumerate(layer_inputs[j]):
layer_input.append(move_to_device(layer_inp, cur_layer_device))
layer_attention_mask = move_to_device(attention_masks[j], cur_layer_device)
additional_layer_inputs = {"attention_mask": layer_attention_mask}
layer_position_ids = (
None if not position_ids else move_to_device(position_ids[j], cur_layer_device)
)
if layer_position_ids is not None:
additional_layer_inputs["position_ids"] = layer_position_ids
for k, v in layer_input_kwargs[j].items():
additional_layer_inputs[k] = nested_move_to_device(v, cur_layer_device)
layer(*layer_input, **additional_layer_inputs)
for h in handles:
h.remove()
for name in subset:
logger.info(f"Quantizing {name} in layer {i + 1}/{len(layers)}...")
scale, zero, g_idx = gptq[name].fasterquant(
percdamp=self.quantize_config.damp_percent,
group_size=self.quantize_config.group_size,
actorder=self.quantize_config.desc_act,
static_groups=self.quantize_config.static_groups,
)
quantizers[f"{self.layers_block_name}.{i}.{name}"] = (
gptq[name].quantizer.to(CPU if force_layer_back_to_cpu else cur_layer_device),
move_to_device(scale, CPU if force_layer_back_to_cpu else cur_layer_device),
move_to_device(zero, CPU if force_layer_back_to_cpu else cur_layer_device),
move_to_device(g_idx, CPU if force_layer_back_to_cpu else cur_layer_device),
)
gptq[name].free()
for j in range(num_batches):
layer_input = []
for k, layer_inp in enumerate(layer_inputs[j]):
layer_input.append(move_to_device(layer_inp, cur_layer_device))
layer_attention_mask = move_to_device(attention_masks[j], cur_layer_device)
additional_layer_inputs = {"attention_mask": layer_attention_mask}
layer_position_ids = None if not position_ids else move_to_device(position_ids[j], cur_layer_device)
if layer_position_ids is not None:
additional_layer_inputs["position_ids"] = layer_position_ids
for k, v in layer_input_kwargs[j].items():
additional_layer_inputs[k] = nested_move_to_device(v, cur_layer_device)
layer_output = move_to_device(
layer(*layer_input, **additional_layer_inputs)[0],
cur_layer_device if cache_examples_on_gpu else CPU,
)
layer_outputs.append([layer_output])
layers[i] = move_to_device(layer, CPU if force_layer_back_to_cpu else cur_layer_device)
del layer
del gptq
del layer_inputs
layer_inputs, layer_outputs = layer_outputs, [] # TODO: is it really OK to cache only the first positional argument?
torch.cuda.empty_cache()
pack_model(
model=self.model,
quantizers=quantizers,
bits=self.quantize_config.bits,
group_size=self.quantize_config.group_size,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=self.quantize_config.desc_act,
warmup_triton=autotune_warmup_after_quantized,
force_layer_back_to_cpu=force_layer_back_to_cpu,
use_marlin=self.quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN,
)
if device_map:
self.model = remove_hook_from_module(self.model, recurse=True)
self.model = simple_dispatch_model(self.model, device_map)
self.model.config.use_cache = forward_pass_use_cache
self._quantized = True
torch.cuda.empty_cache()
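# Typical calibration-and-quantization flow (a minimal sketch, assuming the public
# AutoGPTQForCausalLM wrapper exposed by this package; names and paths are placeholders):
#
#     quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
#     model = AutoGPTQForCausalLM.from_pretrained("facebook/opt-125m", quantize_config)
#     model.quantize(examples)  # `examples` as described above
#     model.save_quantized("opt-125m-4bit", use_safetensors=True)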
@property
def device(self):
if not self.hf_device_map:
return self.model.device
else:
device = [d for d in self.hf_device_map.values() if d not in {"disk"}][0]
return torch.device(device)
def to(self, device: Union[str, torch.device]):
self.model.to(device)
return self
def forward(self, *args, **kwargs):
return self.model(*args, **kwargs)
def generate(self, **kwargs):
"""shortcut for model.generate"""
with torch.inference_mode(), torch.amp.autocast(device_type=self.device.type):
return self.model.generate(**kwargs)
def prepare_inputs_for_generation(self, *args, **kwargs):
"""shortcut for model.prepare_inputs_for_generation"""
return self.model.prepare_inputs_for_generation(*args, **kwargs)
def push_to_hub(
self,
repo_id: str,
save_dir: Optional[str] = None,
use_safetensors: Optional[bool] = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
commit_message: Optional[str] = "Upload of AutoGPTQ quantized model",
use_auth_token: Optional[Union[bool, str]] = None,
private: Optional[bool] = None,
token: Optional[Union[bool, str]] = None,
create_pr: Optional[bool] = False,
) -> str:
"""
Upload the model to the Hugging Face Hub.
Parameters:
repo_id (`str`):
The name of the repository you want to push your model to. It should contain your organization name when
pushing to a given organization.
save_dir (`str`, *optional*):
The name of the local folder to save the model to.
If the model has already been saved, this parameter can be omitted.
use_safetensors (`bool`, *optional*):
Save the model using `safetensors`.
If the model has already been saved, this parameter can be omitted.
safetensors_metadata: (`dict`, *optional*, defaults to `None`):
Pass optional metadata dictionary to be saved in the `safetensors` model file(s).
Metadata is optional and is purely for informational purposes. It does not affect inference.
If `None`, no metadata will be saved.
commit_message (`str`, *optional*, defaults to `"Upload of AutoGPTQ quantized model"`):
Message to commit while pushing.
use_auth_token (`bool` or `str`, *optional*):
The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`
is not specified.
private (`bool`, *optional*):
Whether or not the repository created should be private.
token (`bool` or `str`, *optional*):
The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated
when running `huggingface-cli login` (stored in `~/.huggingface`).
create_pr (`bool`, *optional*, defaults to `False`):
Whether or not to create a PR with the uploaded files or directly commit.
"""
if (
self.quantize_config.model_name_or_path is None or not isdir(self.quantize_config.model_name_or_path)
) and save_dir is None:
raise ValueError(
"Quantized model should be saved first, or you can provide save_dir to make sure model is saved to local disk before uploading."
)
if save_dir is not None:
logger.info(f"Saving model to {save_dir}")
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
repo_url = create_repo(
repo_id=repo_id,
token=token,
private=private,
exist_ok=True,
repo_type="model",
)
repo_id = repo_url.repo_id
if self.quantize_config.model_name_or_path is not None:
work_dir = self.quantize_config.model_name_or_path
operations = [
CommitOperationAdd(path_or_fileobj=join(work_dir, f), path_in_repo=f) for f in os.listdir(work_dir)
]
logger.info(f"Uploading the following files to {repo_id}: {','.join(os.listdir(work_dir))}")
return create_commit(
repo_id=repo_id,
operations=operations,
commit_message=commit_message,
token=use_auth_token,
create_pr=create_pr,
repo_type="model",
)
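# Usage sketch for push_to_hub (illustrative; the repo id and directory are placeholders):
#
#     model.push_to_hub("my-org/opt-125m-4bit", save_dir="opt-125m-4bit", use_safetensors=True)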
def save_quantized(
self,
save_dir: str,
use_safetensors: bool = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
):
"""save quantized model and configs to local disk"""
os.makedirs(save_dir, exist_ok=True)
if not self.quantized:
raise EnvironmentError("can only save quantized model, please execute .quantize first.")
self.model.to(CPU)
model_base_name = (
self.quantize_config.model_file_base_name
or f"gptq_model-{self.quantize_config.bits}bit-{self.quantize_config.group_size}g"
)
if use_safetensors:
model_save_name = model_base_name + ".safetensors"
state_dict = self.model.state_dict()
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
if safetensors_metadata is None:
safetensors_metadata = {}
elif not isinstance(safetensors_metadata, dict):
raise TypeError("safetensors_metadata must be a dictionary.")
else:
logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
new_safetensors_metadata = {}
converted_keys = False
for key, value in safetensors_metadata.items():
if not isinstance(key, str) or not isinstance(value, str):
converted_keys = True
try:
new_key = str(key)
new_value = str(value)
except Exception as e:
raise TypeError(
f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}"
)
if new_key in new_safetensors_metadata:
logger.warning(
f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting."
)
new_safetensors_metadata[new_key] = new_value
safetensors_metadata = new_safetensors_metadata
if converted_keys:
logger.debug(
f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}"
)
# Format is required to enable Accelerate to load the metadata
# otherwise it raises an OSError
safetensors_metadata["format"] = "pt"
# Store the quantization configuration as safetensors metadata
from auto_gptq import __version__
safetensors_metadata["auto_gptq_version"] = str(__version__)
safetensors_metadata["gptq_bits"] = str(self.quantize_config.bits)
safetensors_metadata["gptq_group_size"] = str(self.quantize_config.group_size)
safetensors_metadata["gptq_desc_act"] = str(self.quantize_config.desc_act)
safetensors_metadata["gptq_damp_percent"] = str(self.quantize_config.damp_percent)
safetensors_metadata["gptq_" + CHECKPOINT_FORMAT_FIELD] = self.quantize_config.checkpoint_format
safetensors_metadata["gptq_" + QUANT_METHOD_FIELD] = self.quantize_config.quant_method
safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
else:
model_save_name = model_base_name + ".bin"
torch.save(self.model.state_dict(), join(save_dir, model_save_name))
self.model.config.quantization_config = self.quantize_config.to_dict()
self.model.config.save_pretrained(save_dir)
self.quantize_config.save_pretrained(save_dir)
self.quantize_config.model_name_or_path = save_dir
self.quantize_config.model_file_base_name = model_base_name
def save_pretrained(
self,
save_dir: str,
use_safetensors: bool = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
**kwargs,
):
"""alias of save_quantized"""
logger.warning("you are using save_pretrained, which will re-direct to save_quantized.")
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
quantize_config: BaseQuantizeConfig,
max_memory: Optional[dict] = None,
trust_remote_code: bool = False,
torch_dtype: torch.dtype = torch.float16,
**model_init_kwargs,
):
"""load un-quantized pretrained model to cpu"""
if not torch.cuda.is_available():
raise EnvironmentError("Load pretrained model to do quantization requires CUDA available.")
def skip(*args, **kwargs):
pass
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
# Parameters related to loading from Hugging Face Hub
cache_dir = model_init_kwargs.pop("cache_dir", None)
force_download = model_init_kwargs.pop("force_download", False)
resume_download = model_init_kwargs.pop("resume_download", False)
proxies = model_init_kwargs.pop("proxies", None)
local_files_only = model_init_kwargs.pop("local_files_only", False)
use_auth_token = model_init_kwargs.pop("use_auth_token", None)
revision = model_init_kwargs.pop("revision", None)
subfolder = model_init_kwargs.pop("subfolder", "")
commit_hash = model_init_kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_commit_hash": commit_hash,
}
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=True, **cached_file_kwargs
)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
# enforce some values regardless of what the user specified
model_init_kwargs["torch_dtype"] = torch_dtype
model_init_kwargs["trust_remote_code"] = trust_remote_code
if max_memory:
if "disk" in max_memory:
raise NotImplementedError("disk offload not support yet.")
with accelerate.init_empty_weights():
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
model.tie_weights()
max_memory = accelerate.utils.get_balanced_memory(
model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
dtype=model_init_kwargs["torch_dtype"],
low_zero=False,
)
model_init_kwargs["device_map"] = accelerate.infer_auto_device_map(
model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
dtype=model_init_kwargs["torch_dtype"],
)
model_init_kwargs["low_cpu_mem_usage"] = True
del model
else:
model_init_kwargs["device_map"] = None
model_init_kwargs["low_cpu_mem_usage"] = False
torch.cuda.empty_cache()
merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **merged_kwargs)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
logger.warning("can't get model's sequence length from model config, will set to 4096.")
model.seqlen = 4096
model.eval()
return cls(model, False, quantize_config)
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
low_cpu_mem_usage: bool = False,
use_triton: bool = False,
use_qigen: bool = False,
use_marlin: bool = False,
torch_dtype: Optional[torch.dtype] = None,
inject_fused_attention: bool = False,
inject_fused_mlp: bool = False,
use_cuda_fp16: bool = True,
quantize_config: Optional[BaseQuantizeConfig] = None,
model_basename: Optional[str] = None,
use_safetensors: bool = True,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
disable_exllama: Optional[bool] = None,
disable_exllamav2: bool = False,
use_tritonv2: bool = False,
checkpoint_format: Optional[str] = None,
**kwargs,
):
"""load quantized model from local disk"""
# If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones.
if disable_exllama is None:
if disable_exllamav2:
disable_exllama = False
else:
disable_exllama = True
# Parameters related to loading from Hugging Face Hub
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
use_auth_token = kwargs.pop("use_auth_token", None)
revision = kwargs.pop("revision", None)
subfolder = kwargs.pop("subfolder", "")
commit_hash = kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
if use_qigen and not QIGEN_AVAILABLE:
logger.warning("Qigen is not installed, reset use_qigen to False.")
use_qigen = False
if use_triton and use_tritonv2:
logger.warning(
"Both use_triton and use_tritonv2 are set to True. Defaulting to use_triton."
)
use_tritonv2 = False
if (use_triton or use_tritonv2) and not TRITON_AVAILABLE:
logger.warning("Triton is not installed, reset use_triton to False.")
use_triton = False
use_tritonv2 = False
if not disable_exllama and not EXLLAMA_KERNELS_AVAILABLE:
logger.warning(
"Exllama kernel is not installed, reset disable_exllama to True. "
"This may because you installed auto_gptq using a pre-build wheel "
"on Windows, in which exllama_kernels are not compiled. To use "
"exllama_kernels to further speedup inference, you can re-install "
"auto_gptq from source."
)
disable_exllama = True
if not disable_exllamav2 and not EXLLAMAV2_KERNELS_AVAILABLE:
logger.warning(
"Exllamav2 kernel is not installed, reset disable_exllamav2 to True. "
"This may because you installed auto_gptq using a pre-build wheel "
"on Windows, in which exllama_kernels are not compiled. To use "
"exllama_kernels to further speedup inference, you can re-install "
"auto_gptq from source."
)
disable_exllamav2 = True
if not AUTOGPTQ_CUDA_AVAILABLE:
logger.warning(
"CUDA kernels for auto_gptq are not installed, this will result in "
"very slow inference speed. This may because:\n"
"1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.\n"
"2. You are using pytorch without CUDA support.\n"
"3. CUDA and nvcc are not installed in your device."
)
if use_qigen and QIGEN_AVAILABLE:
logger.warning("QIgen is active. Ignores all settings related to cuda.")
inject_fused_attention = False
inject_fused_mlp = False
use_triton = False
disable_exllama = False
disable_exllamav2 = True
if not disable_exllamav2 and not disable_exllama:
logger.warning(
"You have activated both the exllama and exllamav2 kernels. Setting disable_exllama to True and keeping disable_exllamav2 as False."
)
disable_exllama = True
# == step1: prepare configs and file names == #
config = AutoConfig.from_pretrained(
model_name_or_path,
trust_remote_code=trust_remote_code,
**cached_file_kwargs,
)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
if quantize_config is None:
quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path, checkpoint_format=checkpoint_format, **cached_file_kwargs, **kwargs)
else:
if not isinstance(quantize_config, BaseQuantizeConfig):
quantize_config = BaseQuantizeConfig.from_quant_config(quantize_config, checkpoint_format)
if quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN:
# format marlin requires marlin kernel
use_marlin = True
marlin_compatible = _validate_marlin_device_support()
if use_marlin and not MARLIN_AVAILABLE:
raise TypeError("use_marlin is true but Marlin is not available due to cuda/device support.")
if not use_marlin and MARLIN_AVAILABLE:
unsupported_reason = _validate_marlin_compatibility(quantize_config)
if unsupported_reason is None and marlin_compatible:
logger.info(
"You passed a model that is compatible with the Marlin int4*fp16 GPTQ kernel but use_marlin is False. We recommend using `use_marlin=True` to use the optimized Marlin kernels for inference. Example: `model = AutoGPTQForCausalLM.from_quantized(..., use_marlin=True)`."
)
if model_basename is None:
if quantize_config.model_file_base_name:
possible_model_basenames = [quantize_config.model_file_base_name]
else:
possible_model_basenames = [
f"gptq_model-{quantize_config.bits}bit-{quantize_config.group_size}g",
"model",
]
else:
possible_model_basenames = [model_basename]
quantize_config.model_name_or_path = model_name_or_path
extensions = []
if use_safetensors:
extensions.append(".safetensors")
else:
extensions += [".bin", ".pt"]
model_name_or_path = str(model_name_or_path)
# Retrieve (and if necessary download) the quantized checkpoint(s).
is_sharded, resolved_archive_file, true_model_basename = get_checkpoints(model_name_or_path=model_name_or_path, extensions=extensions, possible_model_basenames=possible_model_basenames, **cached_file_kwargs)
quantize_config.model_file_base_name = true_model_basename
model_save_name = resolved_archive_file # In case a model is sharded, this would be `model.safetensors.index.json` which may later break.
if (not disable_exllama or not disable_exllamav2) and trainable:
logger.warning(
"QuantLinear with the exllama backend not does support the trainable mode yet, switching to cuda/cuda_old/triton backend."
)
disable_exllama = True
disable_exllamav2 = True
elif not (use_triton or use_tritonv2) and trainable:
logger.warning(
"QuantLinear with cuda backend not support trainable mode yet, Switch to the pytorch backend."
)
# == step2: convert model to gptq-model (replace Linear with QuantLinear) == #
def skip(*args, **kwargs):
pass
if torch_dtype is None:
if not use_qigen:
torch_dtype = torch.float16
else:
torch_dtype = torch.float32
if torch_dtype != torch.float16:
logger.warning("Overriding use_cuda_fp16 to False since torch_dtype is not torch.float16.")
use_cuda_fp16 = False
if not use_qigen:
torch.nn.init.kaiming_uniform_ = skip
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
transformers.modeling_utils._init_weights = False
init_contexts = [no_init_weights()]
if low_cpu_mem_usage:
init_contexts.append(accelerate.init_empty_weights(include_buffers=False))
with ContextManagers(init_contexts):
model = AutoModelForCausalLM.from_config(
config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype
)
layers = find_layers(model)
ignore_layers = [cls.lm_head_name] + cls.outside_layer_modules
for name in list(layers.keys()):
if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers) or all(
not name.endswith(ignore_layer)
for sublist in cls.inside_layer_modules
for ignore_layer in sublist
):
logger.info(f"The layer {name} is not quantized.")
del layers[name]
make_quant(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
use_triton=use_triton,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act,
trainable=trainable,
use_tritonv2=use_tritonv2,
)
model.tie_weights()
# == step3: load checkpoint and dispatch == #
if isinstance(device_map, str) and device_map not in [
"auto",
"balanced",
"balanced_low_0",
"sequential",
]:
raise ValueError(
"If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
"'sequential'."
)
if isinstance(device_map, dict):
max_memory = None
else:
if device is None and not device_map and not max_memory:
device_map = "auto"
if device is not None:
device = torch.device(device)
if not max_memory and not device_map:
device_map = {"": device.index if device.type == "cuda" else device.type}
if not isinstance(device_map, dict) and device_map != "sequential":
max_memory = accelerate.utils.get_balanced_memory(
model=model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
low_zero=(device_map == "balanced_low_0"),
)
if not isinstance(device_map, dict):
device_map = accelerate.infer_auto_device_map(
model,
max_memory=max_memory,
no_split_module_classes=[cls.layer_type],
)
if low_cpu_mem_usage:
make_sure_no_tensor_in_meta_device(
model,
use_triton,
quantize_config.desc_act,
quantize_config.group_size,
bits=quantize_config.bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_tritonv2=use_tritonv2,
)
# TODO: move this logic in an awq_utils.py file.
if quantize_config.checkpoint_format == CHECKPOINT_FORMAT.AWQ_GEMM:
if is_sharded:
raise ValueError("The loading of sharded checkpoints with AWQ checkpoints is currently not supported. Please raise an issue in AutoGPTQ repository.")
if use_marlin:
raise ValueError(
"Tried to load an AWQ model with use_marlin=True. This is currently not supported. Please open an issue in AutoGPTQ repository."
)
model_cache_name, is_cached = quantize_config.get_cache_file_path()
if is_cached:
model_save_name = model_cache_name
logger.info(f"Loading an AWQ model, detected a cached repacked weight at {model_save_name}.")
else:
logger.info(
"Loading an AWQ model. This requires repacking the weights, and no cached repacked checkpoint was found. Grab a coffee!"
)
if "safetensors" not in model_save_name:
raise NotImplementedError(
f"Conversion from AWQ checkpoints is implemented only for safetensors checkpoints, found {model_save_name}"
)
if quantize_config.bits != 4:
raise NotImplementedError(
f"Conversion from AWQ checkpoints is supported only for 4 bits models. Found {quantize_config.bits} bits."
)
gptq_layers = set()
non_gptq_params = set()
with safe_open(model_save_name, framework="pt") as f:
for state_dict_key in f.keys():
if (
"qweight" not in state_dict_key
and "qzeros" not in state_dict_key
and "scales" not in state_dict_key
):
non_gptq_params.add(state_dict_key)
continue
# e.g. prefix "model.layers.3.self_attn.k_proj"
prefix, _ = state_dict_key.rsplit(".", 1)
gptq_layers.add(prefix)
new_state_dict = {}
for state_dict_key in non_gptq_params:
new_state_dict[state_dict_key] = f.get_tensor(state_dict_key)
gptq_layers = sorted(gptq_layers)
max_layer_name_length = len(max(gptq_layers, key=len))
pbar = tqdm(gptq_layers)
i = 0
for gptq_layer_name in pbar:
i += 1
desc = f"Unpacking {gptq_layer_name} + '...'"
desc = desc + " " * (max_layer_name_length - len(desc))
awq_qweight = f.get_tensor(gptq_layer_name + ".qweight")
awq_qzeros = f.get_tensor(gptq_layer_name + ".qzeros")
awq_scales = f.get_tensor(gptq_layer_name + ".scales")
# TODO: add FAST unpacking.
unpacked_qweight, unpacked_qzeros = unpack_awq(
awq_qweight,
awq_qzeros,
awq_scales,
bits=quantize_config.bits,
group_size=quantize_config.group_size,
)
# TODO: add FAST repacking, this is too slow.
desc = f"Repacking {gptq_layer_name}..."
desc = desc + " " * (max_layer_name_length + 12 - len(desc))
pbar.set_description(desc)
gptq_qweight, gptq_qzeros = pack_from_tensors(
unpacked_qweight,
unpacked_qzeros,
awq_scales,
bits=quantize_config.bits,
group_size=quantize_config.group_size,
)
new_state_dict[gptq_layer_name + ".qweight"] = gptq_qweight
new_state_dict[gptq_layer_name + ".qzeros"] = gptq_qzeros
new_state_dict[gptq_layer_name + ".scales"] = awq_scales
safe_save(new_state_dict, model_cache_name)
model_save_name = model_cache_name
if use_marlin:
if is_sharded:
raise ValueError("The loading of sharded checkpoints with Marlin is currently not supported. Please raise an issue in AutoGPTQ repository.")
if torch.version.hip:
raise ValueError("Can not use Marlin int4*fp16 kernel with AMD ROCm version of PyTorch as the kernel is not compatible. Please do not use `use_marlin=True` when using ROCm devices.")
if not _validate_marlin_device_support():
raise ValueError(f'Can not use Marlin int4*fp16 kernel with a device of compute capability {torch.cuda.get_device_capability()}, the minimum compute capability is 8.0 for Marlin kernel. Please do not use `use_marlin=True`, or please upgrade your GPU ("The more you buy, the more you save." - Taiwanese proverb).')
# Validate the model can run in Marlin.
if torch_dtype != torch.float16:
raise ValueError("Marlin kernel requires torch_dtype=torch.float16.")
unsupported_reason = _validate_marlin_compatibility(quantize_config)
if unsupported_reason is not None:
raise ValueError(
f"The model {model_name_or_path} can not be converted to use the Marlin kernel for the following reason: {unsupported_reason}, which is not supported by Marlin kernel."
)
# Load the quant linear type we need.
# TODO: load Marlin directly with the right QuantLinear class.
quant_linear_class = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=quantize_config.desc_act,
group_size=quantize_config.group_size,
bits=quantize_config.bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_marlin=False,
use_tritonv2=use_tritonv2,  # Get the "original" QuantLinear class
)
# Prepare the model for Marlin load.
# If the checkpoint is Marlin-serialized --> load it directly.
# If the checkpoint has a cached Marlin version --> load from the cached version.
# Otherwise --> convert to Marlin, cache it, and load from the cache.
model, model_save_name = prepare_model_for_marlin_load(
model=model,
quantize_config=quantize_config,
quant_linear_class=quant_linear_class,
torch_dtype=torch_dtype,
current_model_save_name=model_save_name,
device_map=device_map,
)
# Disable incompatible optimizations.
if inject_fused_attention or inject_fused_mlp:
# TODO: Validate whether that can be used.
logger.info("Disabling fused attention and mlp injection because Marlin kernel is used.")
inject_fused_attention = False
inject_fused_mlp = False
load_checkpoint_in_model(
model,
dtype=torch_dtype, # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292
checkpoint=model_save_name,
device_map=device_map,
offload_state_dict=True,
offload_buffers=True,
)
# TODO: Why are we using this custom function and not dispatch_model?
model = simple_dispatch_model(model, device_map)
else:
# Using QiGen.
if is_sharded:
raise ValueError("The loading of sharded checkpoints with QiGen is currently not supported. Please raise an issue in AutoGPTQ repository.")
if quantize_config.desc_act:
NotImplementedError("desc_act=True is not yet supported with QiGen.")
model = AutoModelForCausalLM.from_config(
config, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype
)
layers = find_layers(model)
ignore_layers = [cls.lm_head_name] + cls.outside_layer_modules
for name in list(layers.keys()):
if any(name.startswith(ignore_layer) for ignore_layer in ignore_layers):
logger.info(f"{name} not been quantized, will be ignored when make_quant.")
del layers[name]
if model_save_name.endswith(".safetensors"):
checkpoint = safe_load(model_save_name)
else:
checkpoint = torch.load(model_save_name)
make_quant(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
use_triton=use_triton,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act,
trainable=trainable,
use_qigen=True,
use_tritonv2=use_tritonv2,
use_marlin=quantize_config.checkpoint_format == CHECKPOINT_FORMAT.MARLIN,
)
preprocess_checkpoint_qigen(
model,
layers,
quantize_config.bits,
quantize_config.group_size,
checkpoint,
)
model.load_state_dict(checkpoint)
# == step4: set seqlen == #
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
logger.warning("can't get model's sequence length from model config, will set to 4096.")
model.seqlen = 4096
# == step5: (optional) inject optimized module == #
if inject_fused_attention:
if cls.fused_attn_module_type is None:
inject_fused_attention = False
logger.warning(f"{cls.__name__} hasn't fused attention module yet, will skip inject fused attention.")
else:
cls.fused_attn_module_type.inject_to_model(
model,
use_triton=use_triton,
group_size=quantize_config.group_size,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act,
trainable=trainable,
bits=quantize_config.bits,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_tritonv2=use_tritonv2,
)
if inject_fused_mlp:
if cls.fused_mlp_module_type is None:
inject_fused_mlp = False
logger.warning(f"{cls.__name__} hasn't fused mlp module yet, will skip inject fused mlp.")
else:
cls.fused_mlp_module_type.inject_to_model(model, use_triton=use_triton)
# Any post-initialization that requires device information, for example buffer initialization on device.
model = autogptq_post_init(model, use_act_order=quantize_config.desc_act)
model.eval()
# == step6: (optional) warmup triton == #
if (use_triton or use_tritonv2) and warmup_triton:
if use_tritonv2:
from ..nn_modules.qlinear.qlinear_tritonv2 import QuantLinear
else:
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(model, seqlen=model.seqlen)
if inject_fused_mlp and cls.fused_mlp_module_type is not None:
cls.fused_mlp_module_type.warmup(model, seqlen=model.seqlen)
# == step7: make model compatible with peft
# cls.make_sure_compatible_with_peft(
# model,
# use_triton,
# quantize_config.desc_act,
# quantize_config.group_size,
# bits=quantize_config.bits,
# disable_exllama=disable_exllama,
# disable_exllamav2=disable_exllamav2,
# use_marlin=use_marlin,
# use_qigen=use_qigen,
# )
return cls(
model,
True,
quantize_config,
is_triton_backend=use_triton or use_tritonv2,
injected_fused_attention=inject_fused_attention,
injected_fused_mlp=inject_fused_mlp and (use_triton or use_tritonv2),
trainable=trainable,
)
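# Inference sketch after loading a quantized checkpoint (illustrative, assuming the
# AutoGPTQForCausalLM wrapper and a matching Hugging Face tokenizer; paths are placeholders):
#
#     model = AutoGPTQForCausalLM.from_quantized("opt-125m-4bit", device="cuda:0")
#     output_ids = model.generate(**tokenizer("auto-gptq is", return_tensors="pt").to(model.device))
#     print(tokenizer.decode(output_ids[0]))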
def warmup_triton(self, enabled: bool = True):
if not enabled:
return
if not TRITON_AVAILABLE:
logger.warning("triton is not available, skip warmup stage directly.")
return
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
if self.fused_mlp_module_type is not None:
self.fused_mlp_module_type.warmup(self.model, seqlen=self.model.seqlen)
def enable_trainable_mode(self, enabled: bool = True):
if not self.is_triton_backend and enabled:
raise NotImplementedError("For now, trainable mode only supports triton backend.")
for n, m in self.model.named_modules():
if hasattr(m, "trainable"):
setattr(m, "trainable", enabled)
def disable_trainable_mode(self):
self.enable_trainable_mode(enabled=False)
@staticmethod
def make_sure_compatible_with_peft(
model: PreTrainedModel,
use_triton: bool,
desc_act: bool,
group_size: int,
bits: int,
disable_exllama: bool = True,
disable_exllamav2: bool = False,
use_marlin: bool = False,
use_qigen: bool = False,
use_tritonv2: bool = False,
):
GeneralQuantLinear.inject_to_model(
model,
dynamically_import_QuantLinear(use_triton, desc_act, group_size, bits=bits, disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_marlin=use_marlin, use_qigen=use_qigen),
)
def __getattr__(self, item):
try:
return super().__getattr__(item)
except Exception:
return getattr(self.model, item)
__all__ = ["BaseGPTQForCausalLM", "BaseQuantizeConfig"]
from torch import device
from ..utils.import_utils import compare_transformers_version
CPU = device("cpu")
CUDA_0 = device("cuda:0")
SUPPORTED_MODELS = [
"bloom",
"gptj",
"gpt2",
"gpt_neox",
"opt",
"moss",
"gpt_bigcode",
"codegen",
"RefinedWebModel",
"RefinedWeb",
"baichuan",
"internlm",
"qwen",
"xverse",
"deci",
"stablelm_epoch",
"mpt",
"cohere",
]
if compare_transformers_version("v4.28.0", op="ge"):
SUPPORTED_MODELS.append("llama")
if compare_transformers_version("v4.30.0", op="ge"):
SUPPORTED_MODELS.append("longllama")
if compare_transformers_version("v4.33.0", op="ge"):
SUPPORTED_MODELS.append("falcon")
if compare_transformers_version("v4.34.0", op="ge"):
SUPPORTED_MODELS.append("mistral")
SUPPORTED_MODELS.append("Yi")
if compare_transformers_version("v4.36.0", op="ge"):
SUPPORTED_MODELS.append("mixtral")
if compare_transformers_version("v4.37.0", op="ge"):
SUPPORTED_MODELS.append("qwen2")
SUPPORTED_MODELS.append("phi")
if compare_transformers_version("v4.38.0", op="ge"):
SUPPORTED_MODELS.append("gemma")
if compare_transformers_version("v4.39.0.dev0", op="ge"):
SUPPORTED_MODELS.append("starcoder2")
if compare_transformers_version("v4.43.0.dev0", op="ge"):
SUPPORTED_MODELS.append("gemma2")
EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048
__all__ = ["CPU", "CUDA_0", "SUPPORTED_MODELS", "EXLLAMA_DEFAULT_MAX_INPUT_LENGTH"]
import json
import logging
import os
from logging import getLogger
from typing import List, Optional, Union
import accelerate
import numpy as np
import torch
import torch.nn as nn
import transformers
import threadpoolctl as tctl
from tqdm import tqdm
from transformers import AutoConfig
from transformers.utils.hub import cached_file
from ..utils.import_utils import dynamically_import_QuantLinear
from ..utils.modeling_utils import recurse_setattr
from ._const import CPU, CUDA_0, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, SUPPORTED_MODELS
logger = getLogger(__name__)
handler = logging.StreamHandler()
formatter = logging.Formatter("%(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
def get_device(obj: Union[torch.Tensor, nn.Module]):
if isinstance(obj, torch.Tensor):
return obj.device
return next(obj.parameters()).device
def move_to_device(obj: Optional[Union[torch.Tensor, nn.Module]], device: torch.device):
if obj is None:
return obj
else:
if get_device(obj) != device:
obj = obj.to(device)
return obj
def find_layers(module, layers=None, name=""):
if not layers:
layers = [transformers.pytorch_utils.Conv1D, nn.Conv2d, nn.Linear]
for layer in layers:
if isinstance(module, layer):
return {name: module}
res = {}
for name1, child in module.named_children():
res.update(find_layers(child, layers=layers, name=name + "." + name1 if name != "" else name1))
return res
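# Example of the mapping returned by find_layers (illustrative): for a LLaMA-style decoder
# block it would look like {"self_attn.q_proj": Linear(...), "self_attn.k_proj": Linear(...), ...},
# i.e. fully-qualified submodule names mapped to the Linear/Conv2d/Conv1D modules to be quantized.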
def get_module_by_name_prefix(model, module_name: str):
for name, module in model.named_modules():
if name.startswith(module_name):
return module
def get_module_by_name_suffix(model, module_name: str):
for name, module in model.named_modules():
if name.endswith(module_name):
return module
def make_quant(
module,
names,
bits,
group_size,
name="",
use_triton: bool = False,
use_marlin: bool = False,
disable_exllama: Optional[bool] = None,
disable_exllamav2: bool = False,
use_qigen: bool = False,
use_cuda_fp16: bool = True,
desc_act: bool = False,
trainable: bool = False,
use_tritonv2: bool = False,
):
# If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones.
if disable_exllama is None:
if disable_exllamav2:
disable_exllama = False
else:
disable_exllama = True
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
use_marlin=use_marlin,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_qigen=use_qigen,
use_tritonv2=use_tritonv2,
)
if isinstance(module, QuantLinear):
return
for name, submodule in module.named_modules():
if name in names:
ori_layer_device = next(submodule.parameters()).device
if isinstance(submodule, nn.Linear):
in_features = submodule.in_features
out_features = submodule.out_features
elif isinstance(submodule, nn.Conv2d):
in_features = submodule.in_channels
out_features = submodule.out_channels
elif isinstance(submodule, transformers.pytorch_utils.Conv1D):
in_features = submodule.weight.shape[0]
out_features = submodule.weight.shape[1]
bias = submodule.bias is not None
if (
(not (desc_act) or group_size == -1)
and not use_triton
and not use_qigen
and not use_tritonv2
):
new_layer = QuantLinear(
bits,
group_size,
in_features,
out_features,
bias,
use_cuda_fp16=use_cuda_fp16,
trainable=trainable,
weight_dtype=submodule.weight.dtype,
)
else:
new_layer = QuantLinear(
bits,
group_size,
in_features,
out_features,
bias,
trainable=trainable,
weight_dtype=submodule.weight.dtype,
)
new_layer.device = ori_layer_device
recurse_setattr(module, name, new_layer.to(ori_layer_device))
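# Note on make_quant: `names` is the set of fully-qualified layer names to replace (as
# produced by find_layers), and each matching nn.Linear / nn.Conv2d / Conv1D submodule is
# swapped for the QuantLinear implementation selected by dynamically_import_QuantLinear.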
def preprocess_checkpoint_qigen(
module,
names,
bits,
group_size,
checkpoint,
name="",
):
try:
import cQIGen as qinfer
except ImportError:
logger.error("cQIGen not installed.")
raise
QuantLinear = dynamically_import_QuantLinear(
use_triton=False,
desc_act=False,
group_size=group_size,
bits=bits,
disable_exllama=False,
use_qigen=True,
)
if isinstance(module, QuantLinear):
in_features = module.infeatures
out_features = module.outfeatures
zeros = checkpoint[name + ".qzeros"]
scales = checkpoint[name + ".scales"].float()
if zeros.dtype != torch.float32:
new_zeros = torch.zeros_like(scales).float().contiguous()
if bits == 4:
qinfer.unpack_zeros4(zeros, new_zeros, new_zeros.shape[0], new_zeros.shape[1])
elif bits == 2:
qinfer.unpack_zeros2(zeros, new_zeros, new_zeros.shape[0], new_zeros.shape[1])
elif bits == 3:
logger.info("Unpacking zeros for 3 bits")
new_scales = scales.contiguous()
else:
if scales.shape[1] != out_features:
new_scales = scales.transpose(0, 1).contiguous()
else:
new_scales = scales.contiguous()
if zeros.shape[1] != out_features:
new_zeros = zeros.transpose(0, 1).contiguous()
else:
new_zeros = zeros.contiguous()
checkpoint[name + ".zeros"], checkpoint[name + ".scales"] = (
new_zeros,
new_scales,
)
del checkpoint[name + ".qzeros"]
del checkpoint[name + ".g_idx"]
if name + ".bias" in checkpoint:
checkpoint[name + ".bias"] = checkpoint[name + ".bias"].float()
else:
checkpoint[name + ".bias"] = torch.zeros(out_features)
checkpoint_qweight = checkpoint[name + ".qweight"].int().contiguous()
if bits == 4:
qweight = torch.zeros(int(in_features // 8 * out_features)).int().contiguous()
qinfer.pack4(
checkpoint_qweight,
qweight,
in_features // 8,
out_features,
module.mb,
module.tb,
module.cutoff,
) # * (module.tt//tb))
elif bits == 3:
qweight = torch.zeros(int(in_features // 32 * 3 * out_features)).int().contiguous()
qinfer.pack3(
checkpoint_qweight,
qweight,
in_features // 32 * 3,
out_features,
module.mb // 32 * 3,
module.tb,
module.cutoff,
)
elif bits == 2:
qweight = torch.zeros(int(in_features // 16 * out_features)).int().contiguous()
qinfer.pack2(
checkpoint_qweight,
qweight,
in_features // 16,
out_features,
module.mb,
module.tb,
module.cutoff,
) # * (module.tt//tb))
checkpoint[name + ".qweight"] = qweight
return
for name1, child in module.named_children():
preprocess_checkpoint_qigen(
child,
names,
bits,
group_size,
checkpoint,
name + "." + name1 if name != "" else name1,
)
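# Note on pack_model: `quantizers` maps fully-qualified layer names to tuples of
# (quantizer, scale, zero, g_idx) as produced by BaseGPTQForCausalLM.quantize(); each
# fp16 layer is then re-encoded on CPU into the packed integer buffers of QuantLinear.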
def pack_model(
model,
quantizers,
bits,
group_size,
use_triton=False,
use_cuda_fp16=True,
desc_act=False,
warmup_triton: bool = False,
force_layer_back_to_cpu: bool = False,
use_marlin: bool = False,
use_tritonv2: bool = False,
):
# Select the exllama / exllamav2 kernels used for packing.
disable_ex = True
disable_exv2 = False
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=desc_act,
group_size=group_size,
bits=bits,
disable_exllama=disable_ex,
disable_exllamav2=disable_exv2,
use_marlin=use_marlin,
use_tritonv2=use_tritonv2,
)
if force_layer_back_to_cpu:
model.to(CPU)
logger.info("Packing model...")
layers = find_layers(model)
layers = {n: layers[n] for n in quantizers}
make_quant(
model,
quantizers,
bits,
group_size,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=desc_act,
disable_exllama=disable_ex,
disable_exllamav2=disable_exv2,
use_marlin=use_marlin,
)
qlayers = find_layers(model, [QuantLinear])
# TODO remove once pack() thread regression is fixed
# Limit pack() thread usage to avoid slow-down: applies limit to all supported libs
with tctl.threadpool_limits(limits=1):
pbar = tqdm(qlayers.keys(), leave=True)
for name in pbar:
pbar.set_description(f"Packing {name}...", refresh=True)
quantizers[name], scale, zero, g_idx = quantizers[name]
# so far can only pack layer on CPU
layer_device = qlayers[name].device
qlayers[name].to(CPU)
layers[name], scale, zero, g_idx = (
layers[name].to(CPU),
scale.to(CPU),
zero.to(CPU),
g_idx.to(CPU),
)
if QuantLinear.QUANT_TYPE == "marlin":
qlayers[name].pack(layers[name], scale)
else:
qlayers[name].pack(layers[name], scale, zero, g_idx)
qlayers[name].to(layer_device)
logger.info("Model packed.")
if use_triton and warmup_triton:
logger.warning(
"using autotune_warmup will move model to GPU, make sure you have enough VRAM to load the whole model."
)
QuantLinear.warmup(model.to(CUDA_0), seqlen=model.seqlen)
def check_and_get_model_type(model_dir, trust_remote_code=False):
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=trust_remote_code)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
model_type = config.model_type
return model_type
def simple_dispatch_model(model, device_map):
from accelerate.hooks import AlignDevicesHook, add_hook_to_module
if "" in device_map:
d = device_map[""]
model = model.to(torch.device(d))
model.hf_device_map = device_map
return model
tied_params = accelerate.utils.modeling.find_tied_parameters(model)
if set(device_map.values()) == {"cpu"} or set(device_map.values()) == {
"cpu",
"disk",
}:
main_device = "cpu"
else:
main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0]
cpu_offload_group = [(n, d) for n, d in device_map.items() if d == "cpu"]
prev_hook = None
for idx, (n, d) in enumerate(cpu_offload_group):
m = get_module_by_name_suffix(model, n)
_, prev_hook = accelerate.cpu_offload_with_hook(m, execution_device=main_device, prev_module_hook=prev_hook)
# set first cpu offload module's prev_module_hook to the last cpu offload module's hook
if len(cpu_offload_group) > 1:
get_module_by_name_suffix(model, cpu_offload_group[0][0])._hf_hook.prev_module_hook = prev_hook
for n, d in device_map.items():
m = get_module_by_name_suffix(model, n)
if d != "cpu":
d = torch.device(d)
hook = AlignDevicesHook(d, io_same_device=True, place_submodules=True)
add_hook_to_module(m, hook)
accelerate.utils.modeling.retie_parameters(model, tied_params)
model.hf_device_map = device_map
return model
def autogptq_post_init(model, use_act_order: bool, max_input_length: Optional[int] = None):
"""
The max_input_length argument is specific to the exllama backend, which requires initializing a temp_state buffer.
"""
device_to_buffers_size = {}
model_uses_exllama = False
for name, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllama":
model_uses_exllama = True
device = submodule.qweight.device
if device not in device_to_buffers_size:
device_to_buffers_size[device] = {
"max_dq_buffer_size": 1,
"max_inner_outer_dim": 1,
}
if not use_act_order:
submodule._use_act_order = False
else:
submodule._use_act_order = True
# Disable this heuristic for detecting act_order, but it could be used instead of the config.
"""
if submodule.g_idx is None:
submodule.act_order = False
elif submodule.g_idx is not None and ((submodule.g_idx == 0).all() or torch.equal(submodule.g_idx.cpu(), torch.tensor([i // submodule.group_size for i in range(submodule.g_idx.shape[0])], dtype=torch.int32))):
submodule.g_idx = None
submodule.act_order = False
else:
submodule.act_order = True
"""
device_to_buffers_size[device]["max_dq_buffer_size"] = max(
device_to_buffers_size[device]["max_dq_buffer_size"],
submodule.qweight.numel() * 8,
)
if use_act_order:
device_to_buffers_size[device]["max_inner_outer_dim"] = max(
device_to_buffers_size[device]["max_inner_outer_dim"],
submodule.infeatures,
submodule.outfeatures,
)
if model_uses_exllama:
# To be honest this is quite ugly, not proud of this.
try:
from exllama_kernels import prepare_buffers, set_tuning_params
except ImportError as e:
raise ImportError(
f"Could not import exllama backend dependencies prepare_buffers, set_tuning_params with the following error: {e}"
)
device_to_buffers = {}
if use_act_order:
if max_input_length is None:
max_input_len = EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
else:
max_input_len = max_input_length
else:
if max_input_length is not None:
logger.info(
"Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored."
)
max_input_len = 1
for device, buffers_size in device_to_buffers_size.items():
# The temp_state buffer is required to reorder X in the act-order case.
# The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
device_to_buffers[device] = {
"temp_state": torch.zeros(
(max_input_len, buffers_size["max_inner_outer_dim"]),
dtype=torch.float16,
device=device,
),
"temp_dq": torch.zeros(
(1, buffers_size["max_dq_buffer_size"]),
dtype=torch.float16,
device=device,
),
"max_dq_buffer_size": buffers_size["max_dq_buffer_size"],
"max_inner_outer_dim": buffers_size["max_inner_outer_dim"],
}
# Buffers need to be persistent to avoid any bug.
model.device_to_buffers = device_to_buffers
for device, buffers in model.device_to_buffers.items():
prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
# Using the default from exllama repo here.
matmul_recons_thd = 8
matmul_fused_remap = False
matmul_no_half2 = False
set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
# The buffers need to have been initialized first before calling make_q4.
for name, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllama":
submodule.post_init()
## exllamav2
fixed_bytes = {}
model_uses_exllamav2 = False
for _, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE"):
if submodule.QUANT_TYPE == "exllamav2":
model_uses_exllamav2 = True
device = submodule.qweight.device
scratch_fixed = submodule.scratch_space_fixed()
fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0))
elif submodule.QUANT_TYPE == "hpu":
submodule.post_init()
if model_uses_exllamav2:
from ..nn_modules.qlinear.qlinear_exllamav2 import ExLlamaV2DeviceTensors
device_tensors = {}
for device, scratch_bytes in fixed_bytes.items():
device_tensors[device] = ExLlamaV2DeviceTensors(device.index, scratch_bytes)
        # Keep the scratch buffers persistent on the model, otherwise we will run out of memory.
model.device_tensors = device_tensors
for _, submodule in model.named_modules():
if hasattr(submodule, "QUANT_TYPE") and submodule.QUANT_TYPE == "exllamav2":
device = submodule.qweight.device
submodule.post_init(temp_dq=model.device_tensors[device])
torch.cuda.empty_cache()
return model
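# Usage sketch (illustrative only; in practice `from_quantized` is expected to call this during
# model loading, and the variable names below are hypothetical):
#
#   quantized_model = autogptq_post_init(
#       quantized_model,
#       use_act_order=quantize_config.desc_act,
#       max_input_length=4096,  # only used with act-order: sizes the exllama temp_state buffer
#   )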
def make_sure_no_tensor_in_meta_device(
    model, use_triton: bool, desc_act: bool, group_size: int, bits: int,
    disable_exllama: bool, disable_exllamav2: bool, use_marlin: bool = False, use_tritonv2: bool = False,
):
    QuantLinear = dynamically_import_QuantLinear(
        use_triton, desc_act, group_size, bits=bits, disable_exllama=disable_exllama,
        disable_exllamav2=disable_exllamav2, use_marlin=use_marlin, use_tritonv2=use_tritonv2,
    )
for n, m in model.named_modules():
if isinstance(m, QuantLinear) and m.bias is not None and m.bias.device == torch.device("meta"):
m.register_buffer("bias", torch.zeros((m.outfeatures), dtype=torch.float16, device="cpu"))
def awq_reverse_reorder_int_tensor(int_tensor, bits: int):
assert bits == 4
int_tensor = int_tensor.T.contiguous()
compress_ratio = 32 // bits
assert int_tensor.shape[-1] % compress_ratio == 0
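    # AWQ packs eight 4-bit values per 32-bit word in the interleaved order below; the
    # permutation built from `order_map` undoes that interleaving so the values can be
    # re-packed sequentially (GPTQ order).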
order_map = [0, 2, 4, 6, 1, 3, 5, 7]
order_tensor = torch.tensor(order_map, dtype=torch.int32, device=int_tensor.device).reshape(1, -1)
order_tensor = order_tensor.repeat(int_tensor.shape[1] // compress_ratio, 1)
order_tensor = order_tensor + torch.arange(
0,
int_tensor.shape[1],
compress_ratio,
dtype=torch.int32,
device=int_tensor.device,
).reshape(-1, 1)
order_tensor = order_tensor.reshape(-1)
reverse_order_tensor = torch.arange(order_tensor.shape[0]).cuda()[order_tensor]
reverse_order_tensor = reverse_order_tensor[order_tensor]
int_tensor = int_tensor[:, reverse_order_tensor]
return int_tensor
def unpack_awq(
awq_qweight: torch.Tensor,
awq_qzeros: torch.Tensor,
awq_scales: torch.Tensor,
bits: int,
group_size: int,
):
"""
Args:
awq_qweight (`torch.LongTensor`):
Expected shape: (in_features, out_features // (32 // bits))
awq_qzeros (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features // (32 // bits))
awq_scales (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features)
Returns:
fp16_weight (`torch.LongTensor`):
With shape (in_features, out_features).
zeros (`torch.LongTensor`):
With shape (in_features // group_size, out_features).
"""
assert bits == 4
qzeros = awq_qzeros.cuda()
qweight = awq_qweight.cuda()
qweight = qweight.T.contiguous()
infeatures = awq_qweight.shape[0]
wf = torch.tensor(list(range(0, 32, bits)), dtype=torch.int32, device=qzeros.device).unsqueeze(0)
zeros = torch.bitwise_right_shift(torch.unsqueeze(qzeros, 2), wf.unsqueeze(0)).to(
torch.int16 if bits == 8 else torch.int8
)
# zeros = zeros + 1
torch.bitwise_and(zeros, (2**bits) - 1, out=zeros)
zeros = zeros.reshape(-1, 1, zeros.shape[1] * zeros.shape[2])
weight = torch.bitwise_right_shift(torch.unsqueeze(qweight, 1), wf.unsqueeze(-1)).to(
torch.int16 if bits == 8 else torch.int8
)
torch.bitwise_and(weight, (2**bits) - 1, out=weight)
weight = weight.reshape(-1, group_size, weight.shape[2])
weight = weight.view(-1, weight.shape[-1])
zeros = zeros.view(-1, zeros.shape[-1])
zeros = zeros.T.contiguous()
zeros = awq_reverse_reorder_int_tensor(zeros, bits)
weight = awq_reverse_reorder_int_tensor(weight, bits)
# Dequantize weights.
scales = awq_scales.cuda()
zeros = zeros.contiguous()
scale_zeros = zeros * scales
g_idx = torch.tensor([i // group_size for i in range(infeatures)], dtype=torch.int32)
scale_mat = scales[g_idx]
scale_zeros_mat = scale_zeros[g_idx].half()
qdq_weight_T = weight * scale_mat - scale_zeros_mat.half()
fp16_weight = qdq_weight_T.T.cuda()
return fp16_weight, zeros
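# Usage sketch (illustrative only; `layer` is a hypothetical AWQ linear module with the packed
# buffers named as in AWQ checkpoints):
#
#   fp16_weight, unpacked_zeros = unpack_awq(
#       awq_qweight=layer.qweight,  # int32, (in_features, out_features // 8) for 4-bit
#       awq_qzeros=layer.qzeros,    # int32, (in_features // group_size, out_features // 8)
#       awq_scales=layer.scales,    # fp16,  (in_features // group_size, out_features)
#       bits=4,
#       group_size=128,
#   )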
def pack_from_tensors(
unpacked_qweight: torch.Tensor,
unpacked_qzeros: torch.Tensor,
awq_scales: torch.Tensor,
bits: int,
group_size: int,
):
"""
Args:
unpacked_qweight (`torch.LongTensor`):
Expected shape: (in_features, out_features)
unpacked_qzeros (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features)
awq_scales (`torch.LongTensor`):
Expected shape: (in_features // group_size, out_features)
Returns:
qweight (`torch.LongTensor`):
With shape (in_features // (32 // bits), out_features)
qzeros (`torch.LongTensor`):
With shape (in_features // group_size, out_features // (32 // bits))
"""
assert bits == 4
W = unpacked_qweight.clone().cpu()
# TODO: This should be checked somehow.
# if isinstance(linear, nn.Conv2d):
# W = W.flatten(1)
# if isinstance(linear, transformers.pytorch_utils.Conv1D):
# W = W.t()
awq_scales = awq_scales.t().contiguous()
unpacked_qzeros = unpacked_qzeros.contiguous()
unpacked_qzeros = unpacked_qzeros.cpu()
awq_scales = awq_scales.cpu()
scale_zeros = unpacked_qzeros.t() * awq_scales
scales = awq_scales.clone()
infeatures = unpacked_qweight.shape[1]
intweight = []
for idx in range(infeatures):
g_idx = idx // group_size
intweight.append(torch.round((W[:, idx] + scale_zeros[:, g_idx]) / scales[:, g_idx]).to(torch.int)[:, None])
intweight = torch.cat(intweight, dim=1)
intweight = intweight.t().contiguous()
intweight = intweight.numpy().astype(np.uint32)
i = 0
row = 0
qweight = np.zeros((intweight.shape[0] // 32 * bits, intweight.shape[1]), dtype=np.uint32)
while row < qweight.shape[0]:
for j in range(i, i + (32 // bits)):
qweight[row] |= intweight[j] << (bits * (j - i))
i += 32 // bits
row += 1
qweight = qweight.astype(np.int32)
qweight = torch.from_numpy(qweight)
unpacked_qzeros = unpacked_qzeros - 1
torch.bitwise_and(unpacked_qzeros, (2**bits) - 1, out=unpacked_qzeros)
unpacked_qzeros = unpacked_qzeros.numpy().astype(np.uint32)
qzeros = np.zeros(
(unpacked_qzeros.shape[0], unpacked_qzeros.shape[1] // 32 * bits),
dtype=np.uint32,
)
i = 0
col = 0
while col < qzeros.shape[1]:
for j in range(i, i + (32 // bits)):
qzeros[:, col] |= unpacked_qzeros[:, j] << (bits * (j - i))
i += 32 // bits
col += 1
qzeros = qzeros.astype(np.int32)
qzeros = torch.from_numpy(qzeros)
return qweight, qzeros
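# Usage sketch (illustrative only): chaining unpack_awq and pack_from_tensors converts an
# AWQ-packed layer into GPTQ-style qweight/qzeros, reusing the same scales.
#
#   fp16_weight, unpacked_zeros = unpack_awq(awq_qweight, awq_qzeros, awq_scales, bits=4, group_size=128)
#   qweight, qzeros = pack_from_tensors(fp16_weight, unpacked_zeros, awq_scales, bits=4, group_size=128)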
def get_checkpoints(
    model_name_or_path: str, extensions: List[str], possible_model_basenames: List[str], **cached_file_kwargs
):
    """
    Retrieves (and, if necessary, downloads from the Hugging Face Hub) the model checkpoint. Sharding is supported.
    All of the `possible_model_basenames` (e.g. `["model", "model-4bit-gptq"]`) are explored over all `extensions`
    (e.g. `[".bin", ".safetensors"]`).
    """
searched_files = []
resolved_archive_file = None
true_model_basename = None
if os.path.isdir(model_name_or_path):
for ext in extensions:
for possible_model_basename in possible_model_basenames:
shard_index_name = possible_model_basename + ext + ".index.json"
searched_files.append(shard_index_name)
possible_index_file = os.path.join(model_name_or_path, shard_index_name)
if os.path.isfile(possible_index_file):
# The model is sharded over several checkpoints.
possible_model_basename = possible_index_file.replace(ext + ".index.json", "")
return True, possible_index_file, possible_model_basename
else:
model_save_name = os.path.join(model_name_or_path, possible_model_basename)
searched_files.append(possible_model_basename + ext)
if os.path.isfile(model_save_name + ext):
resolved_archive_file = model_save_name + ext
return False, resolved_archive_file, possible_model_basename
else:
temp = None
for ext in extensions:
for possible_model_basename in possible_model_basenames:
shard_index_name = possible_model_basename + ext + ".index.json"
shard_index = cached_file(
model_name_or_path,
shard_index_name,
**cached_file_kwargs,
)
searched_files.append(shard_index_name)
if shard_index is not None:
# The model is sharded over several checkpoints.
with open(str(shard_index)) as f:
index_json = json.load(f)
# Download the shards from the index.json.
shards = list(set(index_json["weight_map"].values()))
for shard in shards:
resolved_archive_file = cached_file(
model_name_or_path,
shard,
**cached_file_kwargs,
)
return True, shard_index, possible_model_basename
else:
resolved_archive_file = cached_file(
model_name_or_path,
possible_model_basename + ext,
**cached_file_kwargs,
)
if resolved_archive_file is None:
resolved_archive_file = temp
searched_files.append(possible_model_basename + ext)
if resolved_archive_file is not None:
temp = resolved_archive_file
return False, resolved_archive_file, possible_model_basename
if resolved_archive_file is None:
raise FileNotFoundError(
f"Could not find a model in {model_name_or_path} with a name in {', '.join(searched_files)}. Please specify the argument model_basename to use a custom file name."
)
return False, resolved_archive_file, true_model_basename
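# Usage sketch (illustrative only; the path and basenames are hypothetical):
#
#   is_sharded, resolved_archive_file, true_model_basename = get_checkpoints(
#       model_name_or_path="/path/to/quantized-model",
#       extensions=[".safetensors", ".bin"],
#       possible_model_basenames=["gptq_model-4bit-128g", "model"],
#   )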
__all__ = [
"get_device",
"move_to_device",
"find_layers",
"get_module_by_name_prefix",
"get_module_by_name_suffix",
"make_quant",
"preprocess_checkpoint_qigen",
"pack_model",
"autogptq_post_init",
"check_and_get_model_type",
"simple_dispatch_model",
"make_sure_no_tensor_in_meta_device",
]
from inspect import signature
from typing import Dict, Optional, Union
from ._base import BaseGPTQForCausalLM, BaseQuantizeConfig
from ._utils import check_and_get_model_type
from .baichuan import BaiChuanGPTQForCausalLM
from .bloom import BloomGPTQForCausalLM
from .codegen import CodeGenGPTQForCausalLM
from .cohere import CohereGPTQForCausalLM
from .decilm import DeciLMGPTQForCausalLM
from .gemma import GemmaGPTQForCausalLM
from .gemma2 import Gemma2GPTQForCausalLM
from .gpt2 import GPT2GPTQForCausalLM
from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
from .gpt_neox import GPTNeoXGPTQForCausalLM
from .gptj import GPTJGPTQForCausalLM
from .internlm import InternLMGPTQForCausalLM
from .llama import LlamaGPTQForCausalLM
from .longllama import LongLlamaGPTQForCausalLM
from .mistral import MistralGPTQForCausalLM
from .mixtral import MixtralGPTQForCausalLM
from .moss import MOSSGPTQForCausalLM
from .mpt import MPTGPTQForCausalLM
from .opt import OPTGPTQForCausalLM
from .phi import PhiGPTQForCausalLM
from .qwen import QwenGPTQForCausalLM
from .qwen2 import Qwen2GPTQForCausalLM
from .rw import RWGPTQForCausalLM
from .stablelmepoch import StableLMEpochGPTQForCausalLM
from .starcoder2 import Starcoder2GPTQForCausalLM
from .xverse import XverseGPTQForCausalLM
from .yi import YiGPTQForCausalLM
GPTQ_CAUSAL_LM_MODEL_MAP = {
"bloom": BloomGPTQForCausalLM,
"gpt_neox": GPTNeoXGPTQForCausalLM,
"gptj": GPTJGPTQForCausalLM,
"gpt2": GPT2GPTQForCausalLM,
"llama": LlamaGPTQForCausalLM,
"opt": OPTGPTQForCausalLM,
"moss": MOSSGPTQForCausalLM,
"gpt_bigcode": GPTBigCodeGPTQForCausalLM,
"codegen": CodeGenGPTQForCausalLM,
"cohere": CohereGPTQForCausalLM,
"RefinedWebModel": RWGPTQForCausalLM,
"RefinedWeb": RWGPTQForCausalLM,
"falcon": RWGPTQForCausalLM,
"baichuan": BaiChuanGPTQForCausalLM,
"internlm": InternLMGPTQForCausalLM,
"qwen": QwenGPTQForCausalLM,
"mistral": MistralGPTQForCausalLM,
"Yi": YiGPTQForCausalLM,
"xverse": XverseGPTQForCausalLM,
"deci": DeciLMGPTQForCausalLM,
"stablelm_epoch": StableLMEpochGPTQForCausalLM,
"starcoder2": Starcoder2GPTQForCausalLM,
"mixtral": MixtralGPTQForCausalLM,
"qwen2": Qwen2GPTQForCausalLM,
"longllama": LongLlamaGPTQForCausalLM,
"gemma": GemmaGPTQForCausalLM,
"gemma2": Gemma2GPTQForCausalLM,
"phi": PhiGPTQForCausalLM,
"mpt": MPTGPTQForCausalLM,
}
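# The keys above are Hugging Face `config.model_type` strings: check_and_get_model_type resolves
# a checkpoint to one of them, and from_pretrained / from_quantized dispatch to the matching
# subclass. A sketch of how a new architecture could be registered (hypothetical names; the
# model_type likely also has to be listed in SUPPORTED_MODELS in `_const`):
#
#   class NewArchGPTQForCausalLM(BaseGPTQForCausalLM):
#       layer_type = "NewArchDecoderLayer"
#       layers_block_name = "model.layers"
#       outside_layer_modules = ["model.embed_tokens", "model.norm"]
#       inside_layer_modules = [
#           ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
#           ["self_attn.o_proj"],
#           ["mlp.up_proj", "mlp.gate_proj"],
#           ["mlp.down_proj"],
#       ]
#
#   GPTQ_CAUSAL_LM_MODEL_MAP["new_arch"] = NewArchGPTQForCausalLM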
class AutoGPTQForCausalLM:
def __init__(self):
raise EnvironmentError(
"AutoGPTQModelForCausalLM is designed to be instantiated\n"
"using `AutoGPTQModelForCausalLM.from_pretrained` if want to quantize a pretrained model.\n"
"using `AutoGPTQModelForCausalLM.from_quantized` if want to inference with quantized model."
)
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
quantize_config: BaseQuantizeConfig,
max_memory: Optional[dict] = None,
trust_remote_code: bool = False,
**model_init_kwargs,
) -> BaseGPTQForCausalLM:
model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code)
return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
quantize_config=quantize_config,
max_memory=max_memory,
trust_remote_code=trust_remote_code,
**model_init_kwargs,
)
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
low_cpu_mem_usage: bool = False,
use_triton: bool = False,
inject_fused_attention: bool = False,
inject_fused_mlp: bool = False,
use_cuda_fp16: bool = True,
quantize_config: Optional[BaseQuantizeConfig] = None,
model_basename: Optional[str] = None,
use_safetensors: bool = True,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
disable_exllama: Optional[bool] = None,
disable_exllamav2: bool = False,
use_marlin: bool = False,
use_tritonv2: bool = False,
**kwargs,
) -> BaseGPTQForCausalLM:
# If disable_exllamav2 is True, we want to fall back on the exllama kernel and not the cuda/cuda_old ones.
if disable_exllama is None:
if disable_exllamav2:
disable_exllama = False
else:
disable_exllama = True
model_type = check_and_get_model_type(model_name_or_path, trust_remote_code)
quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
# A static list of kwargs needed for huggingface_hub
huggingface_kwargs = [
"cache_dir",
"force_download",
"proxies",
"resume_download",
"local_files_only",
"use_auth_token",
"revision",
"subfolder",
"_raise_exceptions_for_missing_entries",
"_commit_hash",
]
# TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
keywords = {
key: kwargs[key]
for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
if key in kwargs
}
return quant_func(
model_name_or_path=model_name_or_path,
device_map=device_map,
max_memory=max_memory,
device=device,
low_cpu_mem_usage=low_cpu_mem_usage,
use_triton=use_triton,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
use_cuda_fp16=use_cuda_fp16,
quantize_config=quantize_config,
model_basename=model_basename,
use_safetensors=use_safetensors,
trust_remote_code=trust_remote_code,
warmup_triton=warmup_triton,
trainable=trainable,
disable_exllama=disable_exllama,
disable_exllamav2=disable_exllamav2,
use_marlin=use_marlin,
use_tritonv2=use_tritonv2,
**keywords,
)
__all__ = ["AutoGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "DecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.W_pack"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["BaiChuanGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class BloomGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "BloomBlock"
layers_block_name = "transformer.h"
outside_layer_modules = [
"transformer.word_embeddings",
"transformer.word_embeddings_layernorm",
"transformer.ln_f",
]
inside_layer_modules = [
["self_attention.query_key_value"],
["self_attention.dense"],
["mlp.dense_h_to_4h"],
["mlp.dense_4h_to_h"],
]
__all__ = ["BloomGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class CodeGenGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "CodeGenBlock"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wte", "transformer.ln_f"]
inside_layer_modules = [
["attn.qkv_proj"],
["attn.out_proj"],
["mlp.fc_in"],
["mlp.fc_out"],
]
__all__ = ["CodeGenGPTQForCausalLM"]
from logging import getLogger
from ._base import BaseGPTQForCausalLM
logger = getLogger(__name__)
class CohereGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "CohereDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["CohereGPTQForCausalLM"]
from logging import getLogger
from ..utils.import_utils import compare_transformers_version
from ._base import BaseGPTQForCausalLM
if compare_transformers_version("v4.28.0", op="ge"):
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel
else:
FusedLlamaAttentionForQuantizedModel = None
FusedLlamaMLPForQuantizedModel = None
logger = getLogger(__name__)
class DeciLMGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "DeciLMDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
fused_mlp_module_type = FusedLlamaMLPForQuantizedModel
__all__ = ["DeciLMGPTQForCausalLM"]
from logging import getLogger
from ._base import BaseGPTQForCausalLM
logger = getLogger(__name__)
class GemmaGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GemmaDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["GemmaGPTQForCausalLM"]
from logging import getLogger
from ._base import BaseGPTQForCausalLM
logger = getLogger(__name__)
class Gemma2GPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "Gemma2DecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["Gemma2GPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class GPT2GPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPT2Block"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wte", "transformer.wpe", "transformer.ln_f"]
inside_layer_modules = [
["attn.c_attn"],
["attn.c_proj"],
["mlp.c_fc"],
["mlp.c_proj"],
]
__all__ = ["GPT2GPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class GPTBigCodeGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPTBigCodeBlock"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wpe", "transformer.wte", "transformer.ln_f"]
inside_layer_modules = [
["attn.c_attn"],
["attn.c_proj"],
["mlp.c_fc"],
["mlp.c_proj"],
]
__all__ = ["GPTBigCodeGPTQForCausalLM"]
from ._base import BaseGPTQForCausalLM
class GPTNeoXGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPTNeoXLayer"
layers_block_name = "gpt_neox.layers"
outside_layer_modules = ["gpt_neox.embed_in", "gpt_neox.final_layer_norm"]
inside_layer_modules = [
["attention.query_key_value"],
["attention.dense"],
["mlp.dense_h_to_4h"],
["mlp.dense_4h_to_h"],
]
lm_head_name = "embed_out"
__all__ = ["GPTNeoXGPTQForCausalLM"]
from ..nn_modules.fused_gptj_attn import FusedGPTJAttentionForQuantizedModel
from ._base import BaseGPTQForCausalLM
class GPTJGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "GPTJBlock"
layers_block_name = "transformer.h"
outside_layer_modules = ["transformer.wte", "transformer.ln_f"]
inside_layer_modules = [
["attn.k_proj", "attn.v_proj", "attn.q_proj"],
["attn.out_proj"],
["mlp.fc_in"],
["mlp.fc_out"],
]
fused_attn_module_type = FusedGPTJAttentionForQuantizedModel
__all__ = ["GPTJGPTQForCausalLM"]