// !!! This is a file automatically generated by hipify !!!
#include <ATen/dtk_macros.h>
// HIP runtime and fp16 headers for hipError_t/hipGetErrorString and the half intrinsics used below
// (they may already be pulled in transitively by the header above)
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#define DIVIDE(x, size) (((x) + (size) - 1) / (size))
#define DBGS(__x) printf("%s\n", __x)
#define DBGI(__x) printf("%s: %i\n", #__x, __x)
#define DBGI2(__x, __y) printf("%s, %s: %i, %i\n", #__x, #__y, __x, __y)
#define DBGI3(__x, __y, __z) printf("%s, %s, %s: %i, %i, %i\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGX(__x) printf("%s: %x\n", #__x, __x)
#define DBGX2(__x, __y) printf("%s, %s: %x, %x\n", #__x, #__y, __x, __y)
#define DBGX3(__x, __y, __z) printf("%s, %s, %s: %x, %x, %x\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGF(__x) printf("%s: %f\n", #__x, __x)
#define DBGF2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __x, __y)
#define DBGF3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __x, __y, __z)
#define DBGH(__x) printf("%s: %f\n", #__x, __half2float(__x))
#define DBGH2(__x, __y) printf("%s, %s: %f, %f\n", #__x, #__y, __half2float(__x), __half2float(__y))
#define DBGH3(__x, __y, __z) printf("%s, %s, %s: %f, %f, %f\n", #__x, #__y, #__z, __half2float(__x), __half2float(__y), __half2float(__z))
#define DBGIH(__x, __y) printf("%s, %s: %i, %f\n", #__x, #__y, __x, __half2float(__y))
#define DBGIH2(__x, __y, __z) printf("%s, %s, %s: %i, %f, %f\n", #__x, #__y, #__z, __x, __half2float(__y), __half2float(__z))
// Dequantize a stored scale nibble: returns ((qs + 1) / 16)^2 * max_scale.
__forceinline__ __device__ half dq_scale_(const int qs, const half max_scale)
{
    half qs_h = __hmul(__int2half_rn(qs + 1), __float2half_rn(1.0f / 16.0f));
    qs_h = __hmul(qs_h, qs_h);
    qs_h = __hmul(qs_h, max_scale);
    return qs_h;
}

// Clamp x to the closed interval [a, b].
__forceinline__ __device__ float clamp(float x, float a, float b)
{
    return fmaxf(a, fminf(b, x));
}
// Error-checking wrapper for HIP runtime calls, e.g. cuda_check(hipDeviceSynchronize());
#define cuda_check(ans) { gpu_assert((ans), __FILE__, __LINE__); }
inline void gpu_assert(hipError_t code, const char *file, int line, bool abort=true)
{
    if (code != hipSuccess)
    {
        fprintf(stderr, "CUDA error: %s %s %d\n", hipGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
# Installation
On Linux and Windows, AutoGPTQ can be installed through pre-built wheels for specific PyTorch versions:
| AutoGPTQ version | CUDA/ROCm version | Installation | Built against PyTorch |
|------------------|-------------------|------------------------------------------------------------------------------------------------------------|-----------------------|
| latest (0.7.1) | CUDA 11.8 | `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` | 2.2.1+cu118 |
| latest (0.7.1) | CUDA 12.1 | `pip install auto-gptq` | 2.2.1+cu121 |
| latest (0.7.1) | ROCm 5.7 | `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm571/` | 2.2.1+rocm5.7 |
| 0.7.0 | CUDA 11.8 | `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` | 2.2.0+cu118 |
| 0.7.0 | CUDA 12.1 | `pip install auto-gptq` | 2.2.0+cu121 |
| 0.7.0 | ROCm 5.7 | `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm571/` | 2.2.0+rocm5.7 |
| 0.6.0 | CUDA 11.8 | `pip install auto-gptq==0.6.0 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` | 2.1.1+cu118 |
| 0.6.0 | CUDA 12.1 | `pip install auto-gptq==0.6.0` | 2.1.1+cu121 |
| 0.6.0 | ROCm 5.6 | `pip install auto-gptq==0.6.0 --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm561/` | 2.1.1+rocm5.6 |
| 0.5.1 | CUDA 11.8 | `pip install auto-gptq==0.5.1 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` | 2.1.0+cu118 |
| 0.5.1 | CUDA 12.1 | `pip install auto-gptq==0.5.1` | 2.1.0+cu121 |
| 0.5.1 | ROCm 5.6 | `pip install auto-gptq==0.5.1 --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm561/` | 2.1.0+rocm5.6 |
AutoGPTQ is not available on macOS.
## <center>News or Update</center>
- 2024-02-15 - (News) - AutoGPTQ 0.7.0 is released, with [Marlin](https://github.com/IST-DASLab/marlin) int4*fp16 matrix multiplication kernel support.
- 2023-08-23 - (News) - 🤗 Transformers, optimum and peft have integrated `auto-gptq`, so running and training GPTQ models is now more accessible to everyone! See [this blog](https://huggingface.co/blog/gptq-integration) and its resources for more details!
- 2023-08-21 - (News) - The Qwen team officially released a 4-bit quantized version of Qwen-7B based on `auto-gptq`, and provided [detailed benchmark results](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4#%E9%87%8F%E5%8C%96-quantization).
- 2023-08-06 - (Update) - Support exllama's q4 CUDA kernel for at least a 1.3x inference speedup on int4 quantized models.
- 2023-08-04 - (Update) - Support ROCm so that AMD GPU users can use auto-gptq with the CUDA extensions.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to train adapters on GPTQ quantized models, supporting LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading/uploading quantized models from/to the 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel` (falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`
- 2023-04-29 - (Update) - Support loading quantized models with arbitrary `quantize_config` and `model_basename`.
- 2023-04-28 - (Update) - Support CPU offload and quantization/inference on multiple devices; support `gpt2` type models.
- 2023-04-26 - (Update) - Using `triton` to speed up inference is now supported.
- 2023-04-25 - (News&Update) - [MOSS](https://github.com/OpenLMLab/MOSS) is an open-source tool-augmented conversational language model from Fudan University; its quantization is now supported in AutoGPTQ.
- 2023-04-23 - (Update) - Support evaluation on multiple downstream tasks such as language modeling, text classification and text summarization.
- 2023-04-22 - (News) - qwopqwop200's [AutoGPTQ-triton](https://github.com/qwopqwop200/AutoGPTQ-triton) provides faster inference for quantized models; everyone with access to triton can give it a try!
- 2023-04-20 - (News) - AutoGPTQ is automatically compatible with Stability-AI's newly released `gpt_neox` type model family [StableLM](https://github.com/Stability-AI/StableLM).
- 2023-04-16 - (Update) - Support quantization and inference for `bloom`, `gpt_neox`, `gptj`, `llama` and `opt`.
# Quick Start
Welcome to the AutoGPTQ tutorial. In this chapter, you will learn how to quickly install `auto-gptq` from PyPI and the basic usage of the library.
## Quick Installation
Starting from v0.0.4, one can install `auto-gptq` directly from PyPI using `pip`:
```shell
pip install auto-gptq
```
AutoGPTQ supports using `triton` to speed up inference, but this currently **only supports Linux**. To integrate with triton, use:
```shell
pip install auto-gptq[triton]
```
If you want to try the newly supported `llama` type models in 🤗 Transformers without updating it to the latest version, use:
```shell
pip install auto-gptq[llama]
```
By default, the CUDA extension will be built at installation time if CUDA and PyTorch are already installed.
To disable building the CUDA extension, use the following commands:
For Linux:
```shell
BUILD_CUDA_EXT=0 pip install auto-gptq
```
For Windows:
```shell
set BUILD_CUDA_EXT=0 && pip install auto-gptq
```
## Basic Usage
*The full script of basic usage demonstrated here is `examples/quantization/basic_usage.py`*
The two main classes currently used in AutoGPTQ are `AutoGPTQForCausalLM` and `BaseQuantizeConfig`.
```python
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
```
### Quantize a pretrained model
To quantize a model, you need to load the pretrained model and tokenizer first, for example:
```python
from transformers import AutoTokenizer
pretrained_model_name = "facebook/opt-125m"
quantize_config = BaseQuantizeConfig(bits=4, group_size=128)
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_name, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
```
This will download `opt-125m` from the 🤗 Hub, cache it to local disk, and then load it into **CPU memory**.
*In a later tutorial, you will learn advanced model loading strategies such as CPU offload and loading a model onto multiple devices.*
Then, prepare examples (a list of dicts with only two keys, 'input_ids' and 'attention_mask') to guide quantization. Here we use only one text to keep the code simple, but note that the more examples you use, the better (most likely) the quantized model will be.
```python
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
```
Once the examples are prepared, we can start quantizing the pretrained model.
```python
model.quantize(examples)
```
Finally, we can save the quantized model:
```python
quantized_model_dir = "opt-125m-4bit-128g"
model.save_quantized(quantized_model_dir)
```
By default, the model is saved as a `.bin` file; you can also set `use_safetensors=True` to save a `.safetensors` model file. The base name of the model file saved by this method follows the format `gptq_model-{bits}bit-{group_size}g`.
The pretrained model's config and the quantize config will also be saved, as `config.json` and `quantize_config.json` respectively.
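For example, reusing the objects from the snippets above, the following call writes the model in safetensors format:
```python
# saves gptq_model-4bit-128g.safetensors (instead of the default .bin),
# alongside config.json and quantize_config.json
model.save_quantized(quantized_model_dir, use_safetensors=True)
```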
### Load quantized model and do inference
Instead of `.from_pretrained`, you should use `.from_quantized` to load a quantized model.
```python
device = "cuda:0"
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device=device)
```
This will first read and load `quantize_config.json` from the `opt-125m-4bit-128g` directory, then, based on the values of `bits` and `group_size` in it, load the `gptq_model-4bit-128g.bin` model file onto the first visible GPU.
Then you can initialize 🤗 Transformers' `TextGenerationPipeline` and do inference.
```python
from transformers import TextGenerationPipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=device)
print(pipeline("auto-gptq is")[0]["generated_text"])
```
## Conclusion
Congrats! You have learned how to quickly install `auto-gptq` and get started with it. In the next chapter, you will learn advanced loading strategies for pretrained or quantized models and some best practices for different situations.
# Advanced Model Loading and Best Practice
Welcome to the AutoGPTQ tutorial. In this chapter, you will learn advanced model loading and best practices in `auto-gptq`.
## Arguments Introduction
In the previous chapter, you learned how to load a model onto the CPU or a single GPU with the two basic APIs:
- `.from_pretrained`: by default, loads the whole pretrained model into CPU memory.
- `.from_quantized`: by default, `auto_gptq` automatically finds a suitable way to load the quantized model:
  - if there is only a single GPU and the model fits into it, the whole model is loaded onto that GPU;
  - if there are multiple GPUs and the model fits across them, the model is split evenly and loaded onto those GPUs;
  - if the model can't fit into the GPU(s), CPU offloading is used.
However, the default settings above may not meet the demands of users who want more control over model loading.
Luckily, AutoGPTQ provides some advanced arguments that you can tweak to manually configure the model loading strategy:
- `low_cpu_mem_usage`: a `bool` argument, defaults to `False`; can be used in both `.from_pretrained` and `.from_quantized`. Enable it when CPU memory is limited (by default the model is initialized on CPU) or when you want to load the model faster.
- `max_memory`: an optional `Dict[Union[int, str], str]` argument mapping devices to memory budgets; can be used in both `.from_pretrained` and `.from_quantized`.
- `device_map`: an optional `Union[str, Dict[str, Union[int, str]]]` argument; currently only supported in `.from_quantized`.
Before `auto-gptq` existed, many users had already quantized their models with other popular tools such as [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa) and saved them under different names, without the `quantize_config.json` file introduced in the previous chapter.
To address this, two more arguments were introduced in `.from_quantized` so that quantized models with arbitrary names can be loaded (see the sketch after this list):
- `quantize_config`: an optional `BaseQuantizeConfig` argument, used to match the model file and initialize the model in case `quantize_config.json` is not in the directory where the model is saved.
- `model_basename`: an optional `str` argument; if specified, it is used to match the model file instead of the file name format introduced in the previous chapter.
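A minimal sketch of loading such a model is shown below; the directory and file base name are hypothetical, and the quantize config must match how the model was originally quantized:
```python
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# hypothetical values: replace with the actual directory and file base name
quantize_config = BaseQuantizeConfig(bits=4, group_size=128)
model = AutoGPTQForCausalLM.from_quantized(
    "path/to/gptq-for-llama-model",      # directory without quantize_config.json
    quantize_config=quantize_config,     # supplies the missing quantize config
    model_basename="llama7b-4bit-128g",  # model file base name (without extension)
    device="cuda:0",
)
```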
## Multiple Devices Model Loading
### max_memory
With this argument, you can specify the maximum amount of memory to use on the CPU and on each GPU.
That means, by specifying the maximum CPU memory used at model loading, you can keep some model weights on the CPU, move them onto the GPU only when they are required, and move them back to the CPU afterwards. This is called "CPU offload", a very useful strategy when there would be no room left for quantization or inference if you kept the whole model on the GPU(s).
If you have multiple GPUs, you can also specify, for each of them separately, the maximum memory used to load the model; this way, quantization and inference will be executed across devices.
For a better understanding, below are some examples.
```python
max_memory = {0: "20GIB"}
```
In this case, only the first GPU (even if you have more) will be used to load the model, and an error will be raised if the model requires more than 20GB of memory.
```python
max_memory = {0: "20GIB", 1: "20GIB"}
```
In this case, you can load a model smaller than 40GB onto two GPUs, and the model will be split evenly.
```python
max_memory = {0: "10GIB", 1: "30GIB"}
```
In this case, you can also load a model smaller than 40GB onto two GPUs, but the first GPU will use at most 10GB; any weights beyond the first 10GB will be loaded onto the second GPU.
```python
max_memory = {0: "20GIB", "cpu": "20GIB"}
```
In this case, you can also load a model smaller than 40GB, but up to 20GB of weights will be kept in CPU memory and only moved onto the GPU when needed.
### device_map
So far, only `.from_quantized` supports this argument.
You can provide a string to this argument to use a pre-set model loading strategy. Current valid values are `["auto", "balanced", "balanced_low_0", "sequential"]`.
In the simplest case, you can set `device_map='auto'` and let 🤗 Accelerate handle the device map computation, as shown below. For more details on this argument, you can refer to [this document](https://huggingface.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
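For example, a minimal sketch that reuses the quantized model directory from the Quick Start chapter:
```python
from auto_gptq import AutoGPTQForCausalLM

# let 🤗 Accelerate compute the placement across the available GPUs (and CPU if needed)
model = AutoGPTQForCausalLM.from_quantized("opt-125m-4bit-128g", device_map="auto")
```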
## Best Practice
### At Quantization
It is always recommended to first consider loading the whole model onto the GPU(s), as this saves the time spent transferring module weights between CPU and GPU.
However, not everyone has large GPU memory. Roughly speaking: always specify the maximum CPU memory used to load the model; then, for each GPU, reserve enough memory to hold 1~2 model layers (2~3 for the first GPU in case CPU offload is used) for the examples' tensors and the calculations done during quantization, and use all the remaining memory for model weights. With this rule, all you need is some simple math based on the number of GPUs you have, the size of the model weight file(s) and the number of model layers; a sketch of that math is shown below.
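The sketch below illustrates the budgeting; every number in it (GPU count, per-GPU capacity, model size, layer count) is a hypothetical placeholder to replace with your own values:
```python
# rough memory budgeting for quantization, following the rule of thumb above
num_gpus = 2                 # hypothetical
gpu_capacity_gib = 24        # total memory per GPU (hypothetical)
model_size_gib = 26          # size of the model weight file(s) (hypothetical)
num_layers = 40              # number of model layers (hypothetical)

layer_gib = model_size_gib / num_layers
max_memory = {}
for gpu_id in range(num_gpus):
    # reserve room for ~3 layers on the first GPU (in case CPU offload is used), ~2 on the others
    reserved = 3 * layer_gib if gpu_id == 0 else 2 * layer_gib
    max_memory[gpu_id] = f"{int(gpu_capacity_gib - reserved)}GIB"
max_memory["cpu"] = "32GIB"  # cap for weights kept in CPU memory (hypothetical)
print(max_memory)            # e.g. {0: '22GIB', 1: '22GIB', 'cpu': '32GIB'}
```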
### At Inference
For inference, follow this principle: always use a single GPU if you can, otherwise use multiple GPUs; CPU offload should be the last resort.
## Conclusion
Congrats! You have learned the advanced strategies for loading models with `.from_pretrained` and `.from_quantized` in `auto-gptq`, along with some best-practice advice. In the next chapter, you will learn how to quickly customize an AutoGPTQ model and use it for quantization and inference.
# Examples
To run the example scripts in this folder, you must first install `auto_gptq` as described in [this README](../README.md).
## Quantization
> Commands in this chapter should be run under `quantization` folder.
### Basic Usage
To execute `basic_usage.py`, use a command like this:
```shell
python basic_usage.py
```
This script also showcases how to download/upload a quantized model from/to the 🤗 Hub; to enable these features, uncomment the commented-out code.
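For reference, the push-to-Hub part of the script (commented out by default) looks roughly like this, where `YourUserName` is a placeholder:
```python
# requires `huggingface-cli login` first, or pass an explicit token via use_auth_token="hf_xxxxxxx"
repo_id = f"YourUserName/{quantized_model_dir}"
commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
```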
To execute `basic_usage_wikitext2.py`, use a command like this:
```shell
python basic_usage_wikitext2.py
```
> Note: There is about a 0.6 ppl degradation on the opt-125m model using AutoGPTQ, compared to GPTQ-for-LLaMa.
### Quantize with Alpaca
To execute `quant_with_alpaca.py`, use a command like this:
```shell
python quant_with_alpaca.py --pretrained_model_dir "facebook/opt-125m" --per_gpu_max_memory 4 --quant_batch_size 16
```
Use `--help` flag to see detailed descriptions for more command arguments.
The alpaca dataset used here is a cleaned version provided by **gururise** in [AlpacaDataCleaned](https://github.com/gururise/AlpacaDataCleaned).
## Evaluation
> Commands in this chapter should be run under `evaluation` folder.
### Language Modeling Task
The `run_language_modeling_task.py` script gives an example of using `LanguageModelingTask` to evaluate a model's performance on a language modeling task before and after quantization, using the `tatsu-lab/alpaca` dataset.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python run_language_modeling_task.py --base_model_dir PATH/TO/BASE/MODEL/DIR --quantized_model_dir PATH/TO/QUANTIZED/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### Sequence Classification Task
The `run_sequence_classification_task.py` script gives an example of using `SequenceClassificationTask` to evaluate a model's performance on a sequence classification task before and after quantization, using the `cardiffnlp/tweet_sentiment_multilingual` dataset.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python run_sequence_classification_task.py --base_model_dir PATH/TO/BASE/MODEL/DIR --quantized_model_dir PATH/TO/QUANTIZED/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### Text Summarization Task
The `run_text_summarization_task.py` script gives an example of using `TextSummarizationTask` to evaluate a model's performance on a text summarization task before and after quantization, using the `samsum` dataset.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python run_text_summarization_task.py --base_model_dir PATH/TO/BASE/MODEL/DIR --quantized_model_dir PATH/TO/QUANTIZED/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
## Benchmark
> Commands in this chapter should be run under `benchmark` folder.
### Generation Speed
The `generation_speed.py` script gives an example of how to benchmark the generation speed of pretrained and quantized models supported by `auto_gptq`; it reports generation speed in tokens/s.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
## PEFT
> Commands in this chapter should be run under `peft` folder.
### Lora
The `peft_lora_clm_instruction_tuning.py` script gives an example of instruction-tuning a LoRA adapter on a GPTQ quantized model, using the tools in `auto_gptq.utils.peft_utils` and 🤗 peft, on the alpaca dataset.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_lora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### AdaLora
The `peft_adalora_clm_instruction_tuning.py` script gives an example of instruction-tuning an AdaLoRA adapter on a GPTQ quantized model, using the tools in `auto_gptq.utils.peft_utils` and 🤗 peft, on the alpaca dataset.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_adalora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### AdaptionPrompt
The `peft_adaption_prompt_clm_instruction_tuning.py` script gives an example of instruction-tuning an adaption_prompt adapter (llama-adapter) on a GPTQ quantized model, using the tools in `auto_gptq.utils.peft_utils` and 🤗 peft, on the alpaca dataset.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_adaption_prompt_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
If you want to try models other than llama, you can install peft from source using [this branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt); see [here](https://github.com/PanQiWei/peft/blob/a5f8f74f07591efe5eb3d08cb1b31b981e84a069/src/peft/tuners/adaption_prompt.py#L235) to check which other models are also supported. With this branch installed, you can also use the `ADAPTION_PROMPT_V2` peft type (llama-adapter-v2) by simply replacing `AdaptionPromptConfig` with `AdaptionPromptV2Config` in the script, as sketched below.
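A minimal sketch of that swap in `peft_adaption_prompt_clm_instruction_tuning.py`, assuming `AdaptionPromptV2Config` accepts the same arguments as the `AdaptionPromptConfig` used in the script (an assumption about that branch):
```python
from peft import AdaptionPromptV2Config, TaskType  # AdaptionPromptV2Config only exists on the branch above

# assumed to mirror the AdaptionPromptConfig arguments used in the script
peft_config = AdaptionPromptV2Config(
    adapter_len=10,
    adapter_layers=30,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
```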
import json
import logging
import random
import time
from argparse import ArgumentParser
from itertools import chain
from typing import Dict, List, Optional
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
logger = logging.getLogger(__name__)
random.seed(0)
class CustomizedMinNewTokensLogitsProcessor(LogitsProcessor):
def __init__(
self,
min_new_tokens: int = None,
eos_token_id: int = None,
):
self.eos_token_id = eos_token_id
self.min_new_tokens = min_new_tokens or 0
self.current_step = 0
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
self.current_step += 1
if self._skip_process():
return scores
if any(each is not None for each in [self.eos_token_id]):
banned_mask = torch.zeros_like(scores).to(scores.device)
if self.eos_token_id and self.current_step <= self.min_new_tokens:
banned_mask = self._fill_banned_mask(input_ids, banned_mask, {1: [[self.eos_token_id]]})
scores = scores.masked_fill(banned_mask.bool(), -float("inf"))
return scores
def _skip_process(self):
if self.current_step > self.min_new_tokens:
return True
return False
@staticmethod
def _fill_banned_mask(
input_ids: torch.LongTensor,
banned_mask: torch.Tensor,
len2words_ids: Dict[int, List[List[int]]],
):
for token_len, token_ids in len2words_ids.items():
if token_len == 1:
banned_mask[..., list(chain(*token_ids))] = 1
elif input_ids.shape[-1] < token_len - 1:
continue
else:
token_ids = torch.LongTensor(token_ids).to(input_ids.device)
hit_masks = torch.all(
token_ids[..., :-1].unsqueeze(0).repeat(input_ids.shape[0], 1, 1)
== input_ids[..., -(token_ids.shape[-1] - 1) :].unsqueeze(1),
dim=-1,
)
for idx in range(hit_masks.shape[0]):
selected_token_ids = torch.masked_select(token_ids[..., -1], hit_masks[idx])
if len(selected_token_ids):
banned_mask[idx, selected_token_ids] = 1
return banned_mask
def load_data(data_path, tokenizer, n_samples, max_new_tokens):
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))
def dummy_gen():
return raw_data
def tokenize(examples):
instructions = examples["instruction"]
inputs = examples["input"]
outputs = examples["output"]
prompts = []
texts = []
input_ids = []
attention_mask = []
for istr, inp, opt in zip(instructions, inputs, outputs):
if inp:
prompt = f"Instruction:\n{istr}\nInput:\n{inp}\nOutput:\n"
text = prompt + opt
else:
prompt = f"Instruction:\n{istr}\nOutput:\n"
text = prompt + opt
if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length - max_new_tokens:
continue
tokenized_data = tokenizer(text)
input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
prompts.append(prompt)
texts.append(text)
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"prompt": prompts,
}
dataset = Dataset.from_generator(dummy_gen)
dataset = dataset.map(
tokenize,
batched=True,
batch_size=len(dataset),
num_proc=1,
keep_in_memory=True,
load_from_cache_file=False,
remove_columns=["instruction", "input"],
)
dataset = dataset.to_list()
for sample in dataset:
sample["input_ids"] = torch.LongTensor(sample["input_ids"])
sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])
return dataset
def load_model_tokenizer(
model_name_or_path: str,
tokenizer_name_or_path: Optional[str] = None,
from_pretrained: bool = False,
max_memory: Optional[dict] = None,
model_basename: Optional[str] = None,
quantize_config: Optional[str] = None,
trust_remote_code: bool = False,
use_triton: bool = False,
use_safetensors: bool = True,
use_fast_tokenizer: bool = False,
inject_fused_attention: bool = True,
inject_fused_mlp: bool = True,
disable_exllama: bool = False,
):
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=tokenizer_name_or_path or model_name_or_path,
use_fast=use_fast_tokenizer,
trust_remote_code=trust_remote_code,
)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
if from_pretrained:
model = AutoGPTQForCausalLM.from_pretrained(
pretrained_model_name_or_path=model_name_or_path,
quantize_config=BaseQuantizeConfig(),
max_memory=max_memory,
trust_remote_code=trust_remote_code,
)
else:
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
max_memory=max_memory,
low_cpu_mem_usage=True,
use_triton=use_triton,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
use_cuda_fp16=True,
quantize_config=quantize_config,
model_basename=model_basename,
use_safetensors=use_safetensors,
trust_remote_code=trust_remote_code,
warmup_triton=False,
disable_exllama=disable_exllama,
)
return model, tokenizer
def benchmark_generation_speed(model, tokenizer, examples, generation_config):
generation_time_list = []
num_generated_tokens_list = []
progress_bar = tqdm(examples)
for example in progress_bar:
input_ids = example["input_ids"].to(model.device)
start = time.time()
outputs_ids = model.generate(
input_ids=input_ids.unsqueeze(0),
generation_config=generation_config,
logits_processor=[
CustomizedMinNewTokensLogitsProcessor(generation_config.max_new_tokens, tokenizer.eos_token_id)
],
)
end = time.time()
generation_time_list.append(end - start)
num_generated_tokens = 0
for output_ids in outputs_ids:
num_generated_tokens += len(
[token_id for token_id in output_ids[len(input_ids) :] if token_id != tokenizer.pad_token_id]
)
num_generated_tokens_list.append(num_generated_tokens)
progress_bar.set_postfix(
num_tokens=num_generated_tokens_list[-1],
time=generation_time_list[-1],
speed=f"{num_generated_tokens_list[-1] / generation_time_list[-1]:.4f}tokens/s",
)
total_tokens = sum(num_generated_tokens_list)
total_seconds = sum(generation_time_list)
logger.info(
f"generated {total_tokens} tokens using {total_seconds} seconds, "
f"generation speed: {total_tokens / total_seconds}tokens/s"
)
def main():
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--from_pretrained", action="store_true")
parser.add_argument("--model_basename", type=str, default=None)
parser.add_argument("--quantize_config_save_dir", type=str, default=None)
parser.add_argument("--trust_remote_code", action="store_true")
parser.add_argument("--use_triton", action="store_true")
parser.add_argument("--use_safetensors", action="store_true")
parser.add_argument("--use_fast_tokenizer", action="store_true")
parser.add_argument("--disable_exllama", action="store_true")
parser.add_argument("--no_inject_fused_attention", action="store_true")
parser.add_argument("--no_inject_fused_mlp", action="store_true")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--per_gpu_max_memory", type=int, default=None)
parser.add_argument("--cpu_max_memory", type=int, default=None)
parser.add_argument("--max_new_tokens", type=int, default=512)
parser.add_argument("--do_sample", action="store_true")
parser.add_argument("--num_beams", type=int, default=1)
args = parser.parse_args()
max_memory = {}
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())})
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
logger.info(f"max_memory: {max_memory}")
quantize_config = None
if args.quantize_config_save_dir:
quantize_config = BaseQuantizeConfig.from_pretrained(args.quantize_config_save_dir)
if args.use_safetensors:
logger.warning(
"The command --use_safetensors is deprecated and will be removed in the next release. It is now by default activated."
)
logger.info("loading model and tokenizer")
start = time.time()
model, tokenizer = load_model_tokenizer(
model_name_or_path=args.model_name_or_path,
tokenizer_name_or_path=args.tokenizer_name_or_path,
from_pretrained=args.from_pretrained,
max_memory=max_memory,
model_basename=args.model_basename,
quantize_config=quantize_config,
trust_remote_code=args.trust_remote_code,
use_triton=args.use_triton,
use_safetensors=True,
use_fast_tokenizer=args.use_fast_tokenizer,
inject_fused_attention=not args.no_inject_fused_attention,
inject_fused_mlp=not args.no_inject_fused_mlp,
disable_exllama=args.disable_exllama,
)
end = time.time()
logger.info(f"model and tokenizer loading time: {end - start:.4f}s")
logger.info(f"model quantized: {model.quantized}")
logger.info(f"quantize config: {model.quantize_config.to_dict()}")
logger.info(f"model device map: {model.hf_device_map}")
if args.use_triton:
logger.info("warmup triton, this may take a while.")
model.warmup_triton()
logger.info("loading data")
examples = load_data(
"../quantization/dataset/alpaca_data_cleaned.json",
tokenizer,
args.num_samples,
args.max_new_tokens,
)
generation_config = GenerationConfig(
num_beams=args.num_beams,
num_return_sequences=args.num_beams,
do_sample=args.do_sample,
min_new_tokens=args.max_new_tokens,
max_new_tokens=args.max_new_tokens,
pad_token_id=tokenizer.pad_token_id,
)
logger.info(f"generation config: {generation_config.to_dict()}")
logger.info("benchmark generation speed")
benchmark_generation_speed(model, tokenizer, examples, generation_config)
if __name__ == "__main__":
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()
import argparse
import os
import torch
from transformers import AutoTokenizer
from auto_gptq.utils import Perplexity
if __name__ == "__main__":
"""
Example usage.
Default usage with GPT2 model:
python examples/benchmark/perplexity.py
Specify GPTQ quantized model:
python examples/benchmark/perplexity.py \
--model_name TheBloke/open-llama-7b-open-instruct-GPTQ \
--model_basename gptq_model-4bit-128g \
--is_quantized
Change your dataset:
python examples/benchmark/perplexity.py --dataset_path tiny_shakespeare
"""
parser = argparse.ArgumentParser(description="Calculate Perplexity for a model.")
parser.add_argument("--model_name", type=str, default="gpt2", help="Model name.")
parser.add_argument("--model_basename", type=str, default=None, help="Model file's basename.")
parser.add_argument("--n_ctx", type=int, default=512, help="Context size.")
parser.add_argument("--n_batch", type=int, default=512, help="Batch size.")
parser.add_argument("--dataset_path", type=str, default="wikitext", help="Path to the dataset.")
parser.add_argument("--dataset_name", type=str, default=None, help="Name of the dataset.")
parser.add_argument("--split", type=str, default="test", help="Dataset split to use.")
parser.add_argument(
"--text_column",
type=str,
default="text",
help="Column in the dataset containing the text.",
)
parser.add_argument(
"--per_gpu_max_memory",
type=int,
default=None,
help="Max memory used in each GPU.",
)
parser.add_argument("--cpu_max_memory", type=int, default=None, help="Mx memory used in CPU.")
parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
parser.add_argument(
"--use_safetensors",
action="store_true",
help="Whether to use safetensors model file",
)
parser.add_argument("--use_fast_tokenizer", action="store_true", help="Wheter to use fast tokenizer")
parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
parser.add_argument(
"--disable_exllama",
action="store_true",
help="Whether to use disable exllama kernel",
)
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
max_memory = {}
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())})
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
if args.use_safetensors:
print(
"The argument --use_safetensors is deprecrated and will be removed in the next release. It is now the default behavior."
)
if args.is_quantized:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
args.model_name,
low_cpu_mem_usage=True,
device_map="auto",
max_memory=max_memory,
model_basename=args.model_basename,
use_safetensors=True,
trust_remote_code=args.trust_remote_code,
inject_fused_mlp=False,
inject_fused_attention=False,
disable_exllama=args.disable_exllama,
)
else:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
args.model_name,
low_cpu_mem_usage=True,
device_map="auto",
max_memory=max_memory,
torch_dtype=torch.float16,
trust_remote_code=args.trust_remote_code,
)
ppl = Perplexity(
model,
tokenizer,
args.dataset_path,
args.dataset_name,
args.split,
args.text_column,
)
ppl.calculate_perplexity(args.n_ctx, args.n_batch)
from argparse import ArgumentParser
import datasets
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import LanguageModelingTask
DATASET = "tatsu-lab/alpaca"
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
WITHOUT_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nOutput:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
def main():
parser = ArgumentParser()
parser.add_argument("--base_model_dir", type=str)
parser.add_argument("--quantized_model_dir", type=str)
parser.add_argument(
"--num_samples",
type=int,
default=100,
help="how many samples will be sampled to evaluation",
)
parser.add_argument("--sample_max_len", type=int, default=1024, help="max tokens for each sample")
parser.add_argument("--block_max_len", type=int, default=2048, help="max tokens for each data block")
parser.add_argument("--use_triton", action="store_true")
args = parser.parse_args()
tokenizer = AutoTokenizer.from_pretrained(args.base_model_dir)
model = AutoGPTQForCausalLM.from_pretrained(args.base_model_dir, BaseQuantizeConfig())
model.to("cuda:0")
task = LanguageModelingTask(
model=model,
tokenizer=tokenizer,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="output",
**{
"num_samples": args.num_samples, # how many samples will be sampled to evaluation
"sample_max_len": args.sample_max_len, # max tokens for each sample
"block_max_len": args.block_max_len, # max tokens for each data block
"load_fn": datasets.load_dataset, # function to load dataset
"preprocess_fn": ds_refactor_fn, # function to preprocess dataset
"truncate_prompt": False, # truncate label when sample's length exceed sample_max_len
},
)
print(f"eval result for base model: {task.run()}")
task.model = None
model.cpu()
del model
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(args.quantized_model_dir, device="cuda:0", use_triton=args.use_triton)
task.model = model
task.device = model.device
print(f"eval result for quantized model: {task.run()}")
if __name__ == "__main__":
main()
from argparse import ArgumentParser
from functools import partial
import datasets
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import SequenceClassificationTask
DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}
LABELS = list(ID2LABEL.values())
def ds_refactor_fn(samples):
text_data = samples["text"]
label_data = samples["label"]
new_samples = {"prompt": [], "label": []}
for text, label in zip(text_data, label_data):
prompt = TEMPLATE.format(labels=LABELS, text=text)
new_samples["prompt"].append(prompt)
new_samples["label"].append(ID2LABEL[label])
return new_samples
def main():
parser = ArgumentParser()
parser.add_argument("--base_model_dir", type=str)
parser.add_argument("--quantized_model_dir", type=str)
parser.add_argument(
"--num_samples",
type=int,
default=100,
help="how many samples will be sampled to evaluation",
)
parser.add_argument("--sample_max_len", type=int, default=1024, help="max tokens for each sample")
parser.add_argument("--block_max_len", type=int, default=2048, help="max tokens for each data block")
parser.add_argument("--use_triton", action="store_true")
args = parser.parse_args()
tokenizer = AutoTokenizer.from_pretrained(args.base_model_dir)
model = AutoGPTQForCausalLM.from_pretrained(args.base_model_dir, BaseQuantizeConfig())
model.to("cuda:0")
task = SequenceClassificationTask(
model=model,
tokenizer=tokenizer,
classes=LABELS,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="label",
**{
"num_samples": args.num_samples, # how many samples will be sampled to evaluation
"sample_max_len": args.sample_max_len, # max tokens for each sample
"block_max_len": args.block_max_len, # max tokens for each data block
"load_fn": partial(datasets.load_dataset, name="english"), # function to load dataset
"preprocess_fn": ds_refactor_fn, # function to preprocess dataset
"truncate_prompt": False, # truncate label when sample's length exceed sample_max_len
},
)
print(f"eval result for base model: {task.run()}")
task.model = None
model.cpu()
del model
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(args.quantized_model_dir, device="cuda:0", use_triton=args.use_triton)
task.model = model
task.device = model.device
print(f"eval result for quantized model: {task.run()}")
if __name__ == "__main__":
main()
import os
from argparse import ArgumentParser
import datasets
import torch
from transformers import AutoTokenizer, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import TextSummarizationTask
os.system("pip install py7zr")
DATASET = "samsum"
TEMPLATE = "Instruction: Summarize the conversation into one sentence.\n\nInput:\n{diag}\n\nOutput:\n"
def ds_refactor_fn(samples):
dialogues = samples["dialogue"]
new_samples = {"prompt": [], "summary": samples["summary"]}
for diag in dialogues:
prompt = TEMPLATE.format(diag=diag)
new_samples["prompt"].append(prompt)
return new_samples
def main():
parser = ArgumentParser()
parser.add_argument("--base_model_dir", type=str)
parser.add_argument("--quantized_model_dir", type=str)
parser.add_argument(
"--num_samples",
type=int,
default=100,
help="how many samples will be sampled to evaluation",
)
parser.add_argument("--sample_max_len", type=int, default=1024, help="max tokens for each sample")
parser.add_argument("--block_max_len", type=int, default=2048, help="max tokens for each data block")
parser.add_argument("--use_triton", action="store_true")
args = parser.parse_args()
tokenizer = AutoTokenizer.from_pretrained(args.base_model_dir)
model = AutoGPTQForCausalLM.from_pretrained(args.base_model_dir, BaseQuantizeConfig())
model.to("cuda:0")
task = TextSummarizationTask(
model=model,
tokenizer=tokenizer,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="summary",
**{
"num_samples": args.num_samples, # how many samples will be sampled to evaluation
"sample_max_len": args.sample_max_len, # max tokens for each sample
"block_max_len": args.block_max_len, # max tokens for each data block
"load_fn": datasets.load_dataset, # function to load dataset
"preprocess_fn": ds_refactor_fn, # function to preprocess dataset
"truncate_prompt": False, # truncate label when sample's length exceed sample_max_len
},
)
print(f"eval result for base model: {task.run(generation_config=GenerationConfig(max_new_tokens=32))}")
task.model = None
model.cpu()
del model
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(args.quantized_model_dir, device="cuda:0", use_triton=args.use_triton)
task.model = model
task.device = model.device
print(f"eval result for quantized model: {task.run(generation_config=GenerationConfig(max_new_tokens=32))}")
if __name__ == "__main__":
main()
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from peft import TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import collate_data, make_data_block
from auto_gptq.utils.peft_utils import GPTQAdaLoraConfig
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument(
"--block_max_length",
type=int,
default=1024,
help="max length of data block(bunch of samples)",
)
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = GPTQAdaLoraConfig(
init_r=20,
target_r=16,
beta1=0.85,
beta2=0.85,
tinit=200,
tfinal=1000,
deltaT=10,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False,
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True,
},
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
model.base_model.peft_config["default"].total_step = len(train_dataloader) * num_epochs
# training and evaluation
with torch.cuda.amp.autocast():
global_step = 0
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
# Update the importance of low-rank matrices
# and allocate the budget accordingly.
model.base_model.update_and_allocate(global_step)
optimizer.zero_grad()
global_step += 1
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(
torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
skip_special_tokens=True,
)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from peft import AdaptionPromptConfig, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import collate_data, make_data_block
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--adapter_len", type=int, default=10)
parser.add_argument("--adapter_layers", type=int, default=30)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument(
"--block_max_length",
type=int,
default=1024,
help="max length of data block(bunch of samples)",
)
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = AdaptionPromptConfig(
adapter_len=args.adapter_len,
adapter_layers=args.adapter_layers,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=False,
inject_fused_mlp=False,
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True,
},
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# training and evaluation
with torch.cuda.amp.autocast():
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(
torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
skip_special_tokens=True,
)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from peft import TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import collate_data, make_data_block
from auto_gptq.utils.peft_utils import GPTQLoraConfig
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-5)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument(
"--block_max_length",
type=int,
default=1024,
help="max length of data block(bunch of samples)",
)
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = GPTQLoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False,
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True,
},
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# training and evaluation
with torch.cuda.amp.autocast():
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(
torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
skip_special_tokens=True,
)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
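# perplexity is exp of the average cross-entropy loss over the split, so lower
# train_ppl / eval_ppl indicates a better fit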
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit-128g"
def main():
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
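# for illustration only: each entry behaves like a dict of token ids plus a matching mask,
# e.g. {"input_ids": [2, 102, 18, ...], "attention_mask": [1, 1, 1, ...]} (values are hypothetical);
# more (and more diverse) calibration examples generally improve quantization quality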
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting this to False can significantly speed up inference, but perplexity may be slightly worse
)
# load the un-quantized model; by default it is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose only keys are "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model
model.save_quantized(quantized_model_dir)
# push the quantized model to the Hugging Face Hub.
# to use use_auth_token=True, log in first via `huggingface-cli login`,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
# download quantized model from Hugging Face Hub and load to the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
# or you can also use a pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])
if __name__ == "__main__":
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()
import random
import numpy as np
import torch
from datasets import load_dataset
from transformers import TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
pretrained_model_dir = "gpt2-xl"
quantized_model_dir = "gpt2-large-4bit-128g"
# os.makedirs(quantized_model_dir, exist_ok=True)
def get_wikitext2(nsamples, seed, seqlen, tokenizer):
# set seed
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
# load dataset and preprocess
traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
traindataset = []
for _ in range(nsamples):
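# each calibration sample is a random contiguous window of `seqlen` tokens
# taken from the concatenated wikitext-2 training text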
i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
j = i + seqlen
inp = trainenc.input_ids[:, i:j]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
return traindataset, testenc
def main():
from transformers import AutoTokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
except Exception:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
# load the un-quantized model; it is always force-loaded into CPU memory
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # desc_act together with group_size only works with triton
)
# get model maximum sequence length
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any(k in model_config for k in seq_len_keys):
for key in seq_len_keys:
if key in model_config:
model.seqlen = model_config[key]
break
else:
print("can't get model's sequence length from model config, will set to 2048.")
model.seqlen = 2048
# load the calibration dataset used for quantization
traindataset, testenc = get_wikitext2(128, 0, model.seqlen, tokenizer)
# quantize the model; the examples should be a list of dicts whose keys include "input_ids" and "attention_mask"
# with values of type torch.LongTensor.
model.quantize(traindataset, use_triton=False)
# save quantized model
model.save_quantized(quantized_model_dir)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# load the quantized model; currently only CPU or a single GPU is supported
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)
# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("test is", return_tensors="pt").to("cuda:0"))[0]))
# or you can also use a pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, device="cuda:0")
print(pipeline("test is")[0]["generated_text"])
if __name__ == "__main__":
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()
import numpy as np
import torch
import torch.nn as nn
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit-128g"
# os.makedirs(quantized_model_dir, exist_ok=True)
def get_wikitext2(nsamples, seed, seqlen, model):
from datasets import load_dataset
traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
from transformers import AutoTokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
except Exception:
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
import random
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
traindataset = []
for _ in range(nsamples):
i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
j = i + seqlen
inp = trainenc.input_ids[:, i:j]
attention_mask = torch.ones_like(inp)
traindataset.append({"input_ids": inp, "attention_mask": attention_mask})
return traindataset, testenc
@torch.no_grad()
def opt_eval(model, testenc, dev, seqlen=2048):
print("Evaluating ...")
testenc = testenc.input_ids
nsamples = testenc.numel() // seqlen
use_cache = model.config.use_cache
model.config.use_cache = False
layers = model.model.decoder.layers
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
layers[0] = layers[0].to(dev)
dtype = next(iter(model.parameters())).dtype
inps = torch.zeros((nsamples, seqlen, model.config.hidden_size), dtype=dtype, device=dev)
cache = {"i": 0, "attention_mask": None}
class Catcher(nn.Module):
def __init__(self, module):
super().__init__()
self.module = module
def forward(self, inp, **kwargs):
inps[cache["i"]] = inp
cache["i"] += 1
cache["attention_mask"] = kwargs["attention_mask"]
raise ValueError
layers[0] = Catcher(layers[0])
for i in range(nsamples):
batch = testenc[:, (i * seqlen) : ((i + 1) * seqlen)].to(dev)
try:
model(batch)
except ValueError:
pass
layers[0] = layers[0].module
layers[0] = layers[0].cpu()
model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
if hasattr(model.model.decoder, "project_out") and model.model.decoder.project_out:
model.model.decoder.project_out = model.model.decoder.project_out.cpu()
if hasattr(model.model.decoder, "project_in") and model.model.decoder.project_in:
model.model.decoder.project_in = model.model.decoder.project_in.cpu()
torch.cuda.empty_cache()
outs = torch.zeros_like(inps)
attention_mask = cache["attention_mask"]
for i in range(len(layers)):
print(i)
layer = layers[i].to(dev)
for j in range(nsamples):
outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
layers[i] = layer.cpu()
del layer
torch.cuda.empty_cache()
inps, outs = outs, inps
if model.model.decoder.final_layer_norm is not None:
model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
if model.model.decoder.project_out is not None:
model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
model.lm_head = model.lm_head.to(dev)
testenc = testenc.to(dev)
nlls = []
for i in range(nsamples):
hidden_states = inps[i].unsqueeze(0)
if model.model.decoder.final_layer_norm is not None:
hidden_states = model.model.decoder.final_layer_norm(hidden_states)
if model.model.decoder.project_out is not None:
hidden_states = model.model.decoder.project_out(hidden_states)
lm_logits = model.lm_head(hidden_states)
shift_logits = lm_logits[:, :-1, :].contiguous()
shift_labels = testenc[:, (i * seqlen) : ((i + 1) * seqlen)][:, 1:]
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
neg_log_likelihood = loss.float() * seqlen
nlls.append(neg_log_likelihood)
ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
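# ppl above is exp of the average NLL per token: each nlls entry is the window's mean token loss
# scaled by seqlen, and the sum is normalized by nsamples * seqlen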
print(ppl.item())
model.config.use_cache = use_cache
def main():
traindataset, testenc = get_wikitext2(128, 0, 2048, pretrained_model_dir)
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # desc_act together with group_size only works with triton
)
# load the un-quantized model; it is always force-loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose only keys are "input_ids" and "attention_mask"
# with values of type torch.LongTensor.
model.quantize(traindataset, use_triton=False)
# save quantized model
model.save_quantized(quantized_model_dir)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# load the quantized model; currently only CPU or a single GPU is supported
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)
opt_eval(model.model, testenc, "cuda:0")
if __name__ == "__main__":
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()
import json
import random
import time
from argparse import ArgumentParser
import torch
from datasets import Dataset
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
def load_data(data_path, tokenizer, n_samples):
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
raw_data = random.sample(raw_data, k=min(n_samples, len(raw_data)))
def dummy_gen():
return raw_data
def tokenize(examples):
instructions = examples["instruction"]
inputs = examples["input"]
outputs = examples["output"]
prompts = []
texts = []
input_ids = []
attention_mask = []
for istr, inp, opt in zip(instructions, inputs, outputs):
if inp:
prompt = f"Instruction:\n{istr}\nInput:\n{inp}\nOutput:\n"
text = prompt + opt
else:
prompt = f"Instruction:\n{istr}\nOutput:\n"
text = prompt + opt
if len(tokenizer(prompt)["input_ids"]) >= tokenizer.model_max_length:
continue
tokenized_data = tokenizer(text)
input_ids.append(tokenized_data["input_ids"][: tokenizer.model_max_length])
attention_mask.append(tokenized_data["attention_mask"][: tokenizer.model_max_length])
prompts.append(prompt)
texts.append(text)
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"prompt": prompts,
}
dataset = Dataset.from_generator(dummy_gen)
dataset = dataset.map(
tokenize,
batched=True,
batch_size=len(dataset),
num_proc=1,
keep_in_memory=True,
load_from_cache_file=False,
remove_columns=["instruction", "input"],
)
dataset = dataset.to_list()
for sample in dataset:
sample["input_ids"] = torch.LongTensor(sample["input_ids"])
sample["attention_mask"] = torch.LongTensor(sample["attention_mask"])
return dataset
def main():
parser = ArgumentParser()
parser.add_argument("--pretrained_model_dir", type=str)
parser.add_argument("--quantized_model_dir", type=str, default=None)
parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4, 8])
parser.add_argument(
"--group_size",
type=int,
default=128,
help="group size, -1 means no grouping or full rank",
)
parser.add_argument("--desc_act", action="store_true", help="whether to quantize with desc_act")
parser.add_argument(
"--num_samples",
type=int,
default=128,
help="how many samples will be used to quantize model",
)
parser.add_argument(
"--save_and_reload",
action="store_true",
help="whether save quantized model to disk and reload back",
)
parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
parser.add_argument(
"--use_triton",
action="store_true",
help="whether use triton to speedup at inference",
)
parser.add_argument(
"--per_gpu_max_memory",
type=int,
default=None,
help="max memory used to load model per gpu",
)
parser.add_argument(
"--cpu_max_memory",
type=int,
default=None,
help="max memory used to offload model to cpu",
)
parser.add_argument(
"--quant_batch_size",
type=int,
default=1,
help="examples batch size for quantization",
)
parser.add_argument(
"--trust_remote_code",
action="store_true",
help="whether to trust remote code when loading model",
)
args = parser.parse_args()
max_memory = {}
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update({i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())})
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
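# illustrative example of the resulting mapping with --per_gpu_max_memory 16 --cpu_max_memory 64
# on a 2-GPU machine: {0: "16GIB", 1: "16GIB", "cpu": "64GIB"}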
tokenizer = AutoTokenizer.from_pretrained(
args.pretrained_model_dir,
use_fast=args.fast_tokenizer,
trust_remote_code=args.trust_remote_code,
)
model = AutoGPTQForCausalLM.from_pretrained(
args.pretrained_model_dir,
quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size, desc_act=args.desc_act),
max_memory=max_memory,
trust_remote_code=args.trust_remote_code,
)
examples = load_data("dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples)
examples_for_quant = [
{"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]} for example in examples
]
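# only "input_ids" and "attention_mask" are kept here, since those are the only keys
# model.quantize consumes; extra fields like "prompt" are used later for the benchmark prints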
start = time.time()
model.quantize(
examples_for_quant,
batch_size=args.quant_batch_size,
use_triton=args.use_triton,
autotune_warmup_after_quantized=args.use_triton,
)
end = time.time()
print(f"quantization took: {end - start: .4f}s")
if not args.quantized_model_dir:
args.quantized_model_dir = args.pretrained_model_dir
if args.save_and_reload:
model.save_quantized(args.quantized_model_dir)
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
model = AutoGPTQForCausalLM.from_quantized(
args.quantized_model_dir,
device="cuda:0",
use_triton=args.use_triton,
max_memory=max_memory,
inject_fused_mlp=True,
inject_fused_attention=True,
trust_remote_code=args.trust_remote_code,
)
pipeline_init_kwargs = {"model": model, "tokenizer": tokenizer}
if not max_memory:
pipeline_init_kwargs["device"] = "cuda:0"
pipeline = TextGenerationPipeline(**pipeline_init_kwargs)
for example in random.sample(examples, k=min(4, len(examples))):
print(f"prompt: {example['prompt']}")
print("-" * 42)
print(f"golden: {example['output']}")
print("-" * 42)
start = time.time()
generated_text = pipeline(
example["prompt"],
return_full_text=False,
num_beams=1,
max_length=len(example["input_ids"])
+ 128, # use this instead of max_new_tokens to avoid a UserWarning when integrating with logging
)[0]["generated_text"]
end = time.time()
print(f"quant: {generated_text}")
num_new_tokens = len(tokenizer(generated_text)["input_ids"])
print(f"generate {num_new_tokens} tokens using {end-start: .4f}s, {num_new_tokens / (end - start)} tokens/s.")
print("=" * 42)
if __name__ == "__main__":
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M:%S",
)
main()
from vllm import LLM, SamplingParams
if __name__ == '__main__':
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)
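# temperature and top_p control sampling randomness; max_tokens caps the generated length per prompt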
# Create an LLM.
llm = LLM(model="./Qwen1.5-7B-4bit-gptq-2", tensor_parallel_size=1, dtype="float16", trust_remote_code=True, enforce_eager=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")