Commit c11ad4f2 authored by gk

GPTQ: add auto-gptq extra, add gptq_use_triton parameter

parent b465cd01
@@ -29,6 +29,12 @@ To install additional multilingual tokenization and text segmentation packages,
 pip install -e ".[multilingual]"
 ```
+To support loading GPTQ quantized models, install the package with the `auto-gptq` extra:
+```bash
+pip install -e ".[auto-gptq]"
+```
 ## Basic Usage
 > **Note**: When reporting results from eval harness, please include the task versions (shown in `results["versions"]`) for reproducibility. This allows bug fixes to tasks while also ensuring that previously reported scores are reproducible. See the [Task Versioning](#task-versioning) section for more info.
@@ -111,12 +117,12 @@ python main.py \
     --device cuda:0
 ```
-GPTQ quantized models can be loaded by installing [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) with `pip install auto-gptq[triton]` and specifying their file names in `,quantized=NAME` (or `,quantized=True` for default names) in the `model_args` argument:
+GPTQ quantized models can be loaded by specifying their file names in `,quantized=NAME` (or `,quantized=True` for default names) in the `model_args` argument:
 ```bash
 python main.py \
     --model hf-causal-experimental \
-    --model_args pretrained=model-name-or-path,quantized=model.safetensors \
+    --model_args pretrained=model-name-or-path,quantized=model.safetensors,gptq_use_triton=True \
     --tasks hellaswag
 ```
...
@@ -88,6 +88,7 @@ class HuggingFaceAutoLM(BaseLM):
         peft: str = None,
         load_in_8bit: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
+        gptq_use_triton: Optional[bool] = False,
     ):
         """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
         Args:
@@ -144,6 +145,8 @@ class HuggingFaceAutoLM(BaseLM):
                 https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit
             trust_remote_code (bool, optional, defaults to False):
                 If True, will trust the remote code when loading the model.
+            gptq_use_triton (bool, optional, defaults to False):
+                Use Triton for GPTQ inference.
         """
         super().__init__()
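For context, a minimal sketch of how the new keyword argument could be passed when constructing the wrapper directly from Python; the `AutoCausalLM` class name and import path are assumptions about the package layout, and the CLI equivalent via `--model_args` is shown in the README hunk above:

```python
# Hypothetical direct instantiation of the HF wrapper with the new flag.
# Class name and import path are assumed, not confirmed by this diff.
from lm_eval.models.huggingface import AutoCausalLM

lm = AutoCausalLM(
    pretrained="model-name-or-path",
    quantized="model.safetensors",   # GPTQ checkpoint file name (or True for the default name)
    gptq_use_triton=True,            # route GPTQ inference through AutoGPTQ's Triton kernels
)
```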
@@ -202,6 +205,7 @@ class HuggingFaceAutoLM(BaseLM):
             revision=revision,
             subfolder=subfolder,
             torch_dtype=_get_dtype(dtype, self._config),
+            gptq_use_triton=gptq_use_triton,
             **model_kwargs,
         )
         # note: peft_path can be different than pretrained model path
@@ -239,6 +243,7 @@ class HuggingFaceAutoLM(BaseLM):
         load_in_8bit: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
         torch_dtype: Optional[Union[str, torch.dtype]] = None,
+        gptq_use_triton: Optional[bool] = False,
     ) -> transformers.AutoModel:
         """Returns a pre-trained pytorch model from a pre-trained model configuration."""
         if quantized is None:
@@ -261,7 +266,8 @@ class HuggingFaceAutoLM(BaseLM):
                 max_memory=max_memory,
                 trust_remote_code=trust_remote_code,
                 use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
-                use_triton=True,
+                use_triton=gptq_use_triton,
+                warmup_triton=gptq_use_triton,
             )
         return model
...
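Putting the fragments above together, here is a self-contained sketch of the GPTQ loading path as it might look after this change; it is reconstructed from the diff, and the `model_basename` handling plus the helper's name are assumptions rather than part of the patch:

```python
# Sketch of how the quantized branch of _create_auto_model could call AutoGPTQ
# after this commit; names not shown in the diff are hypothetical.
from pathlib import Path
from auto_gptq import AutoGPTQForCausalLM


def load_gptq_model(pretrained, quantized, trust_remote_code=False, gptq_use_triton=False):
    # `quantized` is either True (use AutoGPTQ's default checkpoint name)
    # or an explicit file name such as "model.safetensors".
    model_basename = None if quantized is True else Path(quantized).stem
    return AutoGPTQForCausalLM.from_quantized(
        pretrained,
        model_basename=model_basename,
        trust_remote_code=trust_remote_code,
        use_safetensors=True if quantized is True else quantized.endswith(".safetensors"),
        use_triton=gptq_use_triton,      # only use the Triton kernels when requested
        warmup_triton=gptq_use_triton,   # pre-compile the Triton kernels at load time
    )
```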
@@ -44,5 +44,6 @@ setuptools.setup(
         "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
         "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
         "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
+        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
     },
 )