Unverified Commit 4f8e479e authored by Qubitium-ModelCloud's avatar Qubitium-ModelCloud Committed by GitHub
Browse files

Add GPTQModel support for evaluating GPTQ models (#2217)



* support gptqmodel

* code opt

* add gptqmodel option

* Update huggingface.py

* Update pyproject.toml

* gptqmodel version upgraded to 1.0.6

* GPTQModel version upgraded to 1.0.8

* Update pyproject.toml

* fix ruff-format error

* add gptqmodel test

* Update gptqmodel test model

* skip cuda

* python3.8 compatible

* Update README.md

* Update README.md

---------
Co-authored-by: default avatarCL-ModelCloud <cl@modelcloud.ai>
parent 57272b63
......@@ -39,7 +39,7 @@ This project provides a unified framework to test generative language models on
**Features:**
- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
......@@ -319,8 +319,16 @@ lm_eval --model hf \
--tasks hellaswag
```
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:
GPTQ quantized models can be loaded using [GPTQModel](https://github.com/ModelCloud/GPTQModel) (faster) or [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ).
GPTQModel: add `,gptqmodel=True` to `model_args`:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,gptqmodel=True \
--tasks hellaswag
```
AutoGPTQ: add `,autogptq=True` to `model_args`:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
......
......@@ -87,6 +87,7 @@ class HFLM(TemplateLM):
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
**kwargs,
) -> None:
super().__init__()
......@@ -192,6 +193,7 @@ class HFLM(TemplateLM):
peft=peft,
delta=delta,
autogptq=autogptq,
gptqmodel=gptqmodel,
**kwargs,
)
......@@ -530,6 +532,7 @@ class HFLM(TemplateLM):
peft: Optional[str] = None,
delta: Optional[str] = None,
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
**kwargs,
) -> None:
"""
......@@ -557,7 +560,7 @@ class HFLM(TemplateLM):
)
)
if not autogptq:
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
assert (
transformers.__version__ >= "4.30.0"
......@@ -577,23 +580,42 @@ class HFLM(TemplateLM):
**model_kwargs,
)
else:
try:
from auto_gptq import AutoGPTQForCausalLM
except ModuleNotFoundError as exception:
raise type(exception)(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
if autogptq and gptqmodel:
raise ValueError(
"Cannot use both 'autogptq' and 'gptqmodel' options at the same time."
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
trust_remote_code=trust_remote_code,
model_basename=None if autogptq is True else Path(autogptq).stem,
use_safetensors=True
if autogptq is True
else autogptq.endswith(".safetensors"),
**model_kwargs,
)
if autogptq:
try:
from auto_gptq import AutoGPTQForCausalLM
except ModuleNotFoundError as exception:
raise type(exception)(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
trust_remote_code=trust_remote_code,
model_basename=None if autogptq is True else Path(autogptq).stem,
use_safetensors=True
if autogptq is True
else autogptq.endswith(".safetensors"),
**model_kwargs,
)
if gptqmodel:
try:
from gptqmodel import GPTQModel
except ModuleNotFoundError as exception:
raise type(exception)(
"Tried to load gptqmodel, but gptqmodel is not installed ",
"please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
)
self._model = GPTQModel.from_quantized(
pretrained, trust_remote_code=trust_remote_code, **model_kwargs
)
if peft and delta:
raise ValueError(
......
......@@ -76,6 +76,7 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm>=0.4.2"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
gptqmodel = ["gptqmodel>=1.0.9"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
......
from typing import List
import pytest
import lm_eval
def assert_less_than(value, threshold, desc):
    """Assert that *value* is strictly below *threshold*.

    A ``None`` value is treated as "metric not produced" and is silently
    accepted, so callers can run the same check whether or not the
    evaluation emitted the metric.  Non-``None`` values are coerced with
    ``float`` before comparison, so numeric strings are accepted too.
    *desc* names the metric in the failure message.
    """
    if value is None:
        return
    assert float(value) < threshold, f"{desc} should be less than {threshold}"
@pytest.mark.skip(reason="requires CUDA")
class Test_GPTQModel:
    """End-to-end check that a GPTQ-quantized model can be evaluated through
    the ``hf`` model backend with ``gptqmodel=True``.

    Runs ``arc_easy`` on a small pre-quantized OPT model and asserts the
    accuracy metrics stay below loose upper bounds (sanity check that the
    quantized weights were actually loaded, not a quality benchmark).
    """

    # Skip the entire class when gptqmodel (>= 1.0.9) is not installed.
    gptqmodel = pytest.importorskip("gptqmodel", minversion="1.0.9")
    MODEL_ID = "ModelCloud/Opt-125-GPTQ-4bit-10-25-2024"

    def test_gptqmodel(self) -> None:
        acc = "acc"
        acc_norm = "acc_norm"
        acc_value = None
        acc_norm_value = None
        task = "arc_easy"
        model_args = f"pretrained={self.MODEL_ID},gptqmodel=True"
        tasks: List[str] = [task]

        results = lm_eval.simple_evaluate(
            model="hf",
            model_args=model_args,
            tasks=tasks,
            device="cuda",
        )

        column = "results"
        # BUG FIX: the original read ``self.task`` here, but ``task`` is a
        # local variable of this method and never a class/instance attribute,
        # so the lookup raised AttributeError before any metric was checked.
        dic = results.get(column, {}).get(task)
        if dic is not None:
            if "alias" in dic:
                _ = dic.pop("alias")
            for key, value in sorted(dic.items()):
                # Metric keys look like "acc,none" — strip the filter suffix.
                metric, _, _filter = key.partition(",")
                if metric.endswith("_stderr"):
                    continue
                if metric == acc:
                    acc_value = "%.4f" % value if isinstance(value, float) else value
                if metric == acc_norm:
                    acc_norm_value = "%.4f" % value if isinstance(value, float) else value

        assert_less_than(acc_value, 0.43, "acc")
        assert_less_than(acc_norm_value, 0.39, "acc_norm")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment