Commit b296c4f6 authored by gk

Add support for loading GPTQ models via AutoGPTQ

parent 84ef60ee
@@ -7,7 +7,7 @@ This project provides a unified framework to test generative language models on
Features:
- 200+ tasks implemented. See the [task-table](./docs/task_table.md) for a complete list.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (with [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Evaluating with publicly available prompts ensures reproducibility and comparability between papers.
@@ -111,6 +111,14 @@ python main.py \
--device cuda:0
```
GPTQ-quantized models can be loaded by passing the checkpoint file name via `,quantized=NAME` in the `model_args` argument:
```bash
python main.py \
--model hf-causal-experimental \
--model_args pretrained=model-directory,quantized=model.safetensors \
--tasks hellaswag
```
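Loading a GPTQ checkpoint this way imports AutoGPTQ at runtime, so the library must be installed first; a minimal setup sketch, assuming the package is published on PyPI as `auto-gptq`:
```bash
pip install auto-gptq
```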
We support wildcards in task names; for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`.
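As an illustration, a hypothetical invocation that reuses the flags from the command above to evaluate every machine-translated LAMBADA variant (`model-directory` is a placeholder path):
```bash
python main.py \
    --model hf-causal-experimental \
    --model_args pretrained=model-directory \
    --tasks lambada_openai_mt_*
```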
...
@@ -3,6 +3,7 @@ import torch
import torch.nn.functional as F
import transformers
import peft
from pathlib import Path
from typing import List, Mapping, NewType, Optional, Tuple, Union
from tqdm import tqdm
@@ -69,6 +70,7 @@ class HuggingFaceAutoLM(BaseLM):
def __init__(
self,
pretrained: str,
quantized: Optional[str] = None,
tokenizer: Optional[str] = None,
subfolder: Optional[str] = None,
revision: Optional[str] = "main",
@@ -93,6 +95,8 @@ class HuggingFaceAutoLM(BaseLM):
The HuggingFace Hub model ID name or the path to a pre-trained
model to load. This is effectively the `pretrained_model_name_or_path`
argument of `from_pretrained` in the HuggingFace `transformers` API.
quantized (str, optional, defaults to None):
File name of a GPTQ model to load.
add_special_tokens (bool, optional, defaults to True):
Whether to add special tokens to the input sequences. If `None`, the
default value will be set to `True` for seq2seq models (e.g. T5) and
@@ -192,6 +196,7 @@ class HuggingFaceAutoLM(BaseLM):
model_kwargs["load_in_8bit"] = load_in_8bit
self.model = self._create_auto_model(
pretrained=pretrained,
quantized=quantized,
trust_remote_code=trust_remote_code,
revision=revision,
subfolder=subfolder,
@@ -224,6 +229,7 @@ class HuggingFaceAutoLM(BaseLM):
self,
*,
pretrained: str,
quantized: Optional[str] = None,
revision: str,
subfolder: str,
device_map: Optional[Union[str, _DeviceMapping]] = None,
@@ -234,16 +240,28 @@ class HuggingFaceAutoLM(BaseLM):
torch_dtype: Optional[Union[str, torch.dtype]] = None,
) -> transformers.AutoModel:
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
if quantized is None:
    model = self.AUTO_MODEL_CLASS.from_pretrained(
        pretrained,
        revision=revision + ("/" + subfolder if subfolder is not None else ""),
        device_map=device_map,
        max_memory=max_memory,
        offload_folder=offload_folder,
        load_in_8bit=load_in_8bit,
        trust_remote_code=trust_remote_code,
        torch_dtype=torch_dtype,
    )
else:
    from auto_gptq import AutoGPTQForCausalLM
    model = AutoGPTQForCausalLM.from_quantized(
        pretrained,
        model_basename=Path(quantized).stem,
        device_map=device_map,
        max_memory=max_memory,
        trust_remote_code=trust_remote_code,
        use_safetensors=quantized.endswith('.safetensors'),
        use_triton=True,
    )
return model
def _create_auto_model_peft(
...