Commit b296c4f6 authored by gk

Add support for loading GPTQ models via AutoGPTQ

parent 84ef60ee
@@ -7,7 +7,7 @@ This project provides a unified framework to test generative language models on
Features:
- 200+ tasks implemented. See the [task-table](./docs/task_table.md) for a complete list.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (with [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Evaluating with publicly available prompts ensures reproducibility and comparability between papers.
@@ -111,6 +111,14 @@ python main.py \
--device cuda:0
```
GPTQ quantized models can be loaded by specifying their file name via `quantized=NAME` in the comma-separated `model_args` argument:
```bash
python main.py \
--model hf-causal-experimental \
--model_args pretrained=model-directory,quantized=model.safetensors \
--tasks hellaswag
```
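Note that this loading path requires the [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) package to be installed; the loader also enables AutoGPTQ's Triton kernels (`use_triton=True`), so a Triton-capable GPU environment is assumed.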
We support wildcards in task names; for example, you can run all of the machine-translated LAMBADA tasks via `--tasks lambada_openai_mt_*`.
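For instance, a sketch of such an invocation, reusing the hypothetical `model-directory` placeholder from the example above:
```bash
python main.py \
    --model hf-causal-experimental \
    --model_args pretrained=model-directory \
    --tasks lambada_openai_mt_*
```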
@@ -3,6 +3,7 @@ import torch
import torch.nn.functional as F
import transformers
import peft
from pathlib import Path
from typing import List, Mapping, NewType, Optional, Tuple, Union
from tqdm import tqdm
@@ -69,6 +70,7 @@ class HuggingFaceAutoLM(BaseLM):
def __init__(
self,
pretrained: str,
quantized: Optional[str] = None,
tokenizer: Optional[str] = None,
subfolder: Optional[str] = None,
revision: Optional[str] = "main",
@@ -93,6 +95,8 @@ class HuggingFaceAutoLM(BaseLM):
The HuggingFace Hub model ID name or the path to a pre-trained
model to load. This is effectively the `pretrained_model_name_or_path`
argument of `from_pretrained` in the HuggingFace `transformers` API.
quantized (str, optional, defaults to None):
File name of a GPTQ quantized model to load (e.g. `model.safetensors`).
add_special_tokens (bool, optional, defaults to True):
Whether to add special tokens to the input sequences. If `None`, the
default value will be set to `True` for seq2seq models (e.g. T5) and
@@ -192,6 +196,7 @@ class HuggingFaceAutoLM(BaseLM):
model_kwargs["load_in_8bit"] = load_in_8bit
self.model = self._create_auto_model(
pretrained=pretrained,
quantized=quantized,
trust_remote_code=trust_remote_code,
revision=revision,
subfolder=subfolder,
@@ -224,6 +229,7 @@ class HuggingFaceAutoLM(BaseLM):
self,
*,
pretrained: str,
quantized: Optional[str] = None,
revision: str,
subfolder: str,
device_map: Optional[Union[str, _DeviceMapping]] = None,
@@ -234,16 +240,28 @@ class HuggingFaceAutoLM(BaseLM):
torch_dtype: Optional[Union[str, torch.dtype]] = None,
) -> transformers.AutoModel:
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
device_map=device_map,
max_memory=max_memory,
offload_folder=offload_folder,
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
)
if quantized is None:
    model = self.AUTO_MODEL_CLASS.from_pretrained(
        pretrained,
        revision=revision + ("/" + subfolder if subfolder is not None else ""),
        device_map=device_map,
        max_memory=max_memory,
        offload_folder=offload_folder,
        load_in_8bit=load_in_8bit,
        trust_remote_code=trust_remote_code,
        torch_dtype=torch_dtype,
    )
else:
    # Import lazily so that auto_gptq remains an optional dependency.
    from auto_gptq import AutoGPTQForCausalLM

    model = AutoGPTQForCausalLM.from_quantized(
        pretrained,
        # AutoGPTQ expects the checkpoint's base name without its extension.
        model_basename=Path(quantized).stem,
        device_map=device_map,
        max_memory=max_memory,
        trust_remote_code=trust_remote_code,
        use_safetensors=quantized.endswith(".safetensors"),
        # Requires the Triton GPTQ kernels; note that `revision` and
        # `subfolder` are not forwarded on this path.
        use_triton=True,
    )
return model
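# Worked example, as a sketch in comments: how a `quantized` value passed via
# `model_args` (e.g. "model.safetensors", as in the README) maps to the
# AutoGPTQ arguments above:
#
#     Path("model.safetensors").stem                # "model" -> model_basename
#     "model.safetensors".endswith(".safetensors")  # True    -> use_safetensors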
def _create_auto_model_peft(