Commit c11ad4f2 authored by gk

GPTQ: add auto-gptq extra, add gptq_use_triton parameter

parent b465cd01
@@ -29,6 +29,12 @@ To install additional multilingual tokenization and text segmentation packages,
 pip install -e ".[multilingual]"
 ```
+To support loading GPTQ quantized models, install the package with the `auto-gptq` extra:
+```bash
+pip install -e ".[auto-gptq]"
+```
 ## Basic Usage
 > **Note**: When reporting results from eval harness, please include the task versions (shown in `results["versions"]`) for reproducibility. This allows bug fixes to tasks while also ensuring that previously reported scores are reproducible. See the [Task Versioning](#task-versioning) section for more info.
@@ -111,12 +117,12 @@ python main.py \
     --device cuda:0
 ```
-GPTQ quantized models can be loaded by installing [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) with `pip install auto-gptq[triton]` and specifying their file names in `,quantized=NAME` (or `,quantized=True` for default names) in the `model_args` argument:
+GPTQ quantized models can be loaded by specifying their file names in `,quantized=NAME` (or `,quantized=True` for default names) in the `model_args` argument:
 ```bash
 python main.py \
     --model hf-causal-experimental \
-    --model_args pretrained=model-name-or-path,quantized=model.safetensors \
+    --model_args pretrained=model-name-or-path,quantized=model.safetensors,gptq_use_triton=True \
     --tasks hellaswag
 ```
...
@@ -88,6 +88,7 @@ class HuggingFaceAutoLM(BaseLM):
         peft: str = None,
         load_in_8bit: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
+        gptq_use_triton: Optional[bool] = False,
     ):
         """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
         Args:
@@ -144,6 +145,8 @@ class HuggingFaceAutoLM(BaseLM):
                 https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit
             trust_remote_code (bool, optional, defaults to False):
                 If True, will trust the remote code when loading the model.
+            gptq_use_triton (bool, optional, defaults to False):
+                Use Triton for GPTQ inference.
         """
         super().__init__()
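For context, a minimal sketch of how the new keyword argument could be passed when constructing the wrapper directly from Python; the `AutoCausalLM` class name and import path are assumptions about the package layout, and the CLI equivalent via `--model_args` is shown in the README hunk above:

```python
# Hypothetical direct instantiation of the HF wrapper with the new flag.
# Class name and import path are assumed, not confirmed by this diff.
from lm_eval.models.huggingface import AutoCausalLM

lm = AutoCausalLM(
    pretrained="model-name-or-path",
    quantized="model.safetensors",   # GPTQ checkpoint file name (or True for the default name)
    gptq_use_triton=True,            # route GPTQ inference through AutoGPTQ's Triton kernels
)
```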
@@ -202,6 +205,7 @@ class HuggingFaceAutoLM(BaseLM):
             revision=revision,
             subfolder=subfolder,
             torch_dtype=_get_dtype(dtype, self._config),
+            gptq_use_triton=gptq_use_triton,
             **model_kwargs,
         )
         # note: peft_path can be different than pretrained model path
@@ -239,6 +243,7 @@ class HuggingFaceAutoLM(BaseLM):
         load_in_8bit: Optional[bool] = False,
         trust_remote_code: Optional[bool] = False,
         torch_dtype: Optional[Union[str, torch.dtype]] = None,
+        gptq_use_triton: Optional[bool] = False,
     ) -> transformers.AutoModel:
         """Returns a pre-trained pytorch model from a pre-trained model configuration."""
         if quantized is None:
@@ -261,7 +266,8 @@ class HuggingFaceAutoLM(BaseLM):
                 max_memory=max_memory,
                 trust_remote_code=trust_remote_code,
                 use_safetensors=True if quantized == True else quantized.endswith('.safetensors'),
-                use_triton=True,
+                use_triton=gptq_use_triton,
+                warmup_triton=gptq_use_triton,
             )
         return model
...
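Putting the fragments above together, here is a self-contained sketch of the GPTQ loading path as it might look after this change; it is reconstructed from the diff, and the `model_basename` handling plus the helper's name are assumptions rather than part of the patch:

```python
# Sketch of how the quantized branch of _create_auto_model could call AutoGPTQ
# after this commit; names not shown in the diff are hypothetical.
from pathlib import Path
from auto_gptq import AutoGPTQForCausalLM


def load_gptq_model(pretrained, quantized, trust_remote_code=False, gptq_use_triton=False):
    # `quantized` is either True (use AutoGPTQ's default checkpoint name)
    # or an explicit file name such as "model.safetensors".
    model_basename = None if quantized is True else Path(quantized).stem
    return AutoGPTQForCausalLM.from_quantized(
        pretrained,
        model_basename=model_basename,
        trust_remote_code=trust_remote_code,
        use_safetensors=True if quantized is True else quantized.endswith(".safetensors"),
        use_triton=gptq_use_triton,      # only use the Triton kernels when requested
        warmup_triton=gptq_use_triton,   # pre-compile the Triton kernels at load time
    )
```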
@@ -44,5 +44,6 @@ setuptools.setup(
         "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
         "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
         "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
+        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
     },
 )