Unverified Commit 977b281a authored by Phil Wee, committed by GitHub

Add support for load_in_8bit and trust_remote_code model params (#422)

* Add 8 bit support

* added trust remote code

* Update main.py

* Update huggingface.py

* Update huggingface.py

* Fix bugs

* Fix PR #422 issues

* fix(style): remove empty newline

* fix(stlye,args): run `pre-commit` and re-order args for backward-compat

* fix(style): pre-commit the `README.md`
parent 221186c7
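The new parameters are intended to be forwarded to the model constructor via `--model_args`, in the same comma-separated form as the existing README examples. A sketch of such an invocation (the checkpoint and task are placeholders, and the exact parsing of boolean values in `--model_args` is assumed rather than shown in this diff):
```bash
python main.py \
    --model hf-causal \
    --model_args pretrained=EleutherAI/gpt-j-6b,load_in_8bit=True,trust_remote_code=True \
    --tasks hellaswag \
    --device cuda:0
```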
@@ -45,7 +45,7 @@ python main.py \
--device cuda:0
```
Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partialy trained checkpoints:
Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints:
```bash
python main.py \
@@ -64,8 +64,8 @@ To use with [PEFT](https://github.com/huggingface/peft), take the call you would
python main.py \
--model hf-causal \
--model_args pretrained=EleutherAI/gpt-j-6b,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
```
Our library also supports the OpenAI API:
@@ -78,7 +78,7 @@ python main.py \
--tasks lambada_openai,hellaswag
```
While this functionality is only officially mantained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
While this functionality is only officially maintained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
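As a rough sketch of the TextSynth path (the `engine` argument name and its value below are assumptions, not taken from this diff):
```bash
python main.py \
    --model textsynth \
    --model_args engine=gptj_6B \
    --tasks lambada_openai
```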
To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
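A sketch of such a run (model and tasks are placeholders):
```bash
python main.py \
    --model gpt2 \
    --tasks lambada_openai,hellaswag \
    --check_integrity
```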
@@ -116,7 +116,7 @@ When reporting eval harness results, please also report the version of each task
## Test Set Decontamination
To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points nto found in the model trainign set. Unfortunately, outside of models trained on the Pile ans C4, its very rare that people who train models disclose the contents of the training data. However this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points not found in the model training set. Unfortunately, outside of models trained on the Pile and C4, it's very rare that people who train models disclose the contents of the training data. However, this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
For details on text decontamination, see the [decontamination guide](./docs/decontamination.md).
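As an illustration of the idea only (not the harness's actual implementation), 13-gram exact-match decontamination amounts to checking whether any contiguous 13-token window of an evaluation document also occurs in a precomputed index of training-set 13-grams:
```python
def ngrams(tokens, n=13):
    """Yield every contiguous n-gram of a token sequence as a tuple."""
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i : i + n])


def is_contaminated(doc_tokens, train_ngrams, n=13):
    """Return True if any n-gram of the eval document appears in the training index."""
    return any(gram in train_ngrams for gram in ngrams(doc_tokens, n))


# Toy usage: `train_ngrams` stands in for a precomputed index over the training corpus.
train_ngrams = set(ngrams(list(range(100))))
print(is_contaminated(list(range(50, 70)), train_ngrams))    # True  - overlaps the "training" tokens
print(is_contaminated(list(range(200, 220)), train_ngrams))  # False - no shared 13-gram
```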
......
import torch
import transformers
from typing import Optional
from lm_eval.base import BaseLM
@@ -13,6 +14,8 @@ class HFLM(BaseLM):
subfolder=None,
tokenizer=None,
batch_size=1,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
):
super().__init__()
@@ -38,13 +41,18 @@ class HFLM(BaseLM):
revision = revision + ("/" + subfolder if subfolder is not None else "")
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage
pretrained,
load_in_8bit=load_in_8bit,
low_cpu_mem_usage=low_cpu_mem_usage,
revision=revision,
trust_remote_code=trust_remote_code,
).to(self.device)
self.gpt2.eval()
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
)
assert isinstance(
......
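A minimal sketch of driving the class above directly from Python; the import path is an assumption about this version of the harness, the checkpoint is a placeholder, and `load_in_8bit=True` additionally requires the `bitsandbytes` package:
```python
from lm_eval.models.gpt2 import HFLM  # import path is an assumption

lm = HFLM(
    pretrained="EleutherAI/gpt-j-6b",  # placeholder checkpoint
    load_in_8bit=True,                 # quantize weights to mixed 8-bit at load time
    trust_remote_code=True,            # allow custom modeling code from the Hub repo
)
```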
@@ -83,6 +83,8 @@ class HuggingFaceAutoLM(BaseLM):
dtype: Optional[Union[str, torch.dtype]] = None,
device: Optional[Union[int, str]] = "cuda",
peft: str = None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
):
"""Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
Args:
@@ -106,20 +108,20 @@ class HuggingFaceAutoLM(BaseLM):
Options:
"auto", "balanced", "balanced_low_0", "sequential"
See the `accelerate` docs for more details on these options:
https://huggingface.co/docs/accelerate/v0.12.0/en/usage_guides/big_modeling#designing-a-device-map
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map
max_memory_per_gpu (Union[int, str], optional, defaults to None):
The maximum memory available for each GPU in bytes as `int` or in
the format f"{significand}{unit_symbol}" where {unit_symbol} is
any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in
the "Parameters for big model inference" section of the following
docs:
https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/model#large-model-loading
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
max_cpu_memory (Union[int, str], optional, defaults to None):
The maximum available CPU RAM in bytes as `int` or in the format
f"{significand}{unit_symbol}" where {unit_symbol} is any of
["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the
"Parameters for big model inference" section of the following docs:
https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/model#large-model-loading
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
offload_folder (str, optional, defaults to "./offload"):
The folder to offload weights into if `device_map` contains any
"disk" value.
@@ -129,8 +131,13 @@ class HuggingFaceAutoLM(BaseLM):
Use `dtype="auto"` to derive the type from the model’s weights.
peft (str, optional, defaults to None):
Path of the adapter weights to load from Huggingface. This will usually
include a directory that includes the files `adapter_config.json` and
include a directory that includes the files `adapter_config.json` and
`adapter_model.bin`. Compatible with [PEFT](https://github.com/huggingface/peft)
load_in_8bit (bool, optional, defaults to False):
If True, will convert the loaded model into a mixed-8bit quantized model. See:
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit
trust_remote_code (bool, optional, defaults to False):
If True, will allow custom modeling code from the model's Hub repository to be executed when loading the model. Only enable this for repositories you trust.
"""
super().__init__()
@@ -155,6 +162,7 @@ class HuggingFaceAutoLM(BaseLM):
self._max_length = max_length
self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
pretrained,
trust_remote_code=trust_remote_code,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
)
@@ -167,20 +175,22 @@ class HuggingFaceAutoLM(BaseLM):
)
self.tokenizer.model_max_length = self.max_length
accelerate_kwargs = {}
model_kwargs = {}
if use_accelerate:
accelerate_kwargs = _get_accelerate_args(
model_kwargs = _get_accelerate_args(
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
)
model_kwargs["load_in_8bit"] = load_in_8bit
self.model = self._create_auto_model(
pretrained=pretrained,
trust_remote_code=trust_remote_code,
revision=revision,
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
**accelerate_kwargs,
**model_kwargs,
)
# note: peft_path can be different than pretrained model path
if peft is not None:
@@ -190,7 +200,7 @@ class HuggingFaceAutoLM(BaseLM):
revision=revision,
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
**accelerate_kwargs,
**model_kwargs,
)
self.model.eval()
torch.set_grad_enabled(False)
@@ -213,6 +223,8 @@ class HuggingFaceAutoLM(BaseLM):
device_map: Optional[Union[str, _DeviceMapping]] = None,
max_memory: Optional[dict] = None,
offload_folder: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
) -> transformers.AutoModel:
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
@@ -222,10 +234,12 @@ class HuggingFaceAutoLM(BaseLM):
device_map=device_map,
max_memory=max_memory,
offload_folder=offload_folder,
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
)
return model
def _create_auto_model_peft(
self,
*,
@@ -236,6 +250,8 @@ class HuggingFaceAutoLM(BaseLM):
device_map: Optional[Union[str, _DeviceMapping]] = None,
max_memory: Optional[dict] = None,
offload_folder: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
):
model = self.AUTO_PEFT_CLASS.from_pretrained(
@@ -245,6 +261,8 @@ class HuggingFaceAutoLM(BaseLM):
device_map=device_map,
max_memory=max_memory,
offload_folder=offload_folder,
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
)
return model
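Tying the `HuggingFaceAutoLM` changes back to the CLI, the same options should be reachable through `--model_args`. The model alias below and its combination with `use_accelerate` are assumptions based on this diff rather than documented behaviour:
```bash
python main.py \
    --model hf-causal-experimental \
    --model_args pretrained=EleutherAI/gpt-j-6b,use_accelerate=True,load_in_8bit=True,trust_remote_code=True \
    --tasks hellaswag
```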
@@ -675,4 +693,3 @@ def stop_sequences_criteria(
],
]
)
@@ -42,6 +42,6 @@ setuptools.setup(
extras_require={
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
},
)
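For reference, the `extras_require` groups above are selected with pip's extras syntax; for example, a development install from a checkout of the repository:
```bash
pip install -e ".[dev,sentencepiece]"
```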