Unverified Commit 977b281a authored by Phil Wee, committed by GitHub

Add support for load_in_8bit and trust_remote_code model params (#422)

* Add 8 bit support

* added trust remote code

* Update main.py

* Update huggingface.py

* Update huggingface.py

* Fix bugs

* Fix PR #422 issues

* fix(style): remove empty newline

* fix(stlye,args): run `pre-commit` and re-order args for backward-compat

* fix(style): pre-commit the `README.md`
parent 221186c7
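The new parameters are intended to be forwarded to the model constructor via `--model_args`, in the same comma-separated form as the existing README examples. A sketch of such an invocation (the checkpoint and task are placeholders, and the exact parsing of boolean values in `--model_args` is assumed rather than shown in this diff):
```bash
python main.py \
    --model hf-causal \
    --model_args pretrained=EleutherAI/gpt-j-6b,load_in_8bit=True,trust_remote_code=True \
    --tasks hellaswag \
    --device cuda:0
```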
@@ -45,7 +45,7 @@ python main.py \
--device cuda:0
```
Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partialy trained checkpoints:
Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints:
```bash
python main.py \
@@ -64,8 +64,8 @@ To use with [PEFT](https://github.com/huggingface/peft), take the call you would
python main.py \
--model hf-causal \
--model_args pretrained=EleutherAI/gpt-j-6b,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
```
Our library also supports the OpenAI API:
@@ -78,7 +78,7 @@ python main.py \
--tasks lambada_openai,hellaswag
```
While this functionality is only officially mantained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
While this functionality is only officially maintained for the official OpenAI API, it tends to also work for other hosting services that use the same API such as [goose.ai](goose.ai) with minor modification. We also have an implementation for the [TextSynth](https://textsynth.com/index.html) API, using `--model textsynth`.
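As a rough sketch of the TextSynth path (the `engine` argument name and its value below are assumptions, not taken from this diff):
```bash
python main.py \
    --model textsynth \
    --model_args engine=gptj_6B \
    --tasks lambada_openai
```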
To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
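A sketch of such a run (model and tasks are placeholders):
```bash
python main.py \
    --model gpt2 \
    --tasks lambada_openai,hellaswag \
    --check_integrity
```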
@@ -116,7 +116,7 @@ When reporting eval harness results, please also report the version of each task
## Test Set Decontamination
To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points nto found in the model trainign set. Unfortunately, outside of models trained on the Pile ans C4, its very rare that people who train models disclose the contents of the training data. However this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
To address concerns about train / test contamination, we provide utilities for comparing results on a benchmark using only the data points not found in the model training set. Unfortunately, outside of models trained on the Pile and C4, it's very rare that people who train models disclose the contents of the training data. However, this utility can be useful to evaluate models you have trained on private data, provided you are willing to pre-compute the necessary indices. We provide computed indices for 13-gram exact match deduplication against the Pile, and plan to add additional precomputed dataset indices in the future (including C4 and min-hash LSH deduplication).
For details on text decontamination, see the [decontamination guide](./docs/decontamination.md).
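As an illustration of the idea only (not the harness's actual implementation), 13-gram exact-match decontamination amounts to checking whether any contiguous 13-token window of an evaluation document also occurs in a precomputed index of training-set 13-grams:
```python
def ngrams(tokens, n=13):
    """Yield every contiguous n-gram of a token sequence as a tuple."""
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i : i + n])


def is_contaminated(doc_tokens, train_ngrams, n=13):
    """Return True if any n-gram of the eval document appears in the training index."""
    return any(gram in train_ngrams for gram in ngrams(doc_tokens, n))


# Toy usage: `train_ngrams` stands in for a precomputed index over the training corpus.
train_ngrams = set(ngrams(list(range(100))))
print(is_contaminated(list(range(50, 70)), train_ngrams))    # True  - overlaps the "training" tokens
print(is_contaminated(list(range(200, 220)), train_ngrams))  # False - no shared 13-gram
```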
......
import torch
import transformers
from typing import Optional
from lm_eval.base import BaseLM
@@ -13,6 +14,8 @@ class HFLM(BaseLM):
subfolder=None,
tokenizer=None,
batch_size=1,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
):
super().__init__()
@@ -38,13 +41,18 @@ class HFLM(BaseLM):
revision = revision + ("/" + subfolder if subfolder is not None else "")
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage
pretrained,
load_in_8bit=load_in_8bit,
low_cpu_mem_usage=low_cpu_mem_usage,
revision=revision,
trust_remote_code=trust_remote_code,
).to(self.device)
self.gpt2.eval()
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
)
assert isinstance(
......
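A minimal sketch of driving the class above directly from Python; the import path is an assumption about this version of the harness, the checkpoint is a placeholder, and `load_in_8bit=True` additionally requires the `bitsandbytes` package:
```python
from lm_eval.models.gpt2 import HFLM  # import path is an assumption

lm = HFLM(
    pretrained="EleutherAI/gpt-j-6b",  # placeholder checkpoint
    load_in_8bit=True,                 # quantize weights to mixed 8-bit at load time
    trust_remote_code=True,            # allow custom modeling code from the Hub repo
)
```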
@@ -83,6 +83,8 @@ class HuggingFaceAutoLM(BaseLM):
dtype: Optional[Union[str, torch.dtype]] = None,
device: Optional[Union[int, str]] = "cuda",
peft: str = None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
):
"""Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.
Args:
@@ -106,20 +108,20 @@ class HuggingFaceAutoLM(BaseLM):
Options:
"auto", "balanced", "balanced_low_0", "sequential"
See the `accelerate` docs for more details on these options:
https://huggingface.co/docs/accelerate/v0.12.0/en/usage_guides/big_modeling#designing-a-device-map
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.device_map
max_memory_per_gpu (Union[int, str], optional, defaults to None):
The maximum memory available for each GPU in bytes as `int` or in
the format f"{significand}{unit_symbol}" where {unit_symbol} is
any of ["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in
the "Parameters for big model inference" section of the following
docs:
https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/model#large-model-loading
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
max_cpu_memory (Union[int, str], optional, defaults to None):
The maximum available CPU RAM in bytes as `int` or in the format
f"{significand}{unit_symbol}" where {unit_symbol} is any of
["GB", "MB", "GIB", "MIB"]. Refer to the `max_memory` arg in the
"Parameters for big model inference" section of the following docs:
https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/model#large-model-loading
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.max_memory
offload_folder (str, optional, defaults to "./offload"):
The folder to offload weights into if `device_map` contains any
"disk" value.
@@ -129,8 +131,13 @@ class HuggingFaceAutoLM(BaseLM):
Use `dtype="auto"` to derive the type from the model’s weights.
peft (str, optional, defaults to None):
Path of the adapter weights to load from Huggingface. This will usually
include a directory that includes the files `adapter_config.json` and
include a directory that includes the files `adapter_config.json` and
`adapter_model.bin`. Compatible with [PEFT](https://github.com/huggingface/peft)
load_in_8bit (bool, optional, defaults to False):
If True, will convert the loaded model into a mixed-8bit quantized model. See:
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained.load_in_8bit
trust_remote_code (bool, optional, defaults to False):
If True, will allow custom modeling code from the model's Hub repository to be executed when loading the model. Only enable this for repositories you trust.
"""
super().__init__()
@@ -155,6 +162,7 @@ class HuggingFaceAutoLM(BaseLM):
self._max_length = max_length
self._config = self.AUTO_CONFIG_CLASS.from_pretrained(
pretrained,
trust_remote_code=trust_remote_code,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
)
@@ -167,20 +175,22 @@ class HuggingFaceAutoLM(BaseLM):
)
self.tokenizer.model_max_length = self.max_length
accelerate_kwargs = {}
model_kwargs = {}
if use_accelerate:
accelerate_kwargs = _get_accelerate_args(
model_kwargs = _get_accelerate_args(
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
)
model_kwargs["load_in_8bit"] = load_in_8bit
self.model = self._create_auto_model(
pretrained=pretrained,
trust_remote_code=trust_remote_code,
revision=revision,
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
**accelerate_kwargs,
**model_kwargs,
)
# note: peft_path can be different than pretrained model path
if peft is not None:
@@ -190,7 +200,7 @@ class HuggingFaceAutoLM(BaseLM):
revision=revision,
subfolder=subfolder,
torch_dtype=_get_dtype(dtype, self._config),
**accelerate_kwargs,
**model_kwargs,
)
self.model.eval()
torch.set_grad_enabled(False)
@@ -213,6 +223,8 @@ class HuggingFaceAutoLM(BaseLM):
device_map: Optional[Union[str, _DeviceMapping]] = None,
max_memory: Optional[dict] = None,
offload_folder: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
) -> transformers.AutoModel:
"""Returns a pre-trained pytorch model from a pre-trained model configuration."""
@@ -222,10 +234,12 @@ class HuggingFaceAutoLM(BaseLM):
device_map=device_map,
max_memory=max_memory,
offload_folder=offload_folder,
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
)
return model
def _create_auto_model_peft(
self,
*,
@@ -236,6 +250,8 @@ class HuggingFaceAutoLM(BaseLM):
device_map: Optional[Union[str, _DeviceMapping]] = None,
max_memory: Optional[dict] = None,
offload_folder: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
trust_remote_code: Optional[bool] = False,
torch_dtype: Optional[Union[str, torch.dtype]] = None,
):
model = self.AUTO_PEFT_CLASS.from_pretrained(
@@ -245,6 +261,8 @@ class HuggingFaceAutoLM(BaseLM):
device_map=device_map,
max_memory=max_memory,
offload_folder=offload_folder,
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype,
)
return model
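Tying the `HuggingFaceAutoLM` changes back to the CLI, the same options should be reachable through `--model_args`. The model alias below and its combination with `use_accelerate` are assumptions based on this diff rather than documented behaviour:
```bash
python main.py \
    --model hf-causal-experimental \
    --model_args pretrained=EleutherAI/gpt-j-6b,use_accelerate=True,load_in_8bit=True,trust_remote_code=True \
    --tasks hellaswag
```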
@@ -675,4 +693,3 @@ def stop_sequences_criteria(
],
]
)
@@ -42,6 +42,6 @@ setuptools.setup(
extras_require={
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
},
)
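For reference, the `extras_require` groups above are selected with pip's extras syntax; for example, a development install from a checkout of the repository:
```bash
pip install -e ".[dev,sentencepiece]"
```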