Unverified Commit 6179fdf5 authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #427 from EleutherAI/hotfix-hf-causal

Hotfix: patch issues with the `huggingface.py` model classes
parents 977b281a 62d7937b
......@@ -55,14 +55,14 @@ python main.py \
--device cuda:0
```
To evaluate models that are called via `AutoSeq2SeqLM`, you instead use `hf-seq2seq`.
To evaluate models that are loaded via `AutoSeq2SeqLM` in Huggingface, you instead use `hf-seq2seq`. *To evaluate (causal) models across multiple GPUs, use `--model hf-causal-experimental`. Note that this is an experimental implementation.*
> **Warning**: Choosing the wrong model type may produce erroneous outputs even though no error is raised.
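For reference, an `hf-seq2seq` run follows the same invocation pattern as the causal examples; the checkpoint below (`google/flan-t5-small`) and the task list are only illustrative choices, not part of this change:
```bash
python main.py \
    --model hf-seq2seq \
    --model_args pretrained=google/flan-t5-small \
    --tasks boolq,arc_easy \
    --device cuda:0
```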
To use with [PEFT](https://github.com/huggingface/peft), take the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument as shown below:
```bash
python main.py \
--model hf-causal \
--model hf-causal-experimental \
--model_args pretrained=EleutherAI/gpt-j-6b,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
......
......@@ -6,7 +6,8 @@ from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"hf-causal": huggingface.AutoCausalLM,
"hf-causal": gpt2.HFLM,
"hf-causal-experimental": huggingface.AutoCausalLM,
"hf-seq2seq": huggingface.AutoSeq2SeqLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
......
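A quick sketch of what the registry remap above means in practice, assuming the package's existing `lm_eval.models.get_model` lookup helper: `hf-causal` now resolves back to the stable `gpt2.HFLM` implementation, while the newer multi-GPU-capable class is opt-in under `hf-causal-experimental`.
```python
# Hedged sketch: resolve --model names through the registry shown above.
from lm_eval import models  # assumes lm_eval.models.get_model wraps MODEL_REGISTRY

causal_cls = models.get_model("hf-causal")                     # gpt2.HFLM (stable path)
experimental_cls = models.get_model("hf-causal-experimental")  # huggingface.AutoCausalLM

print(causal_cls.__name__, experimental_cls.__name__)  # -> HFLM AutoCausalLM
```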
......@@ -55,28 +55,8 @@ class HFLM(BaseLM):
trust_remote_code=trust_remote_code,
)
assert isinstance(
self.tokenizer,
(
transformers.GPT2Tokenizer,
transformers.GPT2TokenizerFast,
transformers.T5Tokenizer,
transformers.T5TokenizerFast,
),
), "this tokenizer has not been checked for compatibility yet!"
self.vocab_size = self.tokenizer.vocab_size
if isinstance(
self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
):
assert self.tokenizer.encode("hello\n\nhello") == [
31373,
198,
198,
31373,
], self.tokenizer.encode("hello\n\nhello")
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
......
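The deleted block above hard-coded a GPT-2/T5 tokenizer whitelist plus a BPE sanity check; for context, the check it performed is reproduced standalone below (illustrative only; it downloads the GPT-2 tokenizer):
```python
import transformers

# The removed assert verified that GPT-2 BPE keeps "hello\n\nhello" as four
# tokens (31373, 198, 198, 31373), i.e. the two newlines are neither merged
# nor stripped, so whitespace-sensitive prompts tokenize as expected.
tok = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
assert tok.encode("hello\n\nhello") == [31373, 198, 198, 31373]
```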
......@@ -361,7 +361,7 @@ class HuggingFaceAutoLM(BaseLM):
def tok_decode(self, tokens: torch.LongTensor) -> List[str]:
return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
def greedy_until(self, requests: List[Tuple[str, dict]]) -> List[str]:
def greedy_until(self, requests: List[Tuple[str, Union[List[str], str]]]) -> List[str]:
def _collate(x):
tokens = self.tok_encode(x[0])
return len(tokens), x[0]
......@@ -373,18 +373,16 @@ class HuggingFaceAutoLM(BaseLM):
):
context = [c[0] for c in chunk]
request_args = chunk[0][1]
stop_sequences = request_args["stop_sequences"]
max_generation_length = request_args["max_generation_length"]
num_fewshot = request_args["num_fewshot"]
stop_sequences = request_args if isinstance(request_args, list) else [request_args] # request_args["stop_sequences"]
max_generation_length = self._max_gen_toks # request_args["max_generation_length"]
assert (
isinstance(max_generation_length, int) or max_generation_length is None
)
assert isinstance(stop_sequences, list) or stop_sequences is None
assert isinstance(num_fewshot, int) or num_fewshot is None
# TODO: Find a better way to handle stop sequences for 0-shot.
if stop_sequences is None or num_fewshot == 0:
if stop_sequences is None:
until = [self.eot_token]
else:
until = stop_sequences + [self.eot_token]
......
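The net effect of the two hunks above: each request's second element is now the stop sequence(s) themselves (a string or a list of strings) rather than a dict of `request_args`, and the generation cap comes from the model's own `_max_gen_toks`. A standalone sketch of the new handling, with hypothetical prompts and stand-in values:
```python
# Hypothetical stand-ins for the model attributes used inside greedy_until.
eot_token = "<|endoftext|>"   # assumption: the model's end-of-text token
max_gen_toks = 256            # assumption: stands in for self._max_gen_toks

# New request shape: (context, stop) where stop is a str or a list of strs.
requests = [
    ("Q: What did the dog chase?\nA:", "\n"),
    ("Translate to French: good morning", ["\n", "."]),
]

for context, request_args in requests:
    stop_sequences = request_args if isinstance(request_args, list) else [request_args]
    max_generation_length = max_gen_toks  # no longer read from request_args
    until = stop_sequences + [eot_token]
    print(repr(context), "->", until, max_generation_length)
```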
......@@ -5,7 +5,9 @@ import collections
import functools
import inspect
import sys
from typing import List
from typing import List, Union
import torch
from omegaconf import OmegaConf
......@@ -116,6 +118,26 @@ def make_disjoint_window(pair):
return a[: len(a) - (len(b) - 1)], b
def select_continuation_from_batch_left_padding(
generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
"""Select the continuation from the batch, removing prompts of different lengths.
Args:
generations (Union[List[List[int]], torch.Tensor]):
A tensor or list-of-lists of shape [batch_size, sequence length].
max_context_size (int):
The size of the biggest context; generations will proceed from that
index.
Example:
PAD PAD Continue : The dog chased the cat [every day of the week]
Riddle me this : The dog chased the cat [yesterday] PAD PAD PAD PAD
Output:
[every day of the week]
[yesterday] PAD PAD PAD PAD
"""
return generations[:, max_context_size:]
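A usage sketch for the new helper (values made up for illustration): with left-padded prompts, every continuation starts at the same column, so a single slice recovers the generations for the whole batch.
```python
import torch

PAD = 0
# Two prompts of lengths 2 and 4, left-padded to width 4, followed by generations.
batch = torch.tensor([
    [PAD, PAD, 11, 12, 21, 22, 23],   # short prompt, 3 generated tokens
    [13, 14, 15, 16, 31, PAD, PAD],   # long prompt, 1 generated token + right padding
])
max_context_size = 4  # length of the longest prompt in the batch

print(select_continuation_from_batch_left_padding(batch, max_context_size))
# tensor([[21, 22, 23],
#         [31,  0,  0]])
```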
class Reorderer:
def __init__(self, arr, fn):
self.size = len(arr)
......@@ -201,3 +223,4 @@ def run_task_tests(task_list: List[str]):
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)