Merge pull request #582 from EleutherAI/fix-max-len

Fix seqlen issues for bloom, remove extraneous OPT tokenizer check

Merge pull request #582 from EleutherAI/fix-max-len
Fix seqlen issues for bloom, remove extraneous OPT tokenizer check
4c08d72a · Stella Biderman · GitHub · f862a118 · fa43ab2e · 4c08d72a
Unverified commit 4c08d72a, authored Jun 13, 2023 by Stella Biderman; committed by GitHub on Jun 13, 2023
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 18 additions and 15 deletions

lm_eval/models/gpt2.py → lm_eval/models/gpt2.py (+18 -15)

No files found.
--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -17,6 +17,9 @@ def _get_dtype(


 class HFLM(BaseLM):
+
+    _DEFAULT_MAX_LENGTH = 2048
+
    def __init__(
        self,
        device="cuda",
@@ -26,6 +29,7 @@ class HFLM(BaseLM):
        subfolder=None,
        tokenizer=None,
        batch_size=1,
+	max_length=None,
        load_in_8bit: Optional[bool] = False,
        trust_remote_code: Optional[bool] = False,
        dtype: Optional[Union[str, torch.dtype]]="auto",
@@ -72,22 +76,14 @@ class HFLM(BaseLM):

        self.vocab_size = self.tokenizer.vocab_size

-        if isinstance(
-            self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
-        ):
-            assert self.tokenizer.encode("hello\n\nhello") == [
-                31373,
-                198,
-                198,
-                31373,
-            ], self.tokenizer.encode("hello\n\nhello")
-
        # setup for automatic batch size detection
        if batch_size == "auto":
            self.batch_size_per_gpu = batch_size
        else:
            self.batch_size_per_gpu = int(batch_size)

+        self._max_length = max_length
+
    @property
    def eot_token_id(self):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
@@ -95,11 +91,18 @@ class HFLM(BaseLM):

    @property
    def max_length(self):
-        try:
-            return self.gpt2.config.n_ctx
-        except AttributeError:
-            # gptneoconfig doesn't have n_ctx apparently
-            return self.gpt2.config.max_position_embeddings
+        if self._max_length: # if max length manually set, return it
+            return self._max_length
+        seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
+        for attr in seqlen_config_attrs:
+            if hasattr(self.gpt2.config, attr):
+                return getattr(self.gpt2.config, attr)
+        if hasattr(self.tokenizer, "model_max_length"):
+            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+                return self._DEFAULT_MAX_LENGTH
+            return self.tokenizer.model_max_length
+        return self._DEFAULT_MAX_LENGTH
+

    @property
    def max_gen_toks(self):