Commit 359114fd authored by Leo Gao

LM: handle empty context

parent 5d56a47d
@@ -15,7 +15,8 @@ class LM(abc.ABC):
         :param requests: list
             A list of pairs (context, continuation)
             context: str
-                Context string
+                Context string. Implementations of LM must be able to handle an
+                empty context string.
             continuation: str
                 The continuation over which log likelihood will be calculated. If
                 there is a word boundary, the space should be in the continuation.
...
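For reference, a minimal sketch of what the amended docstring requires of implementers (not code from this commit; names like DummyLM and the per-character tokenizer are hypothetical): an LM subclass must accept an empty context string and substitute something sensible rather than failing.

import abc


class LM(abc.ABC):
    @abc.abstractmethod
    def loglikelihood(self, requests):
        pass


class DummyLM(LM):
    # Hypothetical toy implementation used only to illustrate the contract;
    # it mirrors the GPT2LM change below by falling back to a single
    # end-of-text token id when the context is empty.
    EOT_TOKEN_ID = 50256

    def encode(self, text):
        # Stand-in tokenizer: one "token" per character.
        return [ord(c) for c in text]

    def loglikelihood(self, requests):
        results = []
        for context, continuation in requests:
            context_enc = [self.EOT_TOKEN_ID] if context == "" else self.encode(context)
            continuation_enc = self.encode(continuation)
            # A real model would score continuation_enc conditioned on context_enc;
            # here we return a dummy (log-likelihood, is_greedy) pair.
            results.append((-float(len(continuation_enc)), False))
        return results


print(DummyLM().loglikelihood([("", "test"), ("hello", " world")]))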
@@ -24,7 +24,13 @@ class GPT2LM(LM):
         # TODO: vectorize properly
         for context, continuation in tqdm(requests):
             # when too long to fit in context, truncate from the left
-            context_enc = self.tokenizer.encode(context)
+            if context == "":
+                # end of text as context
+                context_enc = [50256]
+            else:
+                context_enc = self.tokenizer.encode(context)
             continuation_enc = self.tokenizer.encode(continuation)
             inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
             ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
...
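As a side note on the truncation arithmetic above (a standalone check, not code from this commit; the helper name is hypothetical): with the new branch an empty context becomes the single token [50256], so ctxlen is 1 and nothing is cut unless context plus continuation exceed the 1024-token window.

def ctxlen_after_truncation(context_len, continuation_len, max_length=1024):
    # Mirrors: ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
    return context_len - max(0, context_len + continuation_len - max_length)


# Empty context replaced by the single end-of-text token -> ctxlen stays 1.
print(ctxlen_after_truncation(1, 10))      # 1
# Overlong context: 2000 + 10 tokens overflow 1024 by 986, all cut from the context side.
print(ctxlen_after_truncation(2000, 10))   # 1014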
@@ -72,7 +72,12 @@ class GPT3LM(LM):
             inps = []
             ctxlens = []
             for context, continuation in chunk:
-                context_enc = self.tokenizer.encode(context)
+                if context == "":
+                    # end of text as context
+                    context_enc = [50256]
+                else:
+                    context_enc = self.tokenizer.encode(context)
                 continuation_enc = self.tokenizer.encode(continuation)
                 inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
                 ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
...
@@ -11,3 +11,5 @@ def test_gpt2():
     assert ll_dog > ll_cat
     assert not ig_cat
+    # test empty context
+    gpt2.loglikelihood([('', 'test')])
\ No newline at end of file