Fix stuff and make tests pass

c971fa82 · Leo Gao · 0966e7b6 · c971fa82 · c971fa82 · c971fa82
Commit c971fa82 authored Mar 26, 2021 by Leo Gao
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 6 deletions

lm_eval/models/gpt2.py +7 -3

lm_eval/tasks/glue.py +2 -2

tests/test_evaluator.py +1 -1

No files found.
--- a/lm_eval/models/gpt2.py
+++ b/lm_eval/models/gpt2.py
@@ -60,12 +60,16 @@ class GPT2LM(LM):
                greedy_tokens = logits.argmax(dim=-1)
                max_equal = (greedy_tokens == cont_toks).all()

-                logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
+                last_token_slice = logits[:, -1, :].squeeze(0).tolist()

+                logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]

-                res.append((float(logits.sum()), bool(max_equal)))
+                res.append((float(logits[:, :-1].sum() if logits.shape[-1] > 1 else 0), last_token_slice, bool(max_equal)))

-        return reord.get_original(res)
+        # optimization: if two requests have everything the same except the last token, use 
+        # last token distribution to save compute
+        lasttoks = [self.tokenizer.encode(x[1])[-1] for x in requests]
+        return [(l + lts[lasttok], m) for (l, lts, m), lasttok in zip(reord.get_original(res), lasttoks)]
    
    def greedy_until(self, requests):
        # TODO: implement fully general `until` that handles untils that are 

--- a/lm_eval/tasks/glue.py
+++ b/lm_eval/tasks/glue.py
@@ -334,7 +334,7 @@ class MRPC(HFTask):
        return True

    def has_test_docs(self):
-        return True
+        return False

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."
@@ -386,7 +386,7 @@ class QQP(HFTask):
        return True

    def has_test_docs(self):
-        return True
+        return False

    def fewshot_description(self):
        return "Indicate if both questions ask the same thing."

--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -29,4 +29,4 @@ def test_evaluator(taskname, Task):
        

    lm.loglikelihood = ll_fn
-    evaluator.evaluate(lm, task_dict, False, 0, 10)
\ No newline at end of file
+    evaluator.evaluate(lm, task_dict, False, 0, 3)
\ No newline at end of file