Commit 1e7f884d authored by Leo Gao

Refactor PerplexityTask

parent b0cf0163
 import abc
 import random
 import numpy as np
+import re
-from lm_eval.metrics import mean
+from lm_eval.metrics import mean, perplexity, weighted_mean

 class LM(abc.ABC):
@@ -307,14 +308,17 @@ class PerplexityTask(Task, abc.ABC):
         return ""

     def higher_is_better(self):
-        return False
+        return {
+            "word_perplexity": False,
+            "byte_perplexity": False,
+            "bits_per_byte": False,
+        }

     def doc_to_text(self, doc):
         return doc

     def doc_to_target(self, doc):
-        raise NotImplementedError()
+        return doc

     def construct_requests(self, doc, ctx):
         assert not ctx
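For orientation, a downstream perplexity task now only has to supply documents; the text/target mapping and metrics above come from PerplexityTask. A minimal, hypothetical subclass sketch (the class name, file path, and loader are invented for illustration, and the remaining Task hooks are omitted):

```python
from lm_eval.base import PerplexityTask


class LocalCorpusPerplexity(PerplexityTask):
    """Hypothetical task: perplexity over newline-separated documents in a local file."""

    def has_validation_docs(self):
        return True

    def validation_docs(self):
        # Each doc is a plain string; PerplexityTask.doc_to_target returns it
        # unchanged, and process_results normalizes by its word/byte counts.
        with open("corpus.txt") as f:  # assumed path, illustration only
            for line in f:
                yield line.rstrip("\n")
```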
@@ -324,20 +328,26 @@ class PerplexityTask(Task, abc.ABC):
     def process_results(self, doc, results):
         loglikelihood, = results
         return {
-            "perplexity": loglikelihood,
+            "word_perplexity": loglikelihood / self.count_words(self.doc_to_text(doc)),
+            "byte_perplexity": loglikelihood / self.count_bytes(self.doc_to_text(doc)),
+            "bits_per_byte": (-loglikelihood, self.count_bytes(self.doc_to_text(doc)))
         }

     def aggregation(self):
         return {
-            "perplexity": self.compute_perplexity_from_loglikelihood,
+            "word_perplexity": perplexity,
+            "byte_perplexity": perplexity,
+            "bits_per_byte": weighted_mean
         }

-    @classmethod
-    def compute_perplexity_from_loglikelihood(cls, loglikelihoods):
-        aggregate_logprobs = np.concatenate(loglikelihoods)
-        perplexity = np.exp(-aggregate_logprobs.mean())
-        return float(perplexity)
+    def count_bytes(self, s):
+        return len(s.encode("utf-8"))
+
+    def count_words(self, s):
+        """ Downstream tasks with custom word boundaries should override this! """
+        return len(re.split(r"\s+", s))

 req_ret_lens = {
     'loglikelihood': 2,
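To see what the new process_results / aggregation pair computes, here is a standalone sketch of the same arithmetic outside the harness (plain functions mirroring the methods above; the document and loglikelihood value are invented for illustration):

```python
import math
import re

def count_bytes(s):
    return len(s.encode("utf-8"))

def count_words(s):
    return len(re.split(r"\s+", s))

doc = "the quick brown fox jumps over the lazy dog"
loglikelihood = -40.0  # pretend total loglikelihood (in nats) the model assigns to doc

# Per-document values, as returned by process_results:
word_item = loglikelihood / count_words(doc)    # average logprob per word
byte_item = loglikelihood / count_bytes(doc)    # average logprob per byte
bpb_item = (-loglikelihood, count_bytes(doc))   # (total negative logprob, byte count)

# Corpus-level aggregation, as in lm_eval.metrics (shown here for a one-document corpus):
word_perplexity = math.exp(-word_item)          # perplexity = exp(-mean(items))
bits_per_byte = bpb_item[0] / bpb_item[1]       # weighted_mean = sum(a) / sum(b)
print(word_perplexity, byte_item, bits_per_byte)
```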
@@ -62,6 +62,11 @@ def perplexity(items):
     return math.exp(-mean(items))

+def weighted_mean(items):
+    a, b = zip(*items)
+    return sum(a) / sum(b)

 def bleu(items):
     """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
     for evaluating a generated sentence to a reference sentence. It counts matching
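The point of weighted_mean is that each pair's numerator and weight are summed before dividing, so longer documents contribute proportionally more than a plain mean of per-document ratios would. A quick check of that behaviour:

```python
def weighted_mean(items):
    a, b = zip(*items)
    return sum(a) / sum(b)

# Two documents: 12.0 total nats over 6 bytes, and 2.0 total nats over 4 bytes.
items = [(12.0, 6), (2.0, 4)]
assert weighted_mean(items) == 14.0 / 10   # 1.4 overall
assert (12.0 / 6 + 2.0 / 4) / 2 == 1.25    # naive per-document mean differs
```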
@@ -60,6 +60,7 @@ class GPT2LM(LM):
         with torch.no_grad():
             for string, in tqdm(requests):
                 encoded = self.tokenizer.encode_plus(string)["input_ids"]

                 rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
                     token_list=encoded,
                     prefix_token=self.EOT_TOKEN_ID,
@@ -67,9 +68,9 @@ class GPT2LM(LM):
                     context_len=1,
                 )))

-                # todo: figure out partial caching
                 rolling_token_windows = [(None,) + x for x in rolling_token_windows]

+                # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
                 string_nll = self._loglikelihood_tokens(rolling_token_windows)

                 # discard is_greedy
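The rolling-window helpers called above chop a tokenized document into (context, continuation) chunks that fit the model's window, with every token scored exactly once; the harness then sums the per-window loglikelihoods into the per-document total that process_results consumes. The real logic lives in lm_eval.utils (get_rolling_token_windows plus make_disjoint_window); the following is only a simplified, self-contained sketch of the idea, with invented parameter values:

```python
def rolling_windows(tokens, prefix_token, max_seq_len, context_len=1):
    """Yield (context, continuation) pairs that cover `tokens` exactly once.

    The first window conditions only on `prefix_token`; later windows keep
    `context_len` tokens of left context so every scored token has something
    to condition on, while the continuations stay disjoint.
    """
    assert max_seq_len > context_len
    windows = []
    pos = 0
    while pos < len(tokens):
        if pos == 0:
            context = [prefix_token]
            continuation = tokens[:max_seq_len]
        else:
            context = tokens[pos - context_len:pos]
            continuation = tokens[pos:pos + max_seq_len - context_len]
        windows.append((context, continuation))
        pos += len(continuation)
    return windows


# Toy example: a 7-token document and a model limited to 4 positions.
print(rolling_windows(list(range(7)), prefix_token=-1, max_seq_len=4))
# [([-1], [0, 1, 2, 3]), ([3], [4, 5, 6])]
```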