Count perplexity correctly

b691c44f · Leo Gao · 9c4967bc · b691c44f · b691c44f
Commit b691c44f authored May 11, 2021 by Leo Gao
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 10 additions and 5 deletions

lm_eval/base.py +7 -5

lm_eval/metrics.py +3 -0

No files found.
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -3,7 +3,7 @@ import random
 import numpy as np
 import re
-from lm_eval.metrics import mean, perplexity, weighted_mean
+from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
 class LM(abc.ABC):
@@ -327,16 +327,18 @@ class PerplexityTask(Task, abc.ABC):
    def process_results(self, doc, results):
        loglikelihood, = results
+        words = self.count_words(self.doc_to_text(doc))
+        bytes = self.count_bytes(self.doc_to_text(doc))
        return {
-            "word_perplexity": loglikelihood / self.count_words(self.doc_to_text(doc)),
+            "word_perplexity": (loglikelihood, words),
-            "byte_perplexity": loglikelihood / self.count_bytes(self.doc_to_text(doc)),
+            "byte_perplexity": (loglikelihood, bytes),
            "bits_per_byte": (-loglikelihood, self.count_bytes(self.doc_to_text(doc)))
        }
    def aggregation(self):
        return {
-            "word_perplexity": perplexity,
+            "word_perplexity": weighted_perplexity,
-            "byte_perplexity": perplexity,
+            "byte_perplexity": weighted_perplexity,
            "bits_per_byte": weighted_mean
        }

--- a/lm_eval/metrics.py
+++ b/lm_eval/metrics.py
@@ -98,6 +98,9 @@ def weighted_mean(items):
    a, b = zip(*items)
    return sum(a) / sum(b)
+def weighted_perplexity(items):
+    return math.exp(-weighted_mean(items))
 def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric