Commit 38360512 authored by Igor Ostrovsky

Fix bits_per_byte metric in PerplexityTask

bits_per_byte was calculated as the average per-byte loglikelihood, which would be correct if loglikelihood were a base-2 log, but it is a natural log. To correct for that, the value should be divided by math.log(2).

Also, it should hold that 2^bits_per_byte == byte_perplexity, which is the case after the fix.
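
As a quick sanity check of the nats-to-bits conversion and the identity above, here is a minimal sketch; the (loglikelihood, byte count) pairs are made up, and the local weighted_mean mirrors the harness's aggregation over such pairs:

    import math

    # Per-document (natural-log likelihood, byte count) pairs; values are illustrative.
    items = [(-45.2, 120), (-80.1, 200)]

    def weighted_mean(items):
        a, b = zip(*items)
        return sum(a) / sum(b)

    avg_nll_per_byte = -weighted_mean(items)         # average NLL per byte, in nats
    byte_perplexity = math.exp(avg_nll_per_byte)     # e^(nats per byte)
    bits_per_byte = avg_nll_per_byte / math.log(2)   # convert nats -> bits

    # The identity the fix restores: 2^bits_per_byte == byte_perplexity
    assert math.isclose(2 ** bits_per_byte, byte_perplexity)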
parent df5d7cf0
@@ -10,7 +10,7 @@ from tqdm import tqdm
 import torch
 import torch.nn.functional as F
-from lm_eval.metrics import mean, weighted_perplexity, weighted_mean
+from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
 from lm_eval import utils
 from abc import abstractmethod
@@ -560,14 +560,14 @@ class PerplexityTask(Task, abc.ABC):
         return {
             "word_perplexity": (loglikelihood, words),
             "byte_perplexity": (loglikelihood, bytes_),
-            "bits_per_byte": (-loglikelihood, self.count_bytes(doc))
+            "bits_per_byte": (loglikelihood, bytes_),
         }

     def aggregation(self):
         return {
             "word_perplexity": weighted_perplexity,
             "byte_perplexity": weighted_perplexity,
-            "bits_per_byte": weighted_mean
+            "bits_per_byte": bits_per_byte,
         }

     @classmethod
...
@@ -102,6 +102,9 @@ def weighted_mean(items):
 def weighted_perplexity(items):
     return math.exp(-weighted_mean(items))

+def bits_per_byte(items):
+    return -weighted_mean(items) / math.log(2)
+
 def bleu(items):
     """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
...
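
After the patch, byte_perplexity and bits_per_byte aggregate the same (loglikelihood, byte count) pairs; a minimal usage sketch against the updated lm_eval.metrics (the item values are illustrative):

    import math
    from lm_eval.metrics import weighted_perplexity, bits_per_byte

    # Per-document (natural-log likelihood, byte count) pairs; values are illustrative.
    items = [(-45.2, 120), (-80.1, 200)]

    byte_ppl = weighted_perplexity(items)  # exp(average NLL per byte)
    bpb = bits_per_byte(items)             # average NLL per byte, in bits

    assert math.isclose(2 ** bpb, byte_ppl)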