Commit 38360512 authored by Igor Ostrovsky

Fix bits_per_byte metric in PerplexityTask

bits_per_byte was calculated as the average per-byte loglikelihood, which would be correct if the loglikelihood were a base-2 logarithm, but it is a natural logarithm. To correct for that, the value should be divided by math.log(2).

It should also hold that 2^bits_per_byte == byte_perplexity, and after the fix it does.
parent df5d7cf0
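
As a quick sanity check of that identity (not part of the commit), here is a minimal sketch for a single document, using made-up numbers:

import math

loglikelihood = -123.4  # hypothetical natural-log likelihood of one document
bytes_ = 100            # hypothetical byte count of that document

byte_perplexity = math.exp(-loglikelihood / bytes_)
bits_per_byte = -loglikelihood / bytes_ / math.log(2)

# Dividing the per-byte nats by math.log(2) converts them to bits, so
# 2^bits_per_byte recovers exactly e^(nats per byte) == byte_perplexity.
assert math.isclose(2 ** bits_per_byte, byte_perplexity)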
@@ -10,7 +10,7 @@ from tqdm import tqdm
 import torch
 import torch.nn.functional as F
-from lm_eval.metrics import mean, weighted_perplexity, weighted_mean
+from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
 from lm_eval import utils
 from abc import abstractmethod
@@ -560,14 +560,14 @@ class PerplexityTask(Task, abc.ABC):
         return {
             "word_perplexity": (loglikelihood, words),
             "byte_perplexity": (loglikelihood, bytes_),
-            "bits_per_byte": (-loglikelihood, self.count_bytes(doc))
+            "bits_per_byte": (loglikelihood, bytes_),
         }

     def aggregation(self):
         return {
             "word_perplexity": weighted_perplexity,
             "byte_perplexity": weighted_perplexity,
-            "bits_per_byte": weighted_mean
+            "bits_per_byte": bits_per_byte,
         }

     @classmethod
@@ -102,6 +102,9 @@ def weighted_mean(items):

 def weighted_perplexity(items):
     return math.exp(-weighted_mean(items))

+def bits_per_byte(items):
+    return -weighted_mean(items) / math.log(2)
+
 def bleu(items):
     """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
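
Taken together, process_results now emits a (loglikelihood, byte_count) pair for bits_per_byte and aggregation routes it through the new metric. A minimal sketch of the aggregate round trip, assuming weighted_mean divides the summed loglikelihoods by the summed weights (its body is outside this diff), with made-up numbers:

import math

def weighted_mean(items):
    # Assumed definition: total loglikelihood over total byte count.
    a, b = zip(*items)
    return sum(a) / sum(b)

def weighted_perplexity(items):
    return math.exp(-weighted_mean(items))

def bits_per_byte(items):
    return -weighted_mean(items) / math.log(2)

# Made-up (loglikelihood, byte_count) pairs, as process_results would emit.
items = [(-123.4, 100), (-56.7, 40)]

# Before the fix, the aggregate was -weighted_mean(items): nats per byte,
# not bits. With the fix, the commit-message identity holds:
assert math.isclose(2 ** bits_per_byte(items), weighted_perplexity(items))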