Commit 1fb90b91

metrics file

parent f7992789
env
*.pyc
data/
.idea
\ No newline at end of file
import abc
import random
import numpy as np
import sklearn
import math
from lm_eval.metrics import mean
class LM(abc.ABC):
@@ -30,6 +30,7 @@ class LM(abc.ABC):
"""
pass
# TODO: Add an optional max length
@abc.abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
@@ -61,6 +62,14 @@
class Task(abc.ABC):
"""A task represents an entire benchmark including its dataset, problems,
answers, and evaluation methods. See BoolQ for a simple example implementation.
A `doc` can be any python object which represents one instance of evaluation.
This is usually a dictionary, e.g.
{"question": ..., "answer": ...}, or a tuple, e.g.
(question, answer).
"""
def __init__(self):
self.download()
self._training_docs = None
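# Editor's illustration (not part of this commit): for a yes/no task such as
# BoolQ, a single `doc` might be a dictionary along these lines; the exact
# field names are hypothetical.
_example_doc = {
    "question": "Is the sky blue?",
    "passage": "The sky appears blue because of Rayleigh scattering.",
    "answer": True,
}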
@@ -148,9 +157,9 @@ class Task(abc.ABC):
@abc.abstractmethod
def aggregation(self):
"""
:returns: {str: [float] -> float}
:returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
functions that aggregate a list of metric scores
"""
pass
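# Editor's sketch (not part of this commit): a concrete Task's aggregation()
# could return a mapping from submetric name to an aggregation function taken
# from the new lm_eval.metrics module introduced by this commit.
from lm_eval.metrics import mean, f1_score

_example_aggregation = {
    "acc": mean,      # aggregates a list of 0/1 accuracy scores into a mean
    "f1": f1_score,   # aggregates a list of (gold, pred) pairs into an F1 score
}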
@@ -213,60 +222,6 @@ class MultipleChoiceTask(Task):
}
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def perplexity(items):
return math.exp(-mean(items))
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
......
import math
import numpy as np
import sacrebleu
import sklearn
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
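# Editor's illustration (not part of this commit): both helpers above expect
# `items` to be a list of (gold, pred) pairs accumulated across documents.
_f1_items = [(1, 1), (0, 1), (1, 1), (0, 0)]
# f1_score(_f1_items) -> sklearn F1 over the unzipped golds and preds
# matthews_corrcoef(_f1_items) -> Matthews correlation over the same pairs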
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
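# Editor's illustration (not part of this commit): acc_all takes (pred, doc)
# items, groups them by doc["idx"]["question"], and only credits a question
# when every one of its answers is predicted correctly.
_acc_all_items = [
    (True,  {"idx": {"question": 0}, "label": 1}),
    (True,  {"idx": {"question": 0}, "label": 1}),
    (False, {"idx": {"question": 1}, "label": 1}),
]
# acc_all(_acc_all_items) -> 0.5 (question 0 fully correct, question 1 not)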
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
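# Editor's illustration (not part of this commit): typical use is scoring one
# prediction against several acceptable answers; `_exact_match` is a
# hypothetical metric_fn.
def _exact_match(prediction, ground_truth):
    return float(prediction.strip().lower() == ground_truth.strip().lower())

# metric_max_over_ground_truths(_exact_match, "Paris", ["paris", "Paris, France"]) -> 1.0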
def perplexity(items):
return math.exp(-mean(items))
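# Editor's illustration (not part of this commit): `items` are log-likelihoods
# (e.g. one per example), so exp(-mean) recovers the perplexity.
_loglikelihoods = [-0.105, -2.302, -0.693]
# perplexity(_loglikelihoods) == math.exp(1.033...) ~= 2.81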
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
pass
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better # TODO I think
"""
pass
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
pass
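# Editor's sketch (not part of this commit): the three stubs above could be
# filled in with sacrebleu's corpus-level scorers. This assumes each item is a
# (prediction, reference) pair of plain strings, and a sacrebleu version whose
# corpus_chrf / corpus_ter take references in the same list-of-reference-streams
# shape as corpus_bleu; both the item layout and the version are assumptions.
def _split_items(items):
    preds = [pred for pred, _ in items]
    refs = [[ref for _, ref in items]]  # a single reference stream
    return preds, refs

def _bleu_sketch(items):
    preds, refs = _split_items(items)
    return sacrebleu.corpus_bleu(preds, refs).score  # higher is better

def _chrf_sketch(items):
    preds, refs = _split_items(items)
    return sacrebleu.corpus_chrf(preds, refs).score  # higher is better

def _ter_sketch(items):
    preds, refs = _split_items(items)
    return sacrebleu.corpus_ter(preds, refs).score  # lower is better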
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
class ANLIBase(HFTask):
......
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
......
@@ -2,7 +2,8 @@ import abc
import json
import os
from collections import namedtuple
from lm_eval.base import Task, mean, rf
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
......
import datasets
import numpy as np
import lm_eval.metrics
from ..base import Task
@@ -44,7 +46,7 @@ class HFTask(Task):
def simple_accuracy_metric(preds, golds):
acc = float((np.array(preds) == np.array(golds)).mean())
acc = float(lm_eval.metrics.mean())
return {
"major": acc,
"minor": {"acc": acc},
......
import numpy as np
from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from lm_eval.base import rf
from ..metrics import mean, matthews_corrcoef, f1_score
from scipy.stats import pearsonr, spearmanr
from tqdm import auto as tqdm_lib
from . common import HFTask, yesno
......
from lm_eval.base import Task, rf, mean, perplexity
from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh
import json
import math
......
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
......
@@ -2,7 +2,8 @@ import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
class Pubmed_QA(HFTask):
......
import os
import numpy as np
from best_download import download_file
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from lm_eval.metrics import mean
import xml.etree.ElementTree as ET
import random
......
import collections
import datasets
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
import os
......
import json
import random
import os
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
......
import os
import json
from ..utils import sh
from lm_eval.base import MultipleChoiceTask, rf, mean
from lm_eval.base import MultipleChoiceTask, rf
from ..metrics import mean
import zipfile
from best_download import download_file
......
@@ -5,7 +5,8 @@ To-do:
"""
import numpy as np
from . common import HFTask, yesno
from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
from lm_eval.base import rf
from ..metrics import mean, acc_all, metric_max_over_ground_truths
import sklearn
import transformers.data.metrics.squad_metrics as squad_metrics
from ..utils import general_detokenize
......
import os
import json
import random
from lm_eval.base import Task, mean, rf
from lm_eval.base import Task, rf
from ..metrics import mean
from ..utils import sh
class TriviaQA(Task):
......
from . common import HFTask
from lm_eval.base import mean, rf
from lm_eval.base import rf
from ..metrics import mean
class WebQs(HFTask):
DATASET_PATH = "web_questions"
......
import numpy as np
from . common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
"""
This evaluation of Winogrande uses partial evaluation as described by
......
import numpy as np
import random
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
"""
......