"include/vscode:/vscode.git/clone" did not exist on "43c92cf3689fc7e78198cf156364c5520c6a6eb6"
Commit 1fb90b91

metrics file

parent f7992789
 env
 *.pyc
 data/
+.idea
\ No newline at end of file
 import abc
 import random
 import numpy as np
-import sklearn
-import math
+from lm_eval.metrics import mean
 class LM(abc.ABC):
@@ -30,6 +30,7 @@ class LM(abc.ABC):
         """
         pass
+    # TODO: Add an optional max length
     @abc.abstractmethod
     def greedy_until(self, requests):
         """Generate greedily until a stopping sequence
@@ -61,6 +62,14 @@ class LM(abc.ABC):
 class Task(abc.ABC):
"""A task represents an entire benchmark including its dataset, problems,
answers, and evaluation methods. See BoolQ for a simple example implementation
A `doc` can be any python object which represents one instance of evaluation.
This is usually a dictionary e.g.
{"question": ..., "answer": ...} or
{"question": ..., question, answer)
"""
     def __init__(self):
         self.download()
         self._training_docs = None
@@ -148,9 +157,9 @@ class Task(abc.ABC):
     @abc.abstractmethod
     def aggregation(self):
         """
-        :returns: {str: [float] -> float}
+        :returns: {str: [metric_score] -> float}
         A dictionary where keys are the names of submetrics and values are
-        functions that aggregate a list of metrics
+        functions that aggregate a list of metric scores
         """
         pass
@@ -213,60 +222,6 @@ class MultipleChoiceTask(Task):
     }
-def mean(arr):
-    return sum(arr) / len(arr)
-
-def median(arr):
-    return arr[len(arr) // 2]
-
-def matthews_corrcoef(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    return sklearn.metrics.matthews_corrcoef(golds, preds)
-
-def f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = sklearn.metrics.f1_score(golds, preds)
-    return np.max(fscore)
-
-def acc_all(items):
-    # Only count as correct if all answers are labeled correctly for each question
-    question_scoring_dict = {}
-    preds = list(zip(*items))[0]
-    docs = list(zip(*items))[1]
-    for doc, pred in zip(docs, preds):
-        question_id = doc["idx"]["question"]
-        if question_id not in question_scoring_dict:
-            question_scoring_dict[question_id] = []
-        gold_label = doc["label"] == 1
-        question_scoring_dict[question_id].append(gold_label == pred)
-    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
-    return acc
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-    """Compute max metric between prediction and each ground truth."""
-    scores_for_ground_truths = []
-    for ground_truth in ground_truths:
-        score = metric_fn(prediction, ground_truth)
-        scores_for_ground_truths.append(score)
-    return max(scores_for_ground_truths)
-
-def perplexity(items):
-    return math.exp(-mean(items))
 req_ret_lens = {
     'loglikelihood': 2,
     'greedy_until': None,
...
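The aggregation() contract documented above pairs with the new shared metrics module: each submetric name maps to a function that reduces a list of per-document scores to a single float. A minimal sketch, assuming a hypothetical task; the class and metric names below are illustrative and not part of this commit:

from lm_eval.base import Task
from lm_eval.metrics import mean, perplexity

class ExampleTask(Task):
    # Hypothetical task, shown only to illustrate the aggregation() contract.
    def aggregation(self):
        # Keys are submetric names; values reduce a list of per-document
        # metric scores to a single float.
        return {
            "acc": mean,        # average of 0/1 correctness scores
            "ppl": perplexity,  # exp(-mean(loglikelihoods))
        }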
import math
import numpy as np
import sacrebleu
import sklearn

def mean(arr):
    return sum(arr) / len(arr)

def median(arr):
    return arr[len(arr) // 2]

def matthews_corrcoef(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return sklearn.metrics.matthews_corrcoef(golds, preds)

def f1_score(items):
    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = sklearn.metrics.f1_score(golds, preds)
    return np.max(fscore)

def acc_all(items):
    # Only count as correct if all answers are labeled correctly for each question
    question_scoring_dict = {}
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    for doc, pred in zip(docs, preds):
        question_id = doc["idx"]["question"]
        if question_id not in question_scoring_dict:
            question_scoring_dict[question_id] = []
        gold_label = doc["label"] == 1
        question_scoring_dict[question_id].append(gold_label == pred)
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc

def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute max metric between prediction and each ground truth."""
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)

def perplexity(items):
    return math.exp(-mean(items))

def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence against a reference sentence. It counts matching
    n-grams in the candidate translation to n-grams in the reference text, where
    1-gram or unigram would be each token and a bigram comparison would be each
    word pair. The comparison is made regardless of word order.
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/
    Higher is better
    """
    preds = list(zip(*items))[0]
    docs = list(zip(*items))[1]
    pass

def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf
    Higher is better  # TODO I think
    """
    pass

def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references.
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
    Lower is better
    """
    pass
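The bleu, chrf, and ter functions above are left as stubs (pass) in this commit. A possible completion using the already-imported sacrebleu package is sketched below; the (prediction, reference) item layout and the corpus-level sacrebleu calls (list-of-reference-streams convention) are assumptions, not the commit's code:

import sacrebleu

def bleu(items):
    # Assumed layout: each item is a (prediction, reference) pair.
    preds = [pred for pred, _ in items]
    refs = [[ref for _, ref in items]]  # a single reference stream
    return sacrebleu.corpus_bleu(preds, refs).score

def chrf(items):
    preds = [pred for pred, _ in items]
    refs = [[ref for _, ref in items]]
    return sacrebleu.corpus_chrf(preds, refs).score

def ter(items):
    # corpus_ter is only available in newer sacrebleu releases.
    preds = [pred for pred, _ in items]
    refs = [[ref for _, ref in items]]
    return sacrebleu.corpus_ter(preds, refs).score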
 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from . common import HFTask
 class ANLIBase(HFTask):
...
 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from . common import HFTask
...
@@ -2,7 +2,8 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Task, mean, rf
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean
 from best_download import download_file
 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
...
 import datasets
 import numpy as np
+import lm_eval.metrics
 from ..base import Task
@@ -44,7 +46,7 @@ class HFTask(Task):
 def simple_accuracy_metric(preds, golds):
-    acc = float((np.array(preds) == np.array(golds)).mean())
+    acc = float(lm_eval.metrics.mean())
     return {
         "major": acc,
         "minor": {"acc": acc},
...
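For reference, a minimal usage sketch of computing simple accuracy with the shared mean() helper; the sample predictions and golds below are invented for illustration and are not part of the commit:

import numpy as np
from lm_eval.metrics import mean

preds = [1, 0, 1]
golds = [1, 1, 1]
# Averaging the element-wise comparison gives the accuracy (2/3 here).
acc = float(mean((np.array(preds) == np.array(golds)).tolist()))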
 import numpy as np
-from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
+from lm_eval.base import rf
+from ..metrics import mean, matthews_corrcoef, f1_score
 from scipy.stats import pearsonr, spearmanr
 from tqdm import auto as tqdm_lib
 from . common import HFTask, yesno
...
-from lm_eval.base import Task, rf, mean, perplexity
+from lm_eval.base import Task, rf
+from lm_eval.metrics import mean, perplexity
 from lm_eval.utils import sh
 import json
 import math
...
 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from . common import HFTask
...
@@ -2,7 +2,8 @@ import numpy as np
 import json
 import random
 from .common import HFTask
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 class Pubmed_QA(HFTask):
...
 import os
 import numpy as np
 from best_download import download_file
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from lm_eval.metrics import mean
 import xml.etree.ElementTree as ET
 import random
...
 import collections
 import datasets
 import numpy as np
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from . common import HFTask
 import os
...
 import json
 import random
 import os
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from ..metrics import mean
 from tqdm import auto as tqdm_lib
 from . common import simple_accuracy_metric
 import numpy as np
...
 import os
 import json
 from ..utils import sh
-from lm_eval.base import MultipleChoiceTask, rf, mean
+from lm_eval.base import MultipleChoiceTask, rf
+from ..metrics import mean
 import zipfile
 from best_download import download_file
...
@@ -5,7 +5,8 @@ To-do:
 """
 import numpy as np
 from . common import HFTask, yesno
-from lm_eval.base import rf, mean, acc_all, metric_max_over_ground_truths
+from lm_eval.base import rf
+from ..metrics import mean, acc_all, metric_max_over_ground_truths
 import sklearn
 import transformers.data.metrics.squad_metrics as squad_metrics
 from ..utils import general_detokenize
...
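The imports above suggest how metric_max_over_ground_truths is meant to be combined with HuggingFace's squad_metrics. A brief, hypothetical usage sketch (the example strings are invented, not from the commit):

import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.metrics import metric_max_over_ground_truths

prediction = "Paris"
ground_truths = ["Paris", "paris , France"]
# Take the best score of the prediction against each acceptable answer.
em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, ground_truths)
f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, ground_truths)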
 import os
 import json
 import random
-from lm_eval.base import Task, mean, rf
+from lm_eval.base import Task, rf
+from ..metrics import mean
 from ..utils import sh
 class TriviaQA(Task):
...
 from . common import HFTask
-from lm_eval.base import mean, rf
+from lm_eval.base import rf
+from ..metrics import mean
 class WebQs(HFTask):
     DATASET_PATH = "web_questions"
...
 import numpy as np
 from . common import HFTask
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 """
 This evaluation of Winogrande uses partial evaluation as described by
...
 import numpy as np
 import random
-from lm_eval.base import rf, mean
+from lm_eval.base import rf
+from ..metrics import mean
 from . common import HFTask
 """
...