"vscode:/vscode.git/clone" did not exist on "c0de658dac03e3c87c99864e6303efa5a8a1ed33"
Commit 37c3139d authored by thefazzer

Merge remote-tracking branch 'origin/master' into fazz/refactor-task-coqa

parents 79c9b68a 7ad6bf45
env
*.pyc
data/
.idea
lm_cache
\ No newline at end of file
import abc
import random
import numpy as np
import sklearn
import math
from lm_eval.metrics import mean
class LM(abc.ABC):
......@@ -15,7 +15,8 @@ class LM(abc.ABC):
:param requests: list
A list of pairs (context, continuation)
context: str
Context string
Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str
The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation.
......@@ -29,6 +30,7 @@ class LM(abc.ABC):
"""
pass
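# A hedged usage sketch of the contract above; `lm` is assumed to be an already-constructed
# LM instance and the numeric results are illustrative, not real outputs.
requests = [
    ("The capital of France is", " Paris"),  # the word-boundary space lives in the continuation
    ("", " Hello world"),                    # the empty-context case implementations must handle
]
results = lm.loglikelihood(requests)         # e.g. [(-1.2, True), (-34.7, False)] -> (log prob, is_greedy)
for (context, continuation), (log_prob, is_greedy) in zip(requests, results):
    print(repr(continuation), log_prob, is_greedy)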
# TODO: Add an optional max length
@abc.abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
......@@ -37,9 +39,9 @@ class LM(abc.ABC):
A list of pairs (context, until)
context: str
Context string
until: str
The string sequence to generate until. This string sequence may
span across multiple tokens, or may be part of one token.
until: [str]
The string sequences to generate until. These string sequences
may each span across multiple tokens, or may be part of one token.
:return: list
A list of strings continuation
continuation: str
......@@ -60,6 +62,14 @@ class LM(abc.ABC):
class Task(abc.ABC):
"""A task represents an entire benchmark including its dataset, problems,
answers, and evaluation methods. See BoolQ for a simple example implementation
A `doc` can be any python object which represents one instance of evaluation.
This is usually a dictionary e.g.
{"question": ..., "answer": ...} or
{"question": ..., question, answer)
"""
def __init__(self):
self.download()
self._training_docs = None
......@@ -147,9 +157,9 @@ class Task(abc.ABC):
@abc.abstractmethod
def aggregation(self):
"""
:returns: {str: [float] -> float}
:returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
functions that aggregate a list of metric scores
"""
pass
......@@ -212,62 +222,9 @@ class MultipleChoiceTask(Task):
}
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def perplexity(items):
return math.exp(-mean(items))
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
}
import os
......@@ -275,13 +232,9 @@ import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(args):
dat = b""
for arg in args:
assert isinstance(arg, str) or isinstance(arg, int)
dat += str(arg).encode()
dat += b"\0"
return hashlib.sha256(dat).hexdigest()
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
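# For example (illustrative values), the new scheme keys the cache on the JSON of the
# method name plus its arguments, replacing the old attr + '_' + hash_args(req) keys:
key = hash_args('loglikelihood', ("The capital of France is", " Paris"))
# -> a 64-character hex digest; folding `attr` into the JSON payload also lifts the old
#    restriction that every argument be a str or int.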
class CachingLM:
......@@ -298,7 +251,7 @@ class CachingLM:
# figure out which ones are cached and which ones are new
for req in requests:
hsh = attr + '_' + hash_args(req)
hsh = hash_args(attr, req)
if hsh in self.dbdict:
ob = self.dbdict[hsh]
......@@ -320,9 +273,9 @@ class CachingLM:
res[resptr] = r
# caching
hsh = attr + '_' + hash_args(req)
hsh = hash_args(attr, req)
self.dbdict[hsh] = r
self.dbdict.commit()
return res
return fn
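# A hedged usage sketch; the GPT2LM import path and the exact constructor arguments are
# assumptions based on how the harness wires up caching, not part of this diff.
from lm_eval.models.gpt2 import GPT2LM
lm = CachingLM(GPT2LM(device="cpu"), "lm_cache/gpt2.db")
first = lm.loglikelihood([("The capital of France is", " Paris")])   # computed, then written to SQLite
second = lm.loglikelihood([("The capital of France is", " Paris")])  # identical request, served from the cache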
......@@ -338,12 +291,19 @@ class Request:
self.index = index
def __iter__(self):
if req_ret_lens[self.type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0
for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i)
def __getitem__(self, i):
if req_ret_lens[self.type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
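# A hedged sketch of how tasks consume this indexing; `rf` is the RequestFactory instance
# exported from lm_eval.base, as used by the task files later in this diff.
req = rf.loglikelihood("Question: 2 + 2 =", " 4")
ll_req, is_greedy_req = req   # __iter__ yields indices 0 and 1 since req_ret_lens['loglikelihood'] == 2
assert ll_req == req[0]       # __getitem__ builds the same indexed Request; __eq__ compares type/args/index
# req_ret_lens['greedy_until'] is None, so iterating or indexing a greedy_until request raises IndexError.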
class RequestFactory:
......
......@@ -39,6 +39,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
......
import math
from pprint import pprint
import numpy as np
import sacrebleu
import sklearn
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
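# For instance, with a hypothetical exact-match metric and multiple acceptable answers:
exact_match = lambda pred, gold: float(pred.strip() == gold.strip())
metric_max_over_ground_truths(exact_match, "Paris", ["Paris", "Paris, France"])  # -> 1.0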
def perplexity(items):
return math.exp(-mean(items))
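# Worked example with assumed per-example log-likelihoods:
# items = [-1.0, -2.0, -3.0]  ->  mean = -2.0  ->  perplexity = exp(2.0) ≈ 7.389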
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
def _sacreformat(refs, preds):
"""Format refs and preds for sacrebleu corpus calculation. It is very particular"""
# Sacrebleu expects (List[str], List[List[str]])
# e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
# Note [ref1_stream] is the first reference for each pred.
# So lists are size N and (M, N) for N preds and M possible refs for each pred
# This is a different order of dimensions than I would expect
# We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
# Must become List[List[str]] with the inner list corresponding to preds
if not isinstance(refs, list):
refs = list(refs)
if not isinstance(refs[0], list):
refs = [[ref] for ref in refs]
refs = list(zip(*refs))
# Note the number of refs in each ref list must match the number of preds
# We expect preds to be List[str] or List[List[str]]. Must become List[str]
if not isinstance(preds, list):
preds = list(preds)
if isinstance(preds[0], list):
assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
preds = [pred[0] for pred in preds]
return refs, preds
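# A hedged example of the reshaping for three predictions with one reference each
# (strings are illustrative):
refs = ["la maison", "le chat", "bonjour"]
preds = ["the house", "the cat", "hello"]
r, p = _sacreformat(refs, preds)
# r == [("la maison", "le chat", "bonjour")]   a single reference "stream" spanning all preds
# p == ["the house", "the cat", "hello"]
# i.e. the (preds: List[str], refs: List[List[str]]) layout the sacrebleu corpus_* calls expect.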
......@@ -19,5 +19,9 @@ class DummyLM(LM):
return res
def greedy_until(self, requests):
# TODO: implement
pass
res = []
for _ in requests:
res.append("lol")
return res
......@@ -7,41 +7,75 @@ from tqdm import tqdm
class GPT2LM(LM):
def __init__(self, device="cpu"):
MAX_GEN_TOKS = 256
def __init__(self, device="cpu", pretrained='gpt2'):
self.device = torch.device(device)
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained(pretrained).to(self.device)
self.gpt2.eval()
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained(pretrained)
self.tokenizer.pad_token = "<|endoftext|>"
@classmethod
def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string)
return cls(device=args.get("device", "cpu"))
return cls(device=args.get("device", "cpu"), pretrained=args.get("pretrained", "gpt2"))
def loglikelihood(self, requests):
res = []
# TODO: vectorize properly
for context, continuation in tqdm(requests):
# when too long to fit in context, truncate from the left
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all()
with torch.no_grad():
# TODO: vectorize properly
# TODO: automatic batch size detection for vectorization
for context, continuation in tqdm(requests):
# when too long to fit in context, truncate from the left
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
res.append((float(logits.sum()), bool(max_equal)))
res.append((float(logits.sum()), bool(max_equal)))
return res
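# Worked example of the left-truncation arithmetic above, with assumed token counts:
# len(context_enc) = 1000, len(continuation_enc) = 100, window = 1024
# inp keeps the last 1024 of the 1100 tokens, dropping 76 context tokens from the left;
# ctxlen = 1000 - max(0, 1000 + 100 - 1024) = 924, so cont_toks = inp[:, 924:] still
# lines up exactly with the 100 continuation tokens being scored.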
def greedy_until(self, requests):
# TODO: implement
pass
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
res = []
for context, until in tqdm(requests):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context)]).to(self.device)
primary_until, = self.tokenizer.encode(until[0])
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
res.append(s)
return res
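# A hedged usage sketch; it assumes the first `until` sequence encodes to a single
# GPT-2 token (e.g. "\n"), which is what the primary_until unpacking above requires.
lm = GPT2LM(device="cpu", pretrained="gpt2")
out = lm.greedy_until([("Question: What colour is the sky?\nAnswer:", ["\n"])])
# generation stops at the eos_token_id derived from until[0]; the decoded continuation is
# then split on every until sequence, so out[0] holds only the text before the first "\n".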
......@@ -72,7 +72,12 @@ class GPT3LM(LM):
inps = []
ctxlens = []
for context, continuation in chunk:
context_enc = self.tokenizer.encode(context)
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
......@@ -108,6 +113,7 @@ class GPT3LM(LM):
max_tokens=self.MAX_GEN_TOKS,
temperature=0.,
logprobs=10,
stop=until
)
res.append(response.choices[0]['text'])
......
from pprint import pprint
from . import superglue
from . import glue
from . import arc
......@@ -21,7 +23,10 @@ from . import triviaqa
from . import pubmedqa
from . import sciq
from . import webqs
from . import qa4mre
from . import translation
from . import headqa
from . import mathqa
TASK_REGISTRY = {
# GLUE
......@@ -49,19 +54,26 @@ TASK_REGISTRY = {
"lambada": lambada.LAMBADA,
"piqa": piqa.PiQA,
# Science related
"pubmedqa" : pubmedqa.Pubmed_QA,
"sciq" : sciq.SciQ,
#"qa4mre" : qa4mre.QA4MRE,
"qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
#"triviaqa": triviaqa.TriviaQA,
"arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet
"hellaswag": hellaswag.HellaSwag, # not implemented yet
# "openbookqa": openbookqa.OpenBookQA, # not implemented yet
"openbookqa": openbookqa.OpenBookQA,
# "sat": sat.SATAnalogies, # not implemented yet
# "squad": squad.SQuAD, # not implemented yet
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQA,
"mathqa": mathqa.MathQA,
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
"winogrande": winogrande.Winogrande,
......@@ -80,6 +92,11 @@ TASK_REGISTRY = {
"arithmetic_2dm": arithmetic.Arithmetic2DMultiplication,
"arithmetic_1dc": arithmetic.Arithmetic1DComposite,
# TODO Perhaps make these groups of tasks
# e.g. anli, arithmetic, openai_translations, harness_translations
# e.g. wmt14-fr-en
**translation.create_tasks_from_benchmarks(translation.selected_benchmarks)
}
......@@ -87,7 +104,12 @@ ALL_TASKS = sorted(list(TASK_REGISTRY))
def get_task(task_name):
return TASK_REGISTRY[task_name]
try:
return TASK_REGISTRY[task_name]
except KeyError as e:
print("Available tasks:")
pprint(TASK_REGISTRY)
raise KeyError(f"Missing task {task_name}")
def get_task_dict(task_name_list):
......
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
class ANLIBase(HFTask):
......@@ -39,7 +40,7 @@ class ANLIBase(HFTask):
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + '\nTrue, False, or Neither?'
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def doc_to_target(self, doc):
# True = entailment
......
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import MultipleChoiceTask
from ..metrics import mean
from . common import HFTask
class ARCEasy(HFTask):
class ARCEasy(HFTask, MultipleChoiceTask):
DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy"
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def __clean_data(self):
""" Resolves various edge cases in the unprocessed HF ARC dataset. """
# NOTE: Some `doc["answerKey"]`s are in numeric string format being one
# of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
result = {}
for split, data in self.data.items():
result[split] = []
for doc in data:
# Ensure all `answerKey`s and `label`s are in letter format.
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [
num_to_letter.get(label, label) for label in doc["choices"]["label"]
]
result[split].append(doc)
return result
def has_training_docs(self):
return True
......@@ -39,68 +17,41 @@ class ARCEasy(HFTask):
def has_test_docs(self):
return True
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return "Question: " + doc['question'] + '\nAnswer:'
def doc_to_target(self, doc):
index = self.letter_to_num[doc["answerKey"]]
return " " + doc['choices']['text'][index]
def _convert_standard(self, doc):
# NOTE: Some `doc["answerKey"]`s are in numeric string format being one
# of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
out_doc = {
"id": doc["id"],
"query": "Question: " + doc["question"] + "\nAnswer:",
"choices": doc["choices"]["text"],
"gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
}
return out_doc
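# A hedged example of the conversion with an assumed record: answerKey "3" is remapped to
# "C", so gold = ["A", "B", "C", "D", "E"].index("C") == 2, "choices" keeps the raw answer
# texts, and "query" carries the "Question: ...\nAnswer:" prompt scored by MultipleChoiceTask.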
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_choices = []
for choice in doc["choices"]["text"]:
ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
return ll_choices
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = self.letter_to_num[doc["answerKey"]]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def fewshot_description(self):
# TODO: figure out description
return ""
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
def doc_to_text(self, doc):
return doc["query"]
class ARCChallenge(ARCEasy):
......
......@@ -2,7 +2,8 @@ import abc
import json
import os
from collections import namedtuple
from lm_eval.base import Task, mean, rf
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
......@@ -56,14 +57,17 @@ class Arithmetic(Task):
return doc.completion
def load_doc(self, doc_json):
return ArithmeticDoc(context=doc_json['context'], completion=doc_json['completion'])
return ArithmeticDoc(context=doc_json['context'].strip()
.replace('\n\n', '\n')
.replace('Q:', 'Question:')
.replace('A:', 'Answer:'), completion=doc_json['completion'])
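# For example (hypothetical raw document):
# "Q: What is 17 plus 26?\n\nA:"  ->  "Question: What is 17 plus 26?\nAnswer:"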
def construct_requests(self, doc, ctx):
ll, is_prediction = rf.loglikelihood(ctx, doc.completion)
return is_prediction
def process_results(self, doc, results):
ll, is_prediction = results
is_prediction, = results
return {
"acc": is_prediction
}
......
import datasets
import numpy as np
import lm_eval.metrics
from ..base import Task
......@@ -44,7 +46,7 @@ class HFTask(Task):
def simple_accuracy_metric(preds, golds):
acc = float((np.array(preds) == np.array(golds)).mean())
acc = float(lm_eval.metrics.mean(np.array(preds) == np.array(golds)))
return {
"major": acc,
"minor": {"acc": acc},
......
import numpy as np
from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from lm_eval.base import rf
from ..metrics import mean, matthews_corrcoef, f1_score
from scipy.stats import pearsonr, spearmanr
from tqdm import auto as tqdm_lib
from . common import HFTask, yesno
from ..utils import general_detokenize
# Single-Sentence Tasks
......@@ -22,17 +23,18 @@ class CoLA(HFTask):
return True
def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?"
# TODO
return ""
def doc_to_text(self, doc):
return "Sentence: {}\nAnswer:".format(doc["sentence"])
return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
def doc_to_target(self, doc):
return " {}".format({1: "True", 0: "False"}[doc["label"]])
return " {}".format({1: "yes", 0: "no"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_false, _ = rf.loglikelihood(ctx, " False")
ll_true, _ = rf.loglikelihood(ctx, " yes")
ll_false, _ = rf.loglikelihood(ctx, " no")
return ll_true, ll_false
def process_results(self, doc, results):
......@@ -68,19 +70,19 @@ class SST(HFTask):
return True
def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative."
return "Indicate if the sentiment of each sentence is positive or negative."
def doc_to_text(self, doc):
return "sentence:\t{}\t\nanswer:".format(
doc["sentence"],
return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
general_detokenize(doc["sentence"]),
)
def doc_to_target(self, doc):
return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
return " {}".format({1: "positive", 0: "negative"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_positive, _ = rf.loglikelihood(ctx, " Positive")
ll_negative, _ = rf.loglikelihood(ctx, " Negative")
ll_positive, _ = rf.loglikelihood(ctx, " positive")
ll_negative, _ = rf.loglikelihood(ctx, " negative")
return ll_positive, ll_negative
def process_results(self, doc, results):
......@@ -127,9 +129,9 @@ class MNLI(HFTask):
return self.data["test_matched"]
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["premise"],
doc["hypothesis"],
doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
)
def doc_to_target(self, doc):
......@@ -187,7 +189,7 @@ class QNLI(HFTask):
return True
def doc_to_text(self, doc):
return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
doc["question"],
doc["sentence"],
)
......@@ -195,11 +197,11 @@ class QNLI(HFTask):
def doc_to_target(self, doc):
# True = entailment
# False = not entailment
return " {}".format({0: "Yes", 1: "No"}[doc["label"]])
return " {}".format({0: "yes", 1: "no"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, " Yes")
ll_no, _ = rf.loglikelihood(ctx, " No")
ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no
def process_results(self, doc, results):
......@@ -235,7 +237,7 @@ class WNLI(HFTask):
return True
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
)
......@@ -284,7 +286,7 @@ class RTE(HFTask):
return True
def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
)
......@@ -338,9 +340,9 @@ class MRPC(HFTask):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
general_detokenize(doc["sentence1"]),
general_detokenize(doc["sentence2"]),
)
def doc_to_target(self, doc):
......@@ -390,7 +392,7 @@ class QQP(HFTask):
return "Indicate if both questions ask the same thing."
def doc_to_text(self, doc):
return "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
doc["question1"],
doc["question2"],
)
......@@ -443,7 +445,7 @@ class STSB(HFTask):
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
)
......
from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["qid"],
"query": "Question: " + doc["qtext"] + "\nAnswer:",
"choices": [answer["atext"] for answer in doc["answers"]],
"gold": int(doc["ra"]) - 1,
}
return out_doc
def _load_docs(self, docs):
for doc in docs:
yield self._convert_standard(doc)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import re
import numpy as np
from ..base import rf, mean
from lm_eval.base import MultipleChoiceTask
from . common import HFTask
class HellaSwag(HFTask):
class HellaSwag(HFTask, MultipleChoiceTask):
DATASET_PATH = "hellaswag"
DATASET_NAME = None
@classmethod
def remove_brackets(cls, text):
""" Removes brackets from HellaSwag documents.
NOTE: The brackets are artifacts of the WikiHow dataset portion underlying
HellaSwag.
"""
text = re.sub('\[.*?\]', '', text)
return text
def has_training_docs(self):
return True
......@@ -24,19 +14,37 @@ class HellaSwag(HFTask):
return True
def has_test_docs(self):
return True
return False
@classmethod
def preprocess(cls, text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub('\\[.*?\\]', '', text)
text = text.replace(" ", " ")
return text
def _convert_standard(self, doc):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
out_doc = {
"query": self.preprocess(doc['activity_label'] + ': ' + ctx),
"choices": [self.preprocess(ending) for ending in doc['endings']],
"gold": int(doc['label']),
}
return out_doc
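# A hedged sketch of the resulting doc; the field values are invented for illustration:
# {
#     "query":   "Making a cake: The chef mixes the batter. She",
#     "choices": ["pours it into a tin.", "reads a newspaper.", "paints the wall.", "drives away."],
#     "gold":    0,
# }
# MultipleChoiceTask then scores one loglikelihood per choice against "query".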
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
if self.has_training_docs():
return self.data["train"]
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
if self.has_validation_docs():
return self.data["validation"]
def test_docs(self):
if self.has_test_docs():
return self.data["test"]
docs = super().validation_docs()
return self._load_docs(docs)
def fewshot_description(self):
return "Label for the relevant action: Sentences describing the " \
......@@ -44,73 +52,4 @@ class HellaSwag(HFTask):
"plausibly completes the situation."
def doc_to_text(self, doc):
text = doc['activity_label'] + ': ' + doc['ctx'] + '\n'
return self.remove_brackets(text)
def doc_to_target(self, doc):
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError(
"HellaSwag from HF datasets contained an invalid answer key")
target = doc['endings'][index]
return " " + self.remove_brackets(target)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_answers = []
for i in range(4):
continuation = " " + self.remove_brackets(doc['endings'][i])
ll_answers.append(rf.loglikelihood(ctx, continuation))
return ll_answers
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = int(doc['label'])
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
return {
"acc": acc
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
return doc["query"]
from lm_eval.base import Task, rf, mean, perplexity
from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh
import json
import math
......@@ -9,7 +10,7 @@ class LAMBADA(Task):
def download(self):
sh("mkdir -p data/lambada")
download_file(
"https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl",
"http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
)
......@@ -53,18 +54,18 @@ class LAMBADA(Task):
ll, is_greedy = results
return {
'perplexity': ll,
'accuracy': int(is_greedy)
'ppl': ll,
'acc': int(is_greedy)
}
def aggregation(self):
return {
'perplexity': perplexity,
'accuracy': mean
'ppl': perplexity,
'acc': mean
}
def higher_is_better(self):
return {
'perplexity': False,
'accuracy': True
'ppl': False,
'acc': True
}
from . common import HFTask
from lm_eval.base import mean, rf, MultipleChoiceTask
import re
class MathQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "math_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
return out_doc
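# A hedged trace of the option parsing, with an assumed `options` string:
# doc["options"] = "a ) 36 , b ) 38 , c ) 42 , d ) 44 , e ) 45" ; doc["correct"] = "c"
# re.findall(...)    -> ["a ) 36 , ", "b ) 38 , ", "c ) 42 , ", "d ) 44 , ", "e ) 45"]
# c[4:].rstrip(" ,") -> ["36", "38", "42", "44", "45"]
# answer_idx         -> 2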
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from lm_eval.base import MultipleChoiceTask
from .common import HFTask
class OpenBookQA(HFTask):
class OpenBookQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "openbookqa"
DATASET_NAME = "main"
......@@ -17,82 +15,34 @@ class OpenBookQA(HFTask):
def has_test_docs(self):
return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["id"],
"query": doc["question_stem"],
"choices": doc["choices"]["text"],
"gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()),
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.data["train"])
return self._training_docs
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
if self.has_validation_docs():
return self.data["validation"]
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
if self.has_test_docs():
return self.data["test"]
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return doc['question_stem'] + '\n'
def doc_to_target(self, doc):
letter_answer = doc['answerKey']
if letter_answer == 'A':
index = 0
elif letter_answer == 'B':
index = 1
elif letter_answer == 'C':
index = 2
elif letter_answer == 'D':
index = 3
else:
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
return doc['choices']['text'][index] + '.'
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return doc["query"]
import numpy as np
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask
......@@ -21,15 +22,15 @@ class PiQA(HFTask):
return ""
def doc_to_text(self, doc):
return doc["goal"] + "\n"
return "Question: "+doc["goal"] + "\nAnswer:"
def doc_to_target(self, doc):
solutions = [doc["sol1"], doc["sol2"]]
return solutions[doc["label"]]
return " " + solutions[doc["label"]]
def construct_requests(self, doc, ctx):
ll_1, _ = rf.loglikelihood(ctx, doc['sol1'])
ll_2, _ = rf.loglikelihood(ctx, doc['sol2'])
ll_1, _ = rf.loglikelihood(ctx, " " + doc['sol1'])
ll_2, _ = rf.loglikelihood(ctx, " " + doc['sol2'])
return ll_1, ll_2
def process_results(self, doc, results):
......
......@@ -2,7 +2,8 @@ import numpy as np
import json
import random
from .common import HFTask
from lm_eval.base import rf, mean
from lm_eval.base import rf
from ..metrics import mean
class Pubmed_QA(HFTask):
......@@ -30,7 +31,7 @@ class Pubmed_QA(HFTask):
def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"])
return "abstract: {}\nquestion: {}\nanswer:".format(
return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs,
doc["question"],
doc["final_decision"]
......