Commit 9b933d96 authored by jeffhsu3

merged changes

parents c71dcb91 c0fbf9e8
# Evaluation Harness for Large Language Models
![](https://github.com/EleutherAI/lm-evaluation-harness/workflows/Python%20application/badge.svg)
[![codecov](https://codecov.io/gh/EleutherAI/lm-evaluation-harness/branch/master/graph/badge.svg?token=JSG3O2427J)](https://codecov.io/gh/EleutherAI/lm-evaluation-harness)
## Overview
The goal of this project is to build a set of tools for evaluating LMs on typical NLU tasks, based on the evaluation of GPT-3 described in https://arxiv.org/pdf/2005.14165.pdf. Following that description, this repo should support 3 functions:
......@@ -7,6 +10,49 @@ The goal of this project is to build a set of tools for evaluating LMs on typica
2. Removing task val/test data from LM training set
3. Adding task training data to LM training set
### Overview of Tasks
| Task Name |Train|Val|Test| Metrics |
|---------------|-----|---|----|--------------------|
|cola |✓ |✓ |✓ |mcc |
|mnli |✓ |✓ |✓ |acc |
|mnli_mismatched|✓ |✓ |✓ |acc |
|mrpc |✓ |✓ |✓ |acc, f1 |
|rte |✓ |✓ |✓ |acc |
|qnli |✓ |✓ |✓ |acc |
|qqp |✓ |✓ |✓ |acc, f1 |
|sst |✓ |✓ |✓ |acc |
|wnli |✓ |✓ |✓ |acc |
|boolq |✓ |✓ |✓ |acc |
|cb |✓ |✓ |✓ |acc, f1 |
|copa |✓ |✓ |✓ |acc |
|multirc |✓ |✓ |✓ |acc |
|wic |✓ |✓ |✓ |acc |
|wsc |✓ |✓ |✓ |acc |
|lambada | |✓ | |perplexity, accuracy|
|piqa |✓ |✓ | |acc |
|arc_easy |✓ |✓ |✓ |acc |
|arc_challenge |✓ |✓ |✓ |acc |
|hellaswag |✓ |✓ |✓ |acc |
|race |✓ |✓ |✓ |acc |
|webqs |✓ | |✓ |acc |
|wsc273 | | |✓ |acc |
|winogrande |✓ |✓ |✓ |acc |
|anli_r1 |✓ |✓ |✓ |acc |
|anli_r2 |✓ |✓ |✓ |acc |
|anli_r3 |✓ |✓ |✓ |acc |
|arithmetic_2da | |✓ | |acc |
|arithmetic_2ds | |✓ | |acc |
|arithmetic_3da | |✓ | |acc |
|arithmetic_3ds | |✓ | |acc |
|arithmetic_4da | |✓ | |acc |
|arithmetic_4ds | |✓ | |acc |
|arithmetic_5da | |✓ | |acc |
|arithmetic_5ds | |✓ | |acc |
|arithmetic_2dm | |✓ | |acc |
|arithmetic_1dc | |✓ | |acc |
## Usage
### Evaluate a task
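The usage details are collapsed in this diff. As a minimal sketch, assuming module paths such as `lm_eval.models`, `lm_eval.tasks`, and `lm_eval.evaluator` (the registries and the `evaluate` signature themselves appear in the code further below), an evaluation could be driven from Python like this:

```python
# Hypothetical driver script: the module paths are assumptions, but MODEL_REGISTRY,
# TASK_REGISTRY, and evaluate(lm, task_dict, provide_description, num_fewshot, limit)
# match the code in this commit.
from lm_eval import models, tasks, evaluator

lm = models.MODEL_REGISTRY["gpt2"].create_from_arg_string("")
task_dict = {"lambada": tasks.TASK_REGISTRY["lambada"]()}

results = evaluator.evaluate(
    lm=lm,
    task_dict=task_dict,
    provide_description=True,
    num_fewshot=2,
    limit=10,  # only evaluate the first 10 documents per task
)
print(results)  # e.g. {"lambada": {"perplexity": ..., "accuracy": ...}}
```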
......
......@@ -2,6 +2,7 @@ import abc
import random
import numpy as np
import sklearn
import math
class LM(abc.ABC):
......@@ -58,10 +59,10 @@ class LM(abc.ABC):
return cls()
class Dataset(abc.ABC):
class Task(abc.ABC):
def __init__(self):
self.download()
self._traindocs = None
self._training_docs = None
def download(self):
"""Downloads the task dataset if necessary"""
......@@ -71,7 +72,7 @@ class Dataset(abc.ABC):
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
......@@ -84,23 +85,29 @@ class Dataset(abc.ABC):
def training_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return []
def validation_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return []
def test_docs(self):
"""
:return: Iterable[obj]
An iterable of any object that doc_to_text can handle
"""
return []
def fewshot_examples(self, k):
if self._traindocs is None:
self._traindocs = list(self.training_docs())
return random.sample(self._traindocs, k)
def fewshot_examples(self, k):
if self._training_docs is None:
self._training_docs = list(self.training_docs())
return random.sample(self._training_docs, k)
@abc.abstractmethod
def doc_to_text(self, doc):
......@@ -123,7 +130,7 @@ class Dataset(abc.ABC):
part of the document for `doc`.
"""
pass
@abc.abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -161,7 +168,7 @@ class Dataset(abc.ABC):
def fewshot_context(self, doc, num_fewshot, provide_description):
raw_description = self.fewshot_description()
description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
if num_fewshot == 0:
labeled_examples = ""
else:
......@@ -169,10 +176,42 @@ class Dataset(abc.ABC):
[self.doc_to_text(doc) + self.doc_to_target(doc) for doc in self.fewshot_examples(k=num_fewshot)]
) + "\n\n"
example = self.doc_to_text(doc).strip()
example = self.doc_to_text(doc)
return description + labeled_examples + example
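# Illustrative only (not from the repo): for num_fewshot=1 with a description,
# the returned prompt looks roughly like
#
#   <description>
#   ===
#
#   <doc_to_text(example)><doc_to_target(example)>
#
#   <doc_to_text(doc)>
#
# i.e. the description block, the labeled few-shot examples, and finally the
# unlabeled query that the LM is asked to complete.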
class MultipleChoiceTask(Task):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
def construct_requests(self, doc, ctx):
lls = [
rf.loglikelihood(ctx, " {}".format(choice))[0]
for choice in doc['choices']
]
return lls
def process_results(self, doc, results):
gold = doc["gold"]
acc = 1. if np.argmax(results) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
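# MultipleChoiceTask expects each doc to be a dict with a "choices" list and a
# "gold" index into it, e.g. (illustrative only):
#   {"query": "2 + 2 =", "choices": ["3", "4", "5"], "gold": 1}
# Subclasses then only need to supply the docs and doc_to_text; target
# construction, per-choice loglikelihood requests, and accuracy scoring are
# handled by the methods above.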
def mean(arr):
return sum(arr) / len(arr)
......@@ -193,7 +232,8 @@ def f1_score(items):
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return max(fscore)
return np.max(fscore)
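# np.max also handles the scalar that sklearn returns for binary averaging,
# where the builtin max() would raise a TypeError.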
def acc_all(items):
......@@ -223,10 +263,70 @@ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
return max(scores_for_ground_truths)
def perplexity(items):
return math.exp(-mean(items))
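# `items` are the per-document log-likelihoods collected by `evaluate`, so this
# aggregation returns exp(-mean(ll)): the geometric mean of per-document
# perplexities rather than an arithmetic mean of per-document exp(-ll) values.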
req_ret_lens = {
'loglikelihood': 2
'loglikelihood': 2,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(args):
dat = b""
for arg in args:
assert isinstance(arg, str) or isinstance(arg, int)
dat += str(arg).encode()
dat += b"\0"
return hashlib.sha256(dat).hexdigest()
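# e.g. hash_args(("Hello", " world")) yields a 64-character hex digest; prefixed
# with the method name below, it becomes the SQLite cache key.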
class CachingLM:
def __init__(self, lm, cache_db):
self.lm = lm
self.cache_db = cache_db
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
def __getattr__(self, attr):
def fn(requests):
res = []
remaining_reqs = []
# figure out which ones are cached and which ones are new
for req in requests:
hsh = attr + '_' + hash_args(req)
if hsh in self.dbdict:
ob = self.dbdict[hsh]
assert ob is not None
res.append(ob)
else:
res.append(None)
remaining_reqs.append(req)
# actually run the LM
rem_res = getattr(self.lm, attr)(remaining_reqs)
# insert the new results back into the list and cache them
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1
res[resptr] = r
# caching
hsh = attr + '_' + hash_args(req)
self.dbdict[hsh] = r
return res
return fn
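# Usage sketch (hypothetical):
#   lm = CachingLM(gpt2.GPT2LM(), "lm_cache/gpt2.db")
#   lm.loglikelihood([("The capital of France is", " Paris")])
# The first call runs the underlying model and stores each result keyed by
# sha256(args); an identical later call is served straight from SQLite.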
class Request:
def __init__(self, type, args, index=None):
......
import collections
import itertools
def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
results = collections.defaultdict(dict)
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
# we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
# (or we could make it write the requests to disk and then read them back out again, probably using an sqlite db, because of all the moving parts we have)
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of request
for task_name, task in task_dict_items:
# default to validation docs, falling back to test docs if validation is unavailable
# TODO: the val-fallback-to-test system isn't final, we should revisit it at some point
if task.has_validation_docs():
task_doc_func = task.validation_docs
elif task.has_test_docs():
task_doc_func = task.test_docs
for doc_id, doc in enumerate(itertools.islice(task_doc_func(), 0, limit)):
docs[(task_name, doc_id)] = doc
ctx = task.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
reqs = task.construct_requests(doc, ctx)
for i, req in enumerate(reqs):
requests[req.type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple separate LM requests for Requests that differ
# only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. We could also implement some kind of autogrouping here; they should end up next to each other.
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
process_res_queue[(task_name, doc_id)].append((i, resp))
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for (task_name, doc_id), requests in process_res_queue.items():
requests.sort(key=lambda x: x[0])
requests = [x[1] for x in requests]
task = task_dict[task_name]
doc = docs[(task_name, doc_id)]
metrics = task.process_results(doc, requests)
for metric, value in metrics.items():
vals[(task_name, metric)].append(value)
# aggregate results
for (task_name, metric), items in vals.items():
task = task_dict[task_name]
results[task_name][metric] = task.aggregation()[metric](items)
return results
\ No newline at end of file
from . import gpt2
from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
}
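# Models are looked up by name and constructed from a CLI-style arg string,
# e.g. MODEL_REGISTRY["gpt3"].create_from_arg_string("engine=davinci")
# (illustrative; the accepted keys depend on each model class).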
......
......@@ -12,6 +12,7 @@ class GPT2LM(LM):
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
self.gpt2.eval()
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.tokenizer.pad_token = "<|endoftext|>"
@classmethod
def create_from_arg_string(cls, arg_string):
......
# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.
import os
import transformers
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
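"""Processes one OpenAI Completion choice: returns the summed log-probability of
the continuation tokens (indices >= ctxlen) and whether every continuation token
was also the model's top-1 (greedy) choice."""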
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
for i in range(ctxlen, len(response["logprobs"]["tokens"])):
token = response["logprobs"]["tokens"][i]
top_tokens = response["logprobs"]["top_logprobs"][i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
def oa_completion(**kwargs):
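"""Queries the OpenAI Completion API, retrying on OpenAIError with exponential
backoff (3s, 4.5s, 6.75s, ...)."""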
import openai
backoff_time = 3
while True:
try:
return openai.Completion.create(**kwargs)
except openai.error.OpenAIError:
time.sleep(backoff_time)
backoff_time *= 1.5
class GPT3LM(LM):
MAX_LENGTH = 2048
REQ_CHUNK_SIZE = 64
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
......@@ -21,6 +51,9 @@ class GPT3LM(LM):
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
self.truncate = truncate
# Read from environment variable OPENAI_API_SECRET_KEY
......@@ -31,23 +64,53 @@ class GPT3LM(LM):
args = utils.simple_parse_args_string(arg_string)
return cls(engine=args.get("engine", "davinci"))
def loglikelihood(self, context, continuation):
# TODO: implement new framework
def loglikelihood(self, requests):
import openai
res = []
for chunk in tqdm(list(utils.chunks(requests, self.REQ_CHUNK_SIZE))):
inps = []
ctxlens = []
for context, continuation in chunk:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
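# If context + continuation exceeds MAX_LENGTH, the left end of the context is
# truncated; ctxlen shrinks by the same amount so that logprobs are summed over
# exactly the continuation tokens.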
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
inps.append(inp)
ctxlens.append(ctxlen)
response = oa_completion(
engine=self.engine,
prompt=inps,
echo=True,
max_tokens=0, temperature=0.,
logprobs=10,
)
for resp, ctxlen in zip(response.choices, ctxlens):
res.append(get_result(resp, ctxlen))
return res
def greedy_until(self, requests):
import openai
res = []
for context, until in tqdm(requests):
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
ctxlen = len(context_enc) - max(0, len(context_enc) - (self.MAX_LENGTH - self.MAX_GEN_TOKS))
response = oa_completion(
engine=self.engine,
prompt=[inp],
max_tokens=self.MAX_GEN_TOKS,
temperature=0.,
logprobs=10,
)
res.append(response.choices[0]['text'])
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
inp = (context_enc + continuation_enc)[-1024:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
response = openai.Completion.create(
engine=self.engine,
prompt=inp,
echo=True,
max_tokens=0, temperature=0.0,
logprobs=0,
)
logprobs = response.choices[0]["logprobs"]["token_logprobs"]
continuation_logprobs = logprobs[ctxlen:]
return sum(continuation_logprobs)
return res
......@@ -18,6 +18,7 @@ from . import race
from . import piqa
from . import triviaqa
from . import pubmedqa
from . import webqs
TASK_REGISTRY = {
......@@ -37,7 +38,7 @@ TASK_REGISTRY = {
"cb": superglue.CommitmentBank,
"copa": superglue.Copa,
"multirc": superglue.MultiRC,
"record": superglue.ReCoRD,
#"record": superglue.ReCoRD,
"wic": superglue.WordsInContext,
"wsc": superglue.SGWinogradSchemaChallenge,
......@@ -50,8 +51,8 @@ TASK_REGISTRY = {
"sciq" : pubmedqa.SciQ,
#"triviaqa": triviaqa.TriviaQA,
# "arc_easy": arc.ARCEasy, # not implemented yet
# "arc_challenge": arc.ARCChallenge, # not implemented yet
"arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet
"hellaswag": hellaswag.HellaSwag, # not implemented yet
# "openbookqa": openbookqa.OpenBookQA, # not implemented yet
......@@ -59,9 +60,9 @@ TASK_REGISTRY = {
# "squad": squad.SQuAD, # not implemented yet
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
# "webqs": webqs.WebQs, # not implemented yet
# "wsc273": wsc273.WinogradSchemaChallenge273, # not implemented yet
# "winogrande": winogrande.Winogrande, # not implemented yet
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
"winogrande": winogrande.Winogrande,
"anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2,
"anli_r3": anli.ANLIRound3,
......
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
class ARCEasy(HFTask):
DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy"
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def __clean_data(self):
""" Resolves various edge cases in the unprocessed HF ARC dataset. """
# NOTE: Some `doc["answerKey"]`s are in numeric string format, one of
# {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
result = {}
for split, data in self.data.items():
result[split] = []
for doc in data:
# Ensure all `answerKey`s and `label`s are in letter format.
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [
num_to_letter.get(label, label) for label in doc["choices"]["label"]
]
result[split].append(doc)
return result
def has_training_docs(self):
return True
......@@ -21,7 +47,8 @@ class ARCEasy(HFTask):
return "Question: " + doc['question'] + '\nAnswer:'
def doc_to_target(self, doc):
return " " + doc['choices']['text'][doc['choices']['label'].index(doc['answerKey'])]
index = self.letter_to_num[doc["answerKey"]]
return " " + doc['choices']['text'][index]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
......@@ -34,9 +61,11 @@ class ARCEasy(HFTask):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
ll_choices = []
for choice in doc["choices"]["text"]:
ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
return ll_choices
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -47,8 +76,11 @@ class ARCEasy(HFTask):
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
gold = self.letter_to_num[doc["answerKey"]]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self):
"""
......@@ -56,8 +88,9 @@ class ARCEasy(HFTask):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return {
"acc": mean
}
def higher_is_better(self):
"""
......@@ -65,8 +98,10 @@ class ARCEasy(HFTask):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
return {
"acc": True
}
class ARCChallenge(ARCEasy):
DATASET_PATH = "ai2_arc"
......
......@@ -2,12 +2,12 @@ import abc
import json
import os
from collections import namedtuple
from lm_eval.base import Dataset, mean, rf
from lm_eval.base import Task, mean, rf
from best_download import download_file
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
class Arithmetic(Dataset):
class Arithmetic(Task):
directory = 'data/arithmetic/'
def __init__(self):
......@@ -32,7 +32,7 @@ class Arithmetic(Dataset):
self._docs = [self.load_doc(json.loads(line)) for line in jsons]
def has_training_docs(self):
return True
return False
def has_validation_docs(self):
return True
......@@ -41,10 +41,10 @@ class Arithmetic(Dataset):
return False
def training_docs(self):
return self._docs
return NotImplemented
def validation_docs(self):
return self._docs[:100]
return self._docs
def test_docs(self):
return NotImplemented
......
import datasets
import numpy as np
import random
from ..base import Dataset
from ..base import Task
class HFTask(Dataset):
class HFTask(Task):
DATASET_PATH = None
DATASET_NAME = None
def __init__(self):
self.data = None
super().__init__()
self._training_docs = None
def download(self):
self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
......
......@@ -2,11 +2,11 @@
import json
import random
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
class CoQA(Dataset):
class CoQA(Task):
def __init__(self):
self.download()
def download(self):
......
......@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
from pathlib import Path
from ..base import Dataset
from ..base import Task
class DROP(Dataset):
class DROP(Task):
DATAFOLDER = Path(__file__).parent / "../../data/drop"
def __init__(self):
......
......@@ -61,7 +61,7 @@ class HellaSwag(HFTask):
raise ValueError(
"HellaSwag from HF datasets contained an invalid answer key")
target = doc['endings'][index]
return self.remove_brackets(target)
return " " + self.remove_brackets(target)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
......@@ -75,7 +75,7 @@ class HellaSwag(HFTask):
"""
ll_answers = []
for i in range(4):
continuation = self.remove_brackets(doc['endings'][i])
continuation = " " + self.remove_brackets(doc['endings'][i])
ll_answers.append(rf.loglikelihood(ctx, continuation))
return ll_answers
......
from lm_eval.base import Dataset, rf, mean
from lm_eval.base import Task, rf, mean, perplexity
from lm_eval.utils import sh
import json
import math
from best_download import download_file
class LAMBADA(Dataset):
class LAMBADA(Task):
def download(self):
sh("mkdir -p data/lambada")
download_file(
......@@ -18,22 +18,22 @@ class LAMBADA(Dataset):
return False
def has_validation_docs(self):
return False
return True
def has_test_docs(self):
return True
return False
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
with open("data/lambada/lambada_test.jsonl") as fh:
for line in fh:
yield json.loads(line)
def test_docs(self):
pass
def doc_to_text(self, doc):
return doc['text'].rsplit(' ', 1)[0]
......@@ -45,7 +45,7 @@ class LAMBADA(Dataset):
return ""
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(doc, self.doc_to_target(doc))
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
return ll, is_greedy
......@@ -53,13 +53,13 @@ class LAMBADA(Dataset):
ll, is_greedy = results
return {
'perplexity': math.exp(-ll),
'perplexity': ll,
'accuracy': int(is_greedy)
}
def aggregation(self):
return {
'perplexity': mean,
'perplexity': perplexity,
'accuracy': mean
}
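# Perplexity is now computed at aggregation time as exp(-mean(loglikelihood))
# over all documents, instead of averaging per-document math.exp(-ll) values.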
......
......@@ -30,10 +30,10 @@ class NaturalQs(HFTask):
def fewshot_examples(self, k):
# Data is too large to fit in memory. We just sample from the first bit.
if self._traindocs is None:
self._traindocs = list(islice(self.training_docs(), 0, 100000))
if self._training_docs is None:
self._training_docs = list(islice(self.training_docs(), 0, 100000))
return random.sample(self._traindocs, k)
return random.sample(self._training_docs, k)
def doc_to_text(self, doc):
return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
......
import json
import random
from lm_eval.base import Dataset, rf, mean
from lm_eval.base import Task, rf, mean
from ..utils import sh
import os
class PiQA(Dataset):
class PiQA(Task):
def download(self):
if not os.path.exists('data/piqa'):
#TODO: use best_download
......@@ -46,12 +46,12 @@ class PiQA(Dataset):
return ""
def doc_to_text(self, doc):
return doc[0]['goal']
return doc[0]['goal'] + "\n"
def doc_to_target(self, doc):
# TODO: check whether OA uses a newline here
rightanswer = int(doc[1]) + 1
return '\n' + ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
return ''.join([doc[0]['goal'],' ',doc[0]['sol'+str(rightanswer)]])
def construct_requests(self, doc, ctx):
ll_1, _ = rf.loglikelihood(ctx, doc[0]['sol1'])
......
import json
import random
import os
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
class QuAC(Dataset):
class QuAC(Task):
def __init__(self):
super().__init__()
......
......@@ -3,7 +3,19 @@ import datasets
import numpy as np
from lm_eval.base import rf, mean
from . common import HFTask
from ..utils_stream import each
import os
from functools import reduce
import operator
from tqdm import tqdm
import json
class each:
def __init__(self, f):
self.f = f
def __rrshift__(self, other):
return list(map(self.f, other))
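# e.g. [1, 2, 3] >> each(str) == ["1", "2", "3"]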
class RACE(HFTask):
......
import json
import random
import os
from lm_eval.base import Dataset, rf, mean
from lm_eval.base import MultipleChoiceTask, rf, mean
from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric
import numpy as np
from ..utils import sh
class SATAnalogies(Dataset):
class SATAnalogies(MultipleChoiceTask):
NEEDS_MANUAL_DL = True
def __init__(self):
......@@ -61,8 +61,8 @@ class SATAnalogies(Dataset):
doc = {
'source': source,
'query': query.split(' ')[:2],
'choices': [c.split(' ')[:2] for c in choices],
'answer_key': ['a','b','c','d','e'].index(answer_key.strip()),
'choices': ["{} is to {}".format(*c.split(' ')[:2]) for c in choices],
'gold': ['a','b','c','d','e'].index(answer_key.strip()),
}
yield doc
......@@ -72,35 +72,4 @@ class SATAnalogies(Dataset):
return ""
def doc_to_text(self, doc):
return "{} is to {} as ".format(*doc['query'])
def doc_to_target(self, doc):
return "{} is to {}".format(*doc['choices'][doc['answer_key']])
def construct_requests(self, doc, ctx):
lls = [
rf.loglikelihood(ctx, "{} is to {}".format(*doc['choices'][i]))[0]
for i in range(5)
]
return lls
def process_results(self, doc, results):
gold = doc["answer_key"]
acc = 1. if np.argmax(results) == gold else 0.
return {
"acc": acc
}
def higher_is_better(self):
return {
"acc": True
}
def aggregation(self):
return {
"acc": mean
}
return "{} is to {} as".format(*doc['query'])
import json
import random
from lm_eval.base import Dataset
from lm_eval.base import Task
from ..utils import sh
import csv
class StoryCloze(Dataset):
class StoryCloze(Task):
NEEDS_MANUAL_DL = True
def download(self):
......