Unverified Commit 31ebb599 authored by Stella Biderman's avatar Stella Biderman Committed by GitHub
Browse files

Merge branch 'master' into multilingual

parents 38c04a0f 8728710c
......@@ -23,11 +23,11 @@ jobs:
path: |
~/.cache
# An explicit key for restoring and saving the cache
key: evaldata-cache-3
key: evaldata-cache-4
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: 3.9.7
- name: Install dependencies
run: |
python -m pip install --upgrade pip
......@@ -42,7 +42,7 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest --cov=lm_eval/ tests/
pytest -vv --cov=lm_eval/ tests/
- name: Upload to codecov
run: |
bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
@software{eval-harness,
author = {Gao, Leo and
Tow, Jonathan and
Biderman, Stella and
Black, Sid and
DiPofi, Anthony and
Foster, Charles and
Golding, Laurence and
Hsu, Jeffrey and
McDonell, Kyle and
Muennighoff, Niklas and
Phang, Jason and
Reynolds, Laria and
Tang, Eric and
Thite, Anish and
Wang, Ben and
Wang, Kevin and
Zou, Andy},
title = {A framework for few-shot language model evaluation},
month = sep,
year = 2021,
publisher = {Zenodo},
version = {v0.0.1},
doi = {10.5281/zenodo.5371628},
url = {https://doi.org/10.5281/zenodo.5371628}
}
This diff is collapsed.
import abc
import random
from typing import Iterable
import numpy as np
import re
import os
import json
import hashlib
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval.metrics import mean, perplexity, weighted_perplexity, weighted_mean
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC):
def __init__(self):
self.cache_hook = CacheHook(None)
@abc.abstractmethod
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
......@@ -34,7 +43,7 @@ class LM(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
......@@ -77,7 +86,7 @@ class LM(abc.ABC):
pass
# TODO: Add an optional max length
@abc.abstractmethod
@abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
......@@ -96,18 +105,235 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str): pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]): pass
:param arg_string: str
Left up to individual model class to handle
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
"""
return cls()
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1],
dtype=torch.long
).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad length from seq to padding_length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length
multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks \
in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str):
until = [until]
primary_until, = self.tok_encode(until[0])
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
class Task(abc.ABC):
......@@ -128,17 +354,17 @@ class Task(abc.ABC):
"""Downloads the task dataset if necessary"""
pass
@abc.abstractmethod
@abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
@abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
@abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
......@@ -170,15 +396,15 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k)
@abc.abstractmethod
@abstractmethod
def doc_to_text(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def doc_to_target(self, doc):
pass
@abc.abstractmethod
@abstractmethod
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......@@ -192,7 +418,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
......@@ -205,7 +431,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
......@@ -214,7 +440,7 @@ class Task(abc.ABC):
"""
pass
@abc.abstractmethod
@abstractmethod
def higher_is_better(self):
"""
:returns: {str: bool}
......@@ -238,7 +464,9 @@ class Task(abc.ABC):
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs())
self._fewshot_docs = list(
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
......@@ -253,7 +481,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
class MultipleChoiceTask(Task):
class MultipleChoiceTask(Task, abc.ABC):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
......@@ -328,39 +556,30 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
loglikelihood, = results
words = self.count_words(doc)
bytes = self.count_bytes(doc)
bytes_ = self.count_bytes(doc)
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes),
"bits_per_byte": (-loglikelihood, self.count_bytes(doc))
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (loglikelihood, bytes_),
}
def aggregation(self):
return {
"word_perplexity": weighted_perplexity,
"byte_perplexity": weighted_perplexity,
"bits_per_byte": weighted_mean
"bits_per_byte": bits_per_byte,
}
def count_bytes(self, doc):
@classmethod
def count_bytes(cls, doc):
return len(doc.encode("utf-8"))
def count_words(self, doc):
@classmethod
def count_words(cls, doc):
""" Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc))
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
......@@ -383,9 +602,17 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db): os.makedirs(os.path.dirname(cache_db), exist_ok=True)
if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm
......@@ -409,13 +636,14 @@ class CachingLM:
res.append(None)
remaining_reqs.append(req)
# actually run the LM
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None: resptr += 1
while res[resptr] is not None:
resptr += 1
res[resptr] = r
......@@ -431,32 +659,39 @@ class CachingLM:
return CacheHook(self)
REQUEST_RETURN_LENGTHS = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
class Request:
def __init__(self, type, args, index=None):
if type not in req_ret_lens.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(type))
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.type = type
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0
for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i)
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if req_ret_lens[self.type] is None:
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i)
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.type}{self.args}[{self.index}]\n"
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory:
def __getattr__(self, attr):
......
......@@ -2,12 +2,96 @@ import collections
import itertools
import random
import lm_eval.metrics
import lm_eval.models
import lm_eval.tasks
import lm_eval.base
import numpy as np
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
:param model_args: str
String arguments for each model class, see LM.create_from_arg_string
:param task_names: list[str]
List of task names
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, {
'batch_size': batch_size, 'device': device
})
if not no_cache:
lm = lm_eval.base.CachingLM(
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict(task_names)
results = evaluate(lm, task_dict, False, num_fewshot, limit)
# add info about the model and few shot config
results["config"] = {
"model": model,
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"device": device,
"no_cache": no_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters
}
return results
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
task_dict_items = [(name, task) for name, task in task_dict.items() if(task.has_validation_docs() or task.has_test_docs())]
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
......@@ -15,23 +99,25 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
# we can always modify this plumbing to support that, but i didn't want to include it just yet because overengineering is bad
# (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of requeste
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
#default to test doc, fall back to val doc if validation unavailable
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
elif task.has_validation_docs():
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
......@@ -50,25 +136,26 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.type].append((i, task_name, doc, doc_id))
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple seperate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a bandaid
# solution. we could also implement some kind of autogrouping here; they should end up next to each other.
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
......@@ -93,11 +180,49 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
task = task_dict[task_name]
results[task_name][metric] = task.aggregation()[metric](items)
stderr = lm_eval.metrics.stderr_for_metric(task.aggregation()[metric], bootstrap_iters=bootstrap_iters)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
return {
"results": results,
"versions": versions
"results": dict(results),
"versions": dict(versions)
}
def make_table(result_dict):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
values = []
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
for m, v in dic.items():
if m.endswith("_stderr"):
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else:
values.append([k, version, m, '%.4f' % v, '', ''])
k = ""
version = ""
md_writer.value_matrix = values
latex_writer.value_matrix = values
# todo: make latex table look good
# print(latex_writer.dumps())
return md_writer.dumps()
import math
from collections import Iterable
from pprint import pprint
from collections.abc import Iterable
import numpy as np
import sacrebleu
......@@ -63,6 +62,7 @@ def acc_all(items):
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def acc_all_stderr(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
......@@ -98,9 +98,13 @@ def weighted_mean(items):
a, b = zip(*items)
return sum(a) / sum(b)
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
def bits_per_byte(items):
return -weighted_mean(items) / math.log(2)
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
......@@ -179,12 +183,13 @@ def _sacreformat(refs, preds):
return refs, preds
## stderr stuff
# stderr stuff
class _bootstrap_internal:
def __init__(self, f, n):
self.f = f
self.n = n
def __call__(self, v):
i, xs = v
rnd = random.Random()
......@@ -208,7 +213,9 @@ def bootstrap_stderr(f, xs, iters):
chunk_size = min(1000, iters)
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(pool.imap(_bootstrap_internal(f, chunk_size), [(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
for bootstrap in tqdm(pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
# sample w replacement
res.extend(bootstrap)
......
......@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
......
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
from lm_eval.base import BaseLM
class GPT2LM(LM):
MAX_GEN_TOKS = 256
VOCAB_SIZE = 50257
EOT_TOKEN_ID = 50256
class HFLM(BaseLM):
def __init__(self, device='cuda', pretrained='gpt2', batch_size=1):
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
assert isinstance(device, str)
assert isinstance(pretrained, str)
assert isinstance(batch_size, int)
if device:
self.device = torch.device(device)
self._device = torch.device(device)
else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained).to(self.device)
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.tokenizer.pad_token = "<|endoftext|>"
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparantly
self.max_length = self.gpt2.config.max_position_embeddings
# pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
# multithreading and batching
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
self.vocab_size = self.tokenizer.vocab_size
self.batch_size = batch_size_per_gpu * gpus
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
# this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
contlens = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice
# cont_toks 4 5 6 7 8 9
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad to length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
#last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
@property
def max_length(self):
try:
return self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gpt2.config.max_position_embeddings
answer = (float(logits.sum()), bool(max_equal))
@property
def max_gen_toks(self):
return 256
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
res.append(answer)
@property
def device(self):
# TODO: fix multi-gpu
return self._device
return reord.get_original(res)
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits retuned from the model
logits returned from the model
"""
return self.gpt2(inps)[0][:, :, :50257]
with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0])
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tokenizer.encode(until[0])
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
# for backwards compatibility
GPT2LM = HFLM
import os
import numpy as np
import transformers
from lm_eval.base import LM
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True
logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:])
......@@ -24,8 +36,11 @@ def get_result(response, ctxlen):
def oa_completion(**kwargs):
import openai
""" Query OpenAI API for completion.
Retry with back-off until they respond
"""
import openai
backoff_time = 3
while True:
try:
......@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5
class GPT3LM(LM):
MAX_LENGTH = 2048
class GPT3LM(BaseLM):
REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False):
"""
......@@ -50,10 +62,12 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
import openai
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
......@@ -64,53 +78,36 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
loglikelihoods = []
for string, in tqdm(requests):
encoded = self.tokenizer.encode_plus(string)["input_ids"]
rolling_token_windows = utils.get_rolling_token_windows(
token_list=encoded,
prefix_token=self.end_of_text_token_id,
max_seq_len=self.MAX_LENGTH,
context_len=1,
)
string_loglikelihoods = []
for input_tokens, pred_tokens in rolling_token_windows:
block_output = self.get_token_logprobs(
input_tokens=input_tokens,
pred_tokens=pred_tokens,
)
string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods)
return loglikelihoods
def _loglikelihood_tokens(self, requests):
import openai
@property
def eot_token_id(self):
return self.tokenizer.eos_token_id
@property
def max_length(self):
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
res = []
def _collate(x):
......@@ -118,16 +115,18 @@ class GPT3LM(LM):
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = []
ctxlens = []
for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
# max_length+1 because the API takes up to 2049 tokens, including the first context token
inp = (context_enc + continuation_enc)[-(self.max_length+1):]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))
inps.append(inp)
ctxlens.append(ctxlen)
......@@ -151,35 +150,14 @@ class GPT3LM(LM):
return reord.get_original(res)
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests):
if not requests: return []
import openai
if not requests:
return []
res = []
def _collate(x):
toks = self.tokenizer.encode(x[0])
return (len(toks), x[0])
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
......@@ -193,34 +171,43 @@ class GPT3LM(LM):
lastuntil = x[1]
ret.append(x)
if ret: yield ret, lastuntil
if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until`
# todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = []
for context, _ in chunk:
context_enc = self.tokenizer.encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):]
context_enc = self.tok_encode(context)
inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp)
response = oa_completion(
engine=self.engine,
prompt=inps,
max_tokens=self.MAX_GEN_TOKS,
max_tokens=self.max_gen_toks,
temperature=0.,
logprobs=10,
stop=until
stop=until,
)
for resp, (context, until) in zip(response.choices, chunk):
for resp, (context, until_) in zip(response.choices, chunk):
s = resp['text']
for term in until:
for term in until_:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
self.cache_hook.add_partial("greedy_until", (context, until_), s)
res.append(s)
return reord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
......@@ -22,6 +22,7 @@ from . import lambada
from . import race
from . import piqa
from . import prost
from . import mc_taco
from . import triviaqa
from . import pubmedqa
from . import sciq
......@@ -42,6 +43,11 @@ from . import pile
from . import wikitext
from . import xquad
from . import mlqa
from . import lambada_multilingual
from . import mutual
from . import truthfulqa
from . import blimp
from . import asdiv
########################################
# Translation tasks
......@@ -99,12 +105,17 @@ TASK_REGISTRY = {
"drop": drop.DROP,
"lambada": lambada.LAMBADA,
"lambada_cloze": lambada_cloze.LAMBADA_cloze,
# multilingual lambada
**lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText,
# "cbt-cn": cbt.CBTCN, # disabled pending context length fix
# "cbt-ne": cbt.CBTNE, # disabled pending context length fix
"piqa": piqa.PiQA,
"prost": prost.PROST,
"mc_taco": mc_taco.MCTACO,
# Science related
"pubmedqa" : pubmedqa.Pubmed_QA,
......@@ -143,7 +154,9 @@ TASK_REGISTRY = {
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQA,
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn,
"mathqa": mathqa.MathQA,
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
......@@ -159,6 +172,13 @@ TASK_REGISTRY = {
"ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
"ethics_virtue": hendrycks_ethics.EthicsVirtue,
"truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
"truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
# dialogue
"mutual": mutual.MuTual,
"mutual_plus": mutual.MuTualPlus,
# math
"math_algebra": hendrycks_math.MathAlgebra,
"math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
......@@ -167,6 +187,7 @@ TASK_REGISTRY = {
"math_num_theory": hendrycks_math.MathNumberTheory,
"math_prealgebra": hendrycks_math.MathPrealgebra,
"math_precalc": hendrycks_math.MathPrecalculus,
"math_asdiv": asdiv.Asdiv,
# arithmetic
"arithmetic_2da": arithmetic.Arithmetic2DPlus,
......@@ -220,6 +241,75 @@ TASK_REGISTRY = {
"pile_ubuntu-irc": pile.PileUbuntuIrc,
"pile_wikipedia": pile.PileWikipedia,
"pile_youtubesubtitles": pile.PileYoutubeSubtitles,
# BLiMP
"blimp_adjunct_island": blimp.BlimpAdjunctIsland,
"blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
"blimp_anaphor_number_agreement": blimp.BlimpAnaphorNumberAgreement,
"blimp_animate_subject_passive": blimp.BlimpAnimateSubjectPassive,
"blimp_animate_subject_trans": blimp.BlimpAnimateSubjectTrans,
"blimp_causative": blimp.BlimpCausative,
"blimp_complex_NP_island": blimp.BlimpComplex_NPIsland,
"blimp_coordinate_structure_constraint_complex_left_branch": blimp.BlimpCoordinateStructureConstraintComplexLeftBranch,
"blimp_coordinate_structure_constraint_object_extraction": blimp.BlimpCoordinateStructureConstraintObjectExtraction,
"blimp_determiner_noun_agreement_1": blimp.BlimpDeterminerNounAgreement_1,
"blimp_determiner_noun_agreement_2": blimp.BlimpDeterminerNounAgreement_2,
"blimp_determiner_noun_agreement_irregular_1": blimp.BlimpDeterminerNounAgreementIrregular_1,
"blimp_determiner_noun_agreement_irregular_2": blimp.BlimpDeterminerNounAgreementIrregular_2,
"blimp_determiner_noun_agreement_with_adj_2": blimp.BlimpDeterminerNounAgreementWithAdj_2,
"blimp_determiner_noun_agreement_with_adj_irregular_1": blimp.BlimpDeterminerNounAgreementWithAdjIrregular_1,
"blimp_determiner_noun_agreement_with_adj_irregular_2": blimp.BlimpDeterminerNounAgreementWithAdjIrregular_2,
"blimp_determiner_noun_agreement_with_adjective_1": blimp.BlimpDeterminerNounAgreementWithAdjective_1,
"blimp_distractor_agreement_relational_noun": blimp.BlimpDistractorAgreementRelationalNoun,
"blimp_distractor_agreement_relative_clause": blimp.BlimpDistractorAgreementRelativeClause,
"blimp_drop_argument": blimp.BlimpDropArgument,
"blimp_ellipsis_n_bar_1": blimp.BlimpEllipsisNBar_1,
"blimp_ellipsis_n_bar_2": blimp.BlimpEllipsisNBar_2,
"blimp_existential_there_object_raising": blimp.BlimpExistentialThereObjectRaising,
"blimp_existential_there_quantifiers_1": blimp.BlimpExistentialThereQuantifiers_1,
"blimp_existential_there_quantifiers_2": blimp.BlimpExistentialThereQuantifiers_2,
"blimp_existential_there_subject_raising": blimp.BlimpExistentialThereSubjectRaising,
"blimp_expletive_it_object_raising": blimp.BlimpExpletiveItObjectRaising,
"blimp_inchoative": blimp.BlimpInchoative,
"blimp_intransitive": blimp.BlimpIntransitive,
"blimp_irregular_past_participle_adjectives": blimp.BlimpIrregularPastParticipleAdjectives,
"blimp_irregular_past_participle_verbs": blimp.BlimpIrregularPastParticipleVerbs,
"blimp_irregular_plural_subject_verb_agreement_1": blimp.BlimpIrregularPluralSubjectVerbAgreement_1,
"blimp_irregular_plural_subject_verb_agreement_2": blimp.BlimpIrregularPluralSubjectVerbAgreement_2,
"blimp_left_branch_island_echo_question": blimp.BlimpLeftBranchIslandEchoQuestion,
"blimp_left_branch_island_simple_question": blimp.BlimpLeftBranchIslandSimpleQuestion,
"blimp_matrix_question_npi_licensor_present": blimp.BlimpMatrixQuestionNpiLicensorPresent,
"blimp_npi_present_1": blimp.BlimpNpiPresent_1,
"blimp_npi_present_2": blimp.BlimpNpiPresent_2,
"blimp_only_npi_licensor_present": blimp.BlimpOnlyNpiLicensorPresent,
"blimp_only_npi_scope": blimp.BlimpOnlyNpiScope,
"blimp_passive_1": blimp.BlimpPassive_1,
"blimp_passive_2": blimp.BlimpPassive_2,
"blimp_principle_A_c_command": blimp.BlimpPrinciple_ACCommand,
"blimp_principle_A_case_1": blimp.BlimpPrinciple_ACase_1,
"blimp_principle_A_case_2": blimp.BlimpPrinciple_ACase_2,
"blimp_principle_A_domain_1": blimp.BlimpPrinciple_ADomain_1,
"blimp_principle_A_domain_2": blimp.BlimpPrinciple_ADomain_2,
"blimp_principle_A_domain_3": blimp.BlimpPrinciple_ADomain_3,
"blimp_principle_A_reconstruction": blimp.BlimpPrinciple_AReconstruction,
"blimp_regular_plural_subject_verb_agreement_1": blimp.BlimpRegularPluralSubjectVerbAgreement_1,
"blimp_regular_plural_subject_verb_agreement_2": blimp.BlimpRegularPluralSubjectVerbAgreement_2,
"blimp_sentential_negation_npi_licensor_present": blimp.BlimpSententialNegationNpiLicensorPresent,
"blimp_sentential_negation_npi_scope": blimp.BlimpSententialNegationNpiScope,
"blimp_sentential_subject_island": blimp.BlimpSententialSubjectIsland,
"blimp_superlative_quantifiers_1": blimp.BlimpSuperlativeQuantifiers_1,
"blimp_superlative_quantifiers_2": blimp.BlimpSuperlativeQuantifiers_2,
"blimp_tough_vs_raising_1": blimp.BlimpToughVsRaising_1,
"blimp_tough_vs_raising_2": blimp.BlimpToughVsRaising_2,
"blimp_transitive": blimp.BlimpTransitive,
"blimp_wh_island": blimp.BlimpWhIsland,
"blimp_wh_questions_object_gap": blimp.BlimpWhQuestionsObjectGap,
"blimp_wh_questions_subject_gap": blimp.BlimpWhQuestionsSubjectGap,
"blimp_wh_questions_subject_gap_long_distance": blimp.BlimpWhQuestionsSubjectGapLongDistance,
"blimp_wh_vs_that_no_gap": blimp.BlimpWhVsThatNoGap,
"blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
"blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
"blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
}
......
"""
ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
https://arxiv.org/abs/2106.15772
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
from lm_eval.base import Task
from pathlib import Path
from best_download import download_file
import xml.etree.ElementTree as ET
from lm_eval.base import rf
from lm_eval.metrics import mean,perplexity
import numpy as np
from zipfile import ZipFile
import os
#currently ignoring formula for answer generation
# given a subset, splits return the docs
class Asdiv(Task):
VERSION = 0
DATASET_PATH = Path("data/asdiv")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH)
url = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
checksum = "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"
zip_path = self.DATASET_PATH / "55790e5270bb91ccfa5053194b25732534696b50.zip"
download_file(url, str(zip_path), checksum)
with ZipFile(zip_path, "r") as zip:
zip.extractall(self.DATASET_PATH)
os.remove(zip_path)
def _convert_standard(self, problem):
#TODO: include solution-type and formula
out_doc = {
"question" : problem.find('Question').text,
"body" : problem.find('Body').text,
"answer": problem.find('Answer').text
}
return out_doc
def load_docs(self, textfilename, tfds=False):
tree = ET.parse(textfilename)
root = tree.getroot()
for pid, problem in enumerate(root.iter('Problem')):
out_doc = self._convert_standard(problem)
yield out_doc
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
raise NotImplementedError("This dataset has no training docs")
def test_docs(self):
raise NotImplementedError("This dataset has no test docs")
def validation_docs(self):
data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50/dataset/ASDiv.xml"
return self.load_docs(data_xml_path)
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
def fewshot_description(self):
# TODO: add solution-type and formula
desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
return desc
def doc_to_text(self, doc):
# TODO: add solution-type
return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
def doc_to_target(self, doc):
# TODO: add formula
answer = doc['answer'].split(' (')[0]
return " " + answer
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
return ll, is_greedy
def process_results(self, doc, results):
ll, is_greedy = results
return {
'acc': int(is_greedy)
}
def aggregation(self):
return {
'acc': mean
}
def higher_is_better(self):
return {
'acc': True
}
"""
BLiMP: A Benchmark of Linguistic Minimal Pairs for English
https://arxiv.org/abs/1912.00582
@article{warstadt2019blimp,
title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English},
author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei, and Wang, Sheng-Fu and Bowman, Samuel R},
journal={arXiv preprint arXiv:1912.00582},
year={2019}
}
"""
from lm_eval.base import rf
from lm_eval.metrics import mean
from .common import HFTask
class BlimpTask(HFTask):
VERSION = 0
DATASET_PATH = "blimp"
def download(self):
super().download()
# The HF dataset only contains a "train" dataset, but the harness expects a "validation"
# dataset. Let's use the training dataset, on the assumption that the model wasn't actually
# trained on this data.
self.data["validation"] = self.data["train"]
del self.data["train"]
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
assert num_fewshot == 0
assert not provide_description
return ""
def doc_to_text(self, doc):
# this method is invoked by tests only
return ""
def doc_to_target(self, doc):
# this method is invoked by tests only
return ""
def construct_requests(self, doc, ctx):
assert not ctx
# Calculate the loglikelihood for the good and the bad sentence.
# Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
return [
rf.loglikelihood("", doc["sentence_good"]),
rf.loglikelihood("", doc["sentence_bad"]),
]
def process_results(self, doc, results):
likelihood1, likelihood2 = results
# the model got this case right iff the good sentence scored higher than the bad sentence
acc = 1.0 if likelihood1 > likelihood2 else 0.0
return {
"acc": acc,
}
def higher_is_better(self):
return {
"acc": True,
}
def aggregation(self):
return {
"acc": mean,
}
class BlimpAdjunctIsland(BlimpTask):
DATASET_NAME = "adjunct_island"
class BlimpAnaphorGenderAgreement(BlimpTask):
DATASET_NAME = "anaphor_gender_agreement"
class BlimpAnaphorNumberAgreement(BlimpTask):
DATASET_NAME = "anaphor_number_agreement"
class BlimpAnimateSubjectPassive(BlimpTask):
DATASET_NAME = "animate_subject_passive"
class BlimpAnimateSubjectTrans(BlimpTask):
DATASET_NAME = "animate_subject_trans"
class BlimpCausative(BlimpTask):
DATASET_NAME = "causative"
class BlimpComplex_NPIsland(BlimpTask):
DATASET_NAME = "complex_NP_island"
class BlimpCoordinateStructureConstraintComplexLeftBranch(BlimpTask):
DATASET_NAME = "coordinate_structure_constraint_complex_left_branch"
class BlimpCoordinateStructureConstraintObjectExtraction(BlimpTask):
DATASET_NAME = "coordinate_structure_constraint_object_extraction"
class BlimpDeterminerNounAgreement_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_1"
class BlimpDeterminerNounAgreement_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_2"
class BlimpDeterminerNounAgreementIrregular_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_irregular_1"
class BlimpDeterminerNounAgreementIrregular_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_irregular_2"
class BlimpDeterminerNounAgreementWithAdj_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adj_2"
class BlimpDeterminerNounAgreementWithAdjIrregular_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adj_irregular_1"
class BlimpDeterminerNounAgreementWithAdjIrregular_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adj_irregular_2"
class BlimpDeterminerNounAgreementWithAdjective_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adjective_1"
class BlimpDistractorAgreementRelationalNoun(BlimpTask):
DATASET_NAME = "distractor_agreement_relational_noun"
class BlimpDistractorAgreementRelativeClause(BlimpTask):
DATASET_NAME = "distractor_agreement_relative_clause"
class BlimpDropArgument(BlimpTask):
DATASET_NAME = "drop_argument"
class BlimpEllipsisNBar_1(BlimpTask):
DATASET_NAME = "ellipsis_n_bar_1"
class BlimpEllipsisNBar_2(BlimpTask):
DATASET_NAME = "ellipsis_n_bar_2"
class BlimpExistentialThereObjectRaising(BlimpTask):
DATASET_NAME = "existential_there_object_raising"
class BlimpExistentialThereQuantifiers_1(BlimpTask):
DATASET_NAME = "existential_there_quantifiers_1"
class BlimpExistentialThereQuantifiers_2(BlimpTask):
DATASET_NAME = "existential_there_quantifiers_2"
class BlimpExistentialThereSubjectRaising(BlimpTask):
DATASET_NAME = "existential_there_subject_raising"
class BlimpExpletiveItObjectRaising(BlimpTask):
DATASET_NAME = "expletive_it_object_raising"
class BlimpInchoative(BlimpTask):
DATASET_NAME = "inchoative"
class BlimpIntransitive(BlimpTask):
DATASET_NAME = "intransitive"
class BlimpIrregularPastParticipleAdjectives(BlimpTask):
DATASET_NAME = "irregular_past_participle_adjectives"
class BlimpIrregularPastParticipleVerbs(BlimpTask):
DATASET_NAME = "irregular_past_participle_verbs"
class BlimpIrregularPluralSubjectVerbAgreement_1(BlimpTask):
DATASET_NAME = "irregular_plural_subject_verb_agreement_1"
class BlimpIrregularPluralSubjectVerbAgreement_2(BlimpTask):
DATASET_NAME = "irregular_plural_subject_verb_agreement_2"
class BlimpLeftBranchIslandEchoQuestion(BlimpTask):
DATASET_NAME = "left_branch_island_echo_question"
class BlimpLeftBranchIslandSimpleQuestion(BlimpTask):
DATASET_NAME = "left_branch_island_simple_question"
class BlimpMatrixQuestionNpiLicensorPresent(BlimpTask):
DATASET_NAME = "matrix_question_npi_licensor_present"
class BlimpNpiPresent_1(BlimpTask):
DATASET_NAME = "npi_present_1"
class BlimpNpiPresent_2(BlimpTask):
DATASET_NAME = "npi_present_2"
class BlimpOnlyNpiLicensorPresent(BlimpTask):
DATASET_NAME = "only_npi_licensor_present"
class BlimpOnlyNpiScope(BlimpTask):
DATASET_NAME = "only_npi_scope"
class BlimpPassive_1(BlimpTask):
DATASET_NAME = "passive_1"
class BlimpPassive_2(BlimpTask):
DATASET_NAME = "passive_2"
class BlimpPrinciple_ACCommand(BlimpTask):
DATASET_NAME = "principle_A_c_command"
class BlimpPrinciple_ACase_1(BlimpTask):
DATASET_NAME = "principle_A_case_1"
class BlimpPrinciple_ACase_2(BlimpTask):
DATASET_NAME = "principle_A_case_2"
class BlimpPrinciple_ADomain_1(BlimpTask):
DATASET_NAME = "principle_A_domain_1"
class BlimpPrinciple_ADomain_2(BlimpTask):
DATASET_NAME = "principle_A_domain_2"
class BlimpPrinciple_ADomain_3(BlimpTask):
DATASET_NAME = "principle_A_domain_3"
class BlimpPrinciple_AReconstruction(BlimpTask):
DATASET_NAME = "principle_A_reconstruction"
class BlimpRegularPluralSubjectVerbAgreement_1(BlimpTask):
DATASET_NAME = "regular_plural_subject_verb_agreement_1"
class BlimpRegularPluralSubjectVerbAgreement_2(BlimpTask):
DATASET_NAME = "regular_plural_subject_verb_agreement_2"
class BlimpSententialNegationNpiLicensorPresent(BlimpTask):
DATASET_NAME = "sentential_negation_npi_licensor_present"
class BlimpSententialNegationNpiScope(BlimpTask):
DATASET_NAME = "sentential_negation_npi_scope"
class BlimpSententialSubjectIsland(BlimpTask):
DATASET_NAME = "sentential_subject_island"
class BlimpSuperlativeQuantifiers_1(BlimpTask):
DATASET_NAME = "superlative_quantifiers_1"
class BlimpSuperlativeQuantifiers_2(BlimpTask):
DATASET_NAME = "superlative_quantifiers_2"
class BlimpToughVsRaising_1(BlimpTask):
DATASET_NAME = "tough_vs_raising_1"
class BlimpToughVsRaising_2(BlimpTask):
DATASET_NAME = "tough_vs_raising_2"
class BlimpTransitive(BlimpTask):
DATASET_NAME = "transitive"
class BlimpWhIsland(BlimpTask):
DATASET_NAME = "wh_island"
class BlimpWhQuestionsObjectGap(BlimpTask):
DATASET_NAME = "wh_questions_object_gap"
class BlimpWhQuestionsSubjectGap(BlimpTask):
DATASET_NAME = "wh_questions_subject_gap"
class BlimpWhQuestionsSubjectGapLongDistance(BlimpTask):
DATASET_NAME = "wh_questions_subject_gap_long_distance"
class BlimpWhVsThatNoGap(BlimpTask):
DATASET_NAME = "wh_vs_that_no_gap"
class BlimpWhVsThatNoGapLongDistance(BlimpTask):
DATASET_NAME = "wh_vs_that_no_gap_long_distance"
class BlimpWhVsThatWithGap(BlimpTask):
DATASET_NAME = "wh_vs_that_with_gap"
class BlimpWhVsThatWithGapLongDistance(BlimpTask):
DATASET_NAME = "wh_vs_that_with_gap_long_distance"
......@@ -14,15 +14,16 @@ Acknowledgement: This implementation is based on the official evaluation for `DR
https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
"""
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
class DROP(Task):
VERSION = 0
VERSION = 1
DATASET_PATH = Path("data/drop")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH)
Path.mkdir(self.DATASET_PATH, parents=True)
url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
zip_path = self.DATASET_PATH / "drop_dataset.zip"
......@@ -50,19 +51,34 @@ class DROP(Task):
"id": qa["query_id"],
"passage": doc["passage"],
"question": qa["question"],
"answers": self.get_answers(qa["answer"]),
"answers": self.get_answers(qa),
}
@classmethod
def get_answers(cls, answers):
# NOTE: We wrap every non-`list` answer into a list for uniformity.
if answers["number"] != "":
return [str(answers["number"])]
if answers["spans"] != []:
return answers["spans"]
return [" ".join([answers["date"]["day"],
answers["date"]["month"],
answers["date"]["year"]]).strip()]
def get_answers(cls, qa):
answers = []
answers_set = set()
candidates = [qa["answer"]] + qa.get("validated_answers", [])
for candidate in candidates:
answer = cls.parse_answer(candidate)
if answer in answers_set:
continue
answers_set.add(answer)
answers.append(answer)
return answers
@classmethod
def parse_answer(cls, answer):
# NOTE: Everything is returned as a tuple for uniformity and hashability.
if answer["number"] != "":
return (str(answer["number"]),)
if answer["spans"] != []:
return tuple(answer["spans"])
return (" ".join([answer["date"]["day"],
answer["date"]["month"],
answer["date"]["year"]]).strip(),)
def training_docs(self):
docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json"))
......@@ -76,7 +92,7 @@ class DROP(Task):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc):
return " " + ", ".join(doc["answers"])
return " " + ", ".join(doc["answers"][0])
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
......@@ -89,9 +105,7 @@ class DROP(Task):
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
conts = []
for _ in doc["answers"]:
conts.append(rf.greedy_until(ctx, ["."]))
conts = [rf.greedy_until(ctx, ["."])]
return conts
def process_results(self, doc, results):
......@@ -105,66 +119,96 @@ class DROP(Task):
The results of the requests created in construct_requests.
"""
preds, golds = results, doc["answers"]
exact_match, f1_score = self.get_metrics(preds, golds)
max_em = 0
max_f1 = 0
for gold_answer in golds:
exact_match, f1_score = self.get_metrics(preds, gold_answer)
if gold_answer[0].strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
return {
"em": exact_match,
"f1": f1_score
"em": max_em,
"f1": max_f1
}
def get_metrics(self, preds, golds):
exact_match = self._exact_match(preds, golds)
f1_score = self._f1_score(preds, golds)
return exact_match, f1_score
def _exact_match(self, preds, golds):
""" Returns the exact match of normalized gold answers and predictions. """
normalized_preds = [self._normalize(pred) for pred in preds]
normalized_golds = [self._normalize(gold) for gold in golds]
is_equal_sets = set(normalized_preds) == set(normalized_golds)
is_equal_length = len(normalized_preds) == len(normalized_golds)
return int(is_equal_sets and is_equal_length)
def _f1_score(self, preds, golds):
"""Returns the average F1-score over normalized gold answers and predictions.
From Section 5 of Dua et al. "DROP:...":
"When an answer has multiple spans, we first perform a one-to-one
alignment greedily based on bag-of-word overlap on the set of spans
and then compute average F1 over each span."
def get_metrics(self, predicted, gold):
"""
Takes a predicted answer and a gold answer (that are both either a string or a list of
strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
validation, or while training), this is the function you want to call, after using
:func:`answer_json_to_strings` when reading the gold answer from the released data file.
"""
pred_bags = self._answer_to_bags(preds)
gold_bags = self._answer_to_bags(golds)
f1_per_bag = self._align_bags(pred_bags, gold_bags)
return np.mean(f1_per_bag)
def _answer_to_bags(self, answers):
return [set(self._normalize(answer).split()) for answer in answers]
def _align_bags(self, pred_bags, gold_bags):
""" Returns the max metric value over all the answers. """
scores = np.zeros([len(gold_bags), len(pred_bags)])
for gold_index, gold_bag in enumerate(gold_bags):
for pred_index, pred_bag in enumerate(pred_bags):
if self._is_number_match(pred_bag, gold_bag):
scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag)
predicted_bags = self._answer_to_bags(predicted)
gold_bags = self._answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
exact_match = 1.0
else:
exact_match = 0.0
f1_per_bag = self._align_bags(predicted_bags[1], gold_bags[1])
f1 = np.mean(f1_per_bag)
f1 = round(f1, 2)
return exact_match, f1
def _answer_to_bags(self, answer):
if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
raw_spans = [answer]
normalized_spans = []
token_bags = []
for raw_span in raw_spans:
normalized_span = self._normalize(raw_span)
normalized_spans.append(normalized_span)
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags
def _align_bags(self, predicted, gold):
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if self._match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
max_scores = np.zeros([max(len(gold), len(predicted))])
for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores
def _bag_f1(self, pred_bag, gold_bag):
intersection = len(gold_bag.intersection(pred_bag))
if intersection == 0:
return 0.0
precision = intersection / float(len(pred_bag)) if pred_bag else 1.0
recall = intersection / float(len(gold_bag)) if gold_bag else 1.0
f1 = (2 * precision * recall) / (precision + recall)
def _compute_f1(self, predicted_bag, gold_bag):
intersection = len(gold_bag.intersection(predicted_bag))
if not predicted_bag:
precision = 1.0
else:
precision = intersection / float(len(predicted_bag))
if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1
def _is_number_match(self, pred_bag, gold_bag):
pred_numbers = set([word for word in pred_bag if self._is_number(word)])
gold_numbers = set([word for word in gold_bag if self._is_number(word)])
if (not gold_numbers) or gold_numbers.intersection(pred_numbers):
def _match_numbers_if_present(self, gold_bag, predicted_bag):
gold_numbers = set()
predicted_numbers = set()
for word in gold_bag:
if self._is_number(word):
gold_numbers.add(word)
for word in predicted_bag:
if self._is_number(word):
predicted_numbers.add(word)
if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
return True
return False
......@@ -175,30 +219,29 @@ class DROP(Task):
except ValueError:
return False
def _normalize(self, answer):
def remove_articles(text):
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def _remove_articles(self, text):
return _ARTICLES.sub(" ", text)
def white_space_fix(text):
return " ".join(text.split())
def _white_space_fix(self, text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
if not self._is_number(text):
return "".join(ch for ch in text if ch not in exclude)
else:
return text
def _remove_punc(self, text):
exclude = set(string.punctuation)
if not self._is_number(text):
return "".join(ch for ch in text if ch not in exclude)
else:
return text
def fix_number(text):
return str(float(text)) if self._is_number(text) else text
def _fix_number(self, text):
return str(float(text)) if self._is_number(text) else text
def tokenize(text):
return re.split(" |-", text)
def _tokenize(self, text):
return re.split(" |-", text)
def _normalize(self, answer):
tokens = [
white_space_fix(remove_articles(fix_number(remove_punc(token.lower()))))
for token in tokenize(answer)
self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
for token in self._tokenize(answer)
]
tokens = [token for token in tokens if token.strip()]
normalized = " ".join(tokens).strip()
......
......@@ -227,7 +227,7 @@ class QNLI(HFTask):
class WNLI(HFTask):
VERSION = 0
VERSION = 1
DATASET_PATH = "glue"
DATASET_NAME = "wnli"
......@@ -241,26 +241,25 @@ class WNLI(HFTask):
return False
def doc_to_text(self, doc):
return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"],
doc["sentence2"],
)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
# False = not_entailment
return " {}".format({0: "False", 1: "True"}[doc["label"]])
def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_neither, ll_false
return ll_true, ll_false
def process_results(self, doc, results):
ll_true, ll_false = results
pred = ll_true > ll_false
gold = doc["label"]
pred = np.argmax(results)
return {
"acc": pred == gold
}
......
......@@ -2,10 +2,9 @@ from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
class HeadQABase(HFTask, MultipleChoiceTask):
VERSION = 0
DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
......@@ -31,3 +30,15 @@ class HeadQA(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc):
return doc["query"]
class HeadQAEn(HeadQABase):
DATASET_NAME = "en"
class HeadQAEs(HeadQABase):
DATASET_NAME = "es"
# for backwards compatibility
class HeadQAEsDeprecated(HeadQABase):
DATASET_NAME = "es"
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
......@@ -3,6 +3,7 @@ from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh
from best_download import download_file
import os
class LAMBADA(Task):
......@@ -10,11 +11,12 @@ class LAMBADA(Task):
def download(self):
sh("mkdir -p data/lambada")
try:
download_file(
"http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
)
if not os.path.exists("data/lambada/lambada_test.jsonl"):
download_file(
"http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
)
except:
# fallback - for some reason best_download doesnt work all the time here
sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
......
from . import lambada
from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh
from best_download import download_file
import json
from functools import partial
import os
# This task is lambada but machine-translated to the other languages.
LANGS = ["en", "fr", "de", "it", "es"]
CHECKSUMS = {"en": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226",
"fr": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362",
"de": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e",
"it": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850",
"es": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"
}
class MultilingualLAMBADA(lambada.LAMBADA):
VERSION = 0
def __init__(self, lang=None):
self.LANG = lang
super().__init__()
def download(self):
sh("mkdir -p data/lambada")
f = f"data/lambada/lambada_test_{self.LANG}.jsonl"
url = f"http://eaidata.bmk.sh/data/lambada_test_{self.LANG}.jsonl"
try:
if not os.path.exists(f):
download_file(
url,
f,
CHECKSUMS[self.LANG]
)
except:
# fallback - for some reason best_download doesnt work all the time here
sh(f"wget {url} -O {f}")
sh(f'echo "{CHECKSUMS[self.LANG]} {f}" | sha256sum --check')
def validation_docs(self):
with open(f"data/lambada/lambada_test_{self.LANG}.jsonl") as fh:
for line in fh:
yield json.loads(line)
class MultilingualLAMBADAEN(MultilingualLAMBADA):
def __init__(self):
super().__init__('en')
class MultilingualLAMBADAFR(MultilingualLAMBADA):
def __init__(self):
super().__init__('fr')
class MultilingualLAMBADADE(MultilingualLAMBADA):
def __init__(self):
super().__init__('de')
class MultilingualLAMBADAIT(MultilingualLAMBADA):
def __init__(self):
super().__init__('it')
class MultilingualLAMBADAES(MultilingualLAMBADA):
def __init__(self):
super().__init__('es')
LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR, MultilingualLAMBADADE, MultilingualLAMBADAIT, MultilingualLAMBADAES]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"lambada_mt_{lang}"] = lang_class
return tasks
......@@ -10,7 +10,7 @@ class LogiQA(MultipleChoiceTask):
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH)
Path.mkdir(self.DATASET_PATH, parents=True)
base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
splits = [
{"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},
......
"""
“Going on a vacation” takes longer than “Going for a walk”:
A Study of Temporal Commonsense Understanding
https://arxiv.org/pdf/1909.03065.pdf
WARNING: Running this task with a `--limit` arg will give misleading results! The
corresponding dataset is structured such that each multiple-choice-question gathered
by the authors is split into question-option pairs, where each such pair gets
siloed into an individual document for plausibility testing. Because the harness
shuffles these documents, setting `--limit` will likely "cut off" certain candidate
answers. This is a problem because the task's metrics require an exhaustive evaluation
of a question's options. See section 4 of the paper for details.
@inproceedings{ZKNR19,
author = {Ben Zhou, Daniel Khashabi, Qiang Ning and Dan Roth},
title = {“Going on a vacation” takes longer than “Going for a walk”: A Study of Temporal Commonsense Understanding },
booktitle = {EMNLP},
year = {2019},
}
"""
import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from . common import HFTask
class MCTACO(HFTask):
VERSION = 0
DATASET_PATH = "mc_taco"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
def doc_to_target(self, doc):
return " " + ["no", "yes"][doc['label']]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_no, _ = rf.loglikelihood(ctx, " no")
ll_yes, _ = rf.loglikelihood(ctx, " yes")
return ll_no, ll_yes
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
ll_no, ll_yes = results
gold = doc['label']
pred = int(ll_yes > ll_no)
question_id = self._question2id(doc)
items = (gold, pred, question_id)
return {
"em": items,
"f1": items
}
def _question2id(self, doc):
""" Returns an identifier for the question in the given document. """
return " ".join([doc['sentence'], doc['question']])
def aggregation(self):
return {
"f1": f1,
"em": exact_match,
}
def higher_is_better(self):
return {
"f1": True,
"em": True,
}
def exact_match(items):
"""
Counts a question as correct if the model accurately classifies the plausibility
of an answer for all candidate answers. See section 4 "Evaluation Metrics" in the paper.
"""
results = list(zip(*items))
accuracies = defaultdict(list)
for gold, pred, question in zip(results[0], results[1], results[2]):
accuracies[question].append(pred == gold)
return np.mean([int(all(accs)) for accs in accuracies.values()])
def f1(items):
""" See section 4 "Evaluation Metrics" in the paper about the F1 metric used. """
results = list(zip(*items))
# Group the positive ("yes" = 1) golds and predictions by question.
gold_positives, pred_positives = defaultdict(list), defaultdict(list)
for gold, pred, question in zip(results[0], results[1], results[2]):
gold_positives[question].append(gold)
pred_positives[question].append(pred)
f1 = []
for question in gold_positives.keys():
gp, pp = sum(gold_positives[question]), sum(pred_positives[question])
tp = sum(np.logical_and(gold_positives[question], pred_positives[question]))
p = tp / pp if pp > 0.0 else 1.0
r = tp / gp if gp > 0.0 else 1.0
if p + r > 0.0:
f1.append(2. * (p * r) / (p + r))
return np.mean(f1)
"""
MuTual: A Dataset for Multi-Turn Dialogue Reasoning
https://www.aclweb.org/anthology/2020.acl-main.130/
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
"""
import json
import zipfile
import shutil
import numpy as np
from pathlib import Path
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
class MuTualBase(Task):
VERSION = 1
BASE_PATH = Path("data/mutual")
DATASET_NAME = None
CHOICES = ['A', 'B', 'C', 'D']
def __init__(self):
super().__init__()
def download(self):
if self.BASE_PATH.exists():
return
Path.mkdir(self.BASE_PATH, parents=True)
master_zip = Path("data/master.zip")
download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip",
str(master_zip),
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
# Remove left over files and directories.
master_zip.unlink()
shutil.rmtree("data/MuTual-master")
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def _load_docs(self, path):
for file in sorted(path.iterdir()):
if file.suffix != ".txt":
continue
with open(file, 'r', encoding='utf-8') as f:
yield json.load(f)
def training_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
def validation_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
def test_docs(self):
return NotImplemented
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return self.detokenize(doc["article"])
def doc_to_target(self, doc):
return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])
def construct_requests(self, doc, ctx):
lls = []
for option in doc["options"]:
lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0])
return lls
def detokenize(self, text):
text = text.replace(" '", "'")
text = text.replace(" \n", "\n")
text = text.replace("\n ", "\n")
text = text.replace(" n't", "n't")
text = text.replace("`` ", '"')
text = text.replace("''", '"')
# punctuation
text = text.replace(" :", ":")
text = text.replace(" ;", ";")
text = text.replace(" !", "!")
text = text.replace(" ?", "?")
text = text.replace(" ,", ",")
text = text.replace(" .", ".")
return text
def process_results(self, doc, results):
gold = self.CHOICES.index(doc["answers"])
r4_1 = np.argmax(results) == gold # r4_1 = accuracy
ranks = sorted(results, reverse=True)
r4_2 = (ranks.index(results[gold]) == 1) + r4_1
mrr = 1. / (ranks.index(results[gold]) + 1) # `+ 1` for index offset
return {
"r@1": r4_1,
"r@2": r4_2,
"mrr": mrr
}
def aggregation(self):
return {
"r@1": mean,
"r@2": mean,
"mrr": mean
}
def higher_is_better(self):
return {
"r@1": True,
"r@2": True,
"mrr": True
}
class MuTual(MuTualBase):
DATASET_NAME = Path("mutual")
class MuTualPlus(MuTualBase):
DATASET_NAME = Path("mutual_plus")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment