Unverified Commit 31ebb599 authored by Stella Biderman, committed by GitHub

Merge branch 'master' into multilingual

parents 38c04a0f 8728710c
...@@ -23,11 +23,11 @@ jobs:
path: |
~/.cache
# An explicit key for restoring and saving the cache
key: evaldata-cache-4
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9.7
- name: Install dependencies
run: |
python -m pip install --upgrade pip
...@@ -42,7 +42,7 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest -vv --cov=lm_eval/ tests/
- name: Upload to codecov
run: |
bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
@software{eval-harness,
author = {Gao, Leo and
Tow, Jonathan and
Biderman, Stella and
Black, Sid and
DiPofi, Anthony and
Foster, Charles and
Golding, Laurence and
Hsu, Jeffrey and
McDonell, Kyle and
Muennighoff, Niklas and
Phang, Jason and
Reynolds, Laria and
Tang, Eric and
Thite, Anish and
Wang, Ben and
Wang, Kevin and
Zou, Andy},
title = {A framework for few-shot language model evaluation},
month = sep,
year = 2021,
publisher = {Zenodo},
version = {v0.0.1},
doi = {10.5281/zenodo.5371628},
url = {https://doi.org/10.5281/zenodo.5371628}
}
import abc
from typing import Iterable
import numpy as np
import re
import os
import json
import hashlib
from sqlitedict import SqliteDict
from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
from lm_eval import utils
from abc import abstractmethod
class LM(abc.ABC):
def __init__(self):
self.cache_hook = CacheHook(None)
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
...@@ -34,7 +43,7 @@ class LM(abc.ABC):
"""
pass
@abstractmethod
def loglikelihood_rolling(self, requests):
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
...@@ -77,7 +86,7 @@ class LM(abc.ABC):
pass
# TODO: Add an optional max length
@abstractmethod
def greedy_until(self, requests):
"""Generate greedily until a stopping sequence
...@@ -96,18 +105,235 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
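# A minimal usage sketch of create_from_arg_string, assuming
# utils.simple_parse_args_string turns a comma-separated "key=value" string into a
# kwargs dict; HFLM is the Hugging Face model class defined later in this diff, and
# None-valued entries in additional_config are dropped by the comprehension above.
from lm_eval.models.gpt2 import HFLM
lm = HFLM.create_from_arg_string("pretrained=gpt2", additional_config={"device": "cpu", "batch_size": None})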
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
class BaseLM(LM):
@property
@abstractmethod
def eot_token_id(self):
pass
@property
@abstractmethod
def max_length(self):
pass
@property
@abstractmethod
def max_gen_toks(self):
pass
@property
@abstractmethod
def batch_size(self):
pass
@property
@abstractmethod
def device(self):
pass
@abstractmethod
def tok_encode(self, string: str): pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]): pass
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
pass
# subclass must implement properties vocab_size, eot_token_id, max_gen_toks, batch_size, device, max_length.
# TODO: enforce this somehow
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
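# A small sketch of the request/response contract here, assuming `lm` is any BaseLM
# subclass instance (e.g. the HFLM defined later in this diff): each request is a
# (context, continuation) pair, each result a (log-likelihood, is_greedy) pair.
results = lm.loglikelihood([("The capital of France is", " Paris")])
loglikelihood, is_greedy = results[0]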
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
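# A toy illustration of the rolling-window bookkeeping above, reusing the same utils
# calls with made-up token ids and a tiny window size; each element is an
# (input_tokens, pred_tokens) pair, and the per-window scores summed here give the
# full-string log-likelihood.
toy_tokens = list(range(10))
toy_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
    token_list=toy_tokens,
    prefix_token=-1,
    max_seq_len=4,
    context_len=1,
)))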
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be overestimates rather than underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
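# For example (hypothetical token lists), this key sorts the longest
# context + continuation first:
#   _collate((("ctx", "cont"), [1], [2, 3]))          -> (-3, (1, 2, 3))
#   _collate((("ctx", "cont"), [1, 2, 3, 4], [5, 6])) -> (-6, (1, 2, 3, 4, 5, 6))
# so the 6-token request leads its batch, fixing the padded length up front and
# surfacing any OOM immediately.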
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
cont_toks_list = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works:
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# gpt2 \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1],
dtype=torch.long
).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad length from seq to padding_length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length]
multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks \
in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return reord.get_original(res)
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
res = []
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str):
until = [until]
primary_until, = self.tok_encode(until[0])
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
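# A minimal sketch of the greedy_until contract, assuming `lm` is a BaseLM subclass:
# each request is (context, until), where `until` is a stop string or list of stop
# strings, and each result is the generated text cut off at the first stop sequence.
completions = lm.greedy_until([("Question: What is the capital of France?\nAnswer:", ["\n"])])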
class Task(abc.ABC):
...@@ -128,17 +354,17 @@ class Task(abc.ABC):
"""Downloads the task dataset if necessary"""
pass
@abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
...@@ -170,15 +396,15 @@ class Task(abc.ABC):
return rnd.sample(self._training_docs, k)
@abstractmethod
def doc_to_text(self, doc):
pass
@abstractmethod
def doc_to_target(self, doc):
pass
@abstractmethod
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
...@@ -192,7 +418,7 @@ class Task(abc.ABC):
"""
pass
@abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
...@@ -205,7 +431,7 @@ class Task(abc.ABC):
"""
pass
@abstractmethod
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
...@@ -214,7 +440,7 @@ class Task(abc.ABC):
"""
pass
@abstractmethod
def higher_is_better(self):
"""
:returns: {str: bool}
...@@ -238,7 +464,9 @@ class Task(abc.ABC):
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
...@@ -253,7 +481,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
class MultipleChoiceTask(Task, abc.ABC):
def doc_to_target(self, doc):
return " " + doc['choices'][doc['gold']]
...@@ -328,39 +556,30 @@ class PerplexityTask(Task, abc.ABC):
def process_results(self, doc, results):
loglikelihood, = results
words = self.count_words(doc)
bytes_ = self.count_bytes(doc)
return {
"word_perplexity": (loglikelihood, words),
"byte_perplexity": (loglikelihood, bytes_),
"bits_per_byte": (loglikelihood, bytes_),
}
def aggregation(self):
return {
"word_perplexity": weighted_perplexity,
"byte_perplexity": weighted_perplexity,
"bits_per_byte": bits_per_byte,
}
@classmethod
def count_bytes(cls, doc):
return len(doc.encode("utf-8"))
@classmethod
def count_words(cls, doc):
""" Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc))
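# A worked example (made-up numbers) of how these (loglikelihood, weight) pairs are
# aggregated by the metrics imported at the top of this file: with
# items = [(-20.0, 8), (-10.0, 4)], weighted_mean is -30/12 = -2.5, so
# byte_perplexity = exp(2.5) and bits_per_byte = 2.5 / ln(2) = log2(byte_perplexity).
import math
from lm_eval.metrics import weighted_perplexity, bits_per_byte
items = [(-20.0, 8), (-10.0, 4)]
assert math.isclose(weighted_perplexity(items), math.exp(2.5))
assert math.isclose(bits_per_byte(items), math.log2(weighted_perplexity(items)))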
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
import os
import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
...@@ -383,9 +602,17 @@ class CacheHook:
class CachingLM:
def __init__(self, lm, cache_db):
"""LM wrapper that returns cached results if they exist, and uses the underlying LM if not.
:param lm: LM
Underlying LM
:param cache_db: str
Path to cache db
"""
self.lm = lm
self.cache_db = cache_db
if os.path.dirname(cache_db):
os.makedirs(os.path.dirname(cache_db), exist_ok=True)
self.dbdict = SqliteDict(cache_db, autocommit=True)
# add hook to lm
...@@ -409,13 +636,14 @@ class CachingLM:
res.append(None)
remaining_reqs.append(req)
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
# stick the new ones back into the list and also cache any of the new ones
resptr = 0
for req, r in zip(remaining_reqs, rem_res):
while res[resptr] is not None:
resptr += 1
res[resptr] = r
...@@ -431,32 +659,39 @@ class CachingLM:
return CacheHook(self)
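# A minimal usage sketch, assuming the dummy model registered in
# lm_eval/models/__init__.py can be constructed with no arguments and that attribute
# access on CachingLM dispatches through the caching wrapper shown above:
import lm_eval.models
lm = CachingLM(lm_eval.models.get_model("dummy")(), "lm_cache/dummy.db")
lm.loglikelihood([("The capital of France is", " Paris")])
# an identical second call is served from the sqlite cache instead of the model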
REQUEST_RETURN_LENGTHS = {
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
class Request:
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
class RequestFactory:
def __getattr__(self, attr):
...
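# A short sketch of how indexed requests behave given REQUEST_RETURN_LENGTHS:
# loglikelihood requests return two values, so iterating a Request yields one
# sub-request per return value and the evaluator later picks resp[req.index].
req = Request("loglikelihood", ("Some context", " some continuation"))
loglikelihood_req, greedy_req = req
print(loglikelihood_req.index, greedy_req.index)  # 0 1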
...@@ -2,12 +2,96 @@ import collections
import itertools
import random
import lm_eval.metrics
import lm_eval.models
import lm_eval.tasks
import lm_eval.base
import numpy as np
def simple_evaluate(model, model_args, task_names,
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param model: str
Name of model, see lm_eval.models.get_model
:param model_args: str
String arguments for each model class, see LM.create_from_arg_string
:param task_names: list[str]
List of task names
:param num_fewshot: int
Number of examples in few-shot context
:param batch_size: int, optional
Batch size for model
:param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool
Whether or not to cache
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
random.seed(1234)
np.random.seed(1234)
lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, {
'batch_size': batch_size, 'device': device
})
if not no_cache:
lm = lm_eval.base.CachingLM(
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict(task_names)
results = evaluate(lm, task_dict, False, num_fewshot, limit)
# add info about the model and few shot config
results["config"] = {
"model": model,
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"device": device,
"no_cache": no_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters
}
return results
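# A minimal end-to-end sketch; both task names appear in lm_eval.tasks.TASK_REGISTRY
# later in this diff, and make_table is defined below in this file.
results = simple_evaluate(
    model="gpt2",
    model_args="pretrained=gpt2",
    task_names=["lambada", "piqa"],
    num_fewshot=0,
    batch_size=1,
    device="cpu",
)
print(make_table(results))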
def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_iters=100000):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:return
Dictionary of results
"""
# TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
# TODO: todo: implement proper description-providing system
assert not provide_description # not implemented.
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
versions = collections.defaultdict(dict)
...@@ -15,23 +99,25 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
requests = collections.defaultdict(list)
requests_origin = collections.defaultdict(list)
# If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger
# memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because
# over-engineering is bad (or we could make it write the requests to disk and then read them back out again
# - probably using an sqlite db because of all the moving parts we have
# TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
docs = {}
# get lists of each type of request
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
task_doc_func = task.test_docs
elif task.has_validation_docs():
task_doc_func = task.validation_docs
else:
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(task_doc_func())
...@@ -50,25 +136,26 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
# execute each type of request
for reqtype, reqs in requests.items():
# TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
# only in index. We could implement some kind of caching, but that would be more of a band-aid
# solution. we could also implement some kind of auto-grouping here;
# they should end up next to each other.
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
...@@ -93,11 +180,49 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit, bootstrap_i
task = task_dict[task_name]
results[task_name][metric] = task.aggregation()[metric](items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None:
results[task_name][metric + "_stderr"] = stderr(items)
return {
"results": dict(results),
"versions": dict(versions)
}
def make_table(result_dict):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
values = []
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
for m, v in dic.items():
if m.endswith("_stderr"):
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else:
values.append([k, version, m, '%.4f' % v, '', ''])
k = ""
version = ""
md_writer.value_matrix = values
latex_writer.value_matrix = values
# todo: make latex table look good
# print(latex_writer.dumps())
return md_writer.dumps()
import math
from collections.abc import Iterable
from pprint import pprint
import numpy as np
import sacrebleu
...@@ -63,6 +62,7 @@ def acc_all(items):
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def acc_all_stderr(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
...@@ -98,9 +98,13 @@ def weighted_mean(items):
a, b = zip(*items)
return sum(a) / sum(b)
def weighted_perplexity(items):
return math.exp(-weighted_mean(items))
def bits_per_byte(items):
return -weighted_mean(items) / math.log(2)
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
...@@ -179,12 +183,13 @@ def _sacreformat(refs, preds):
return refs, preds
# stderr stuff
class _bootstrap_internal:
def __init__(self, f, n):
self.f = f
self.n = n
def __call__(self, v):
i, xs = v
rnd = random.Random()
...@@ -208,7 +213,9 @@ def bootstrap_stderr(f, xs, iters):
chunk_size = min(1000, iters)
from tqdm import tqdm
print("bootstrapping for stddev:", f.__name__)
for bootstrap in tqdm(pool.imap(
_bootstrap_internal(f, chunk_size),
[(i, xs) for i in range(iters // chunk_size)]), total=iters // chunk_size):
# sample w replacement
res.extend(bootstrap)
...
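# A small usage sketch of the standard-error machinery above, mirroring how the
# evaluator calls stderr_for_metric; whether the returned estimator is closed-form or
# bootstrapped is an implementation detail, and None means no estimator is available.
import lm_eval.metrics as metrics
stderr_fn = metrics.stderr_for_metric(metric=metrics.mean, bootstrap_iters=1000)
if stderr_fn is not None:
    print(stderr_fn([0, 1, 1, 0, 1]))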
...@@ -3,6 +3,7 @@ from . import gpt3
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"dummy": dummy.DummyLM,
...
import transformers import transformers
import torch import torch
import torch.nn as nn from lm_eval.base import BaseLM
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
class GPT2LM(LM): class HFLM(BaseLM):
MAX_GEN_TOKS = 256
VOCAB_SIZE = 50257
EOT_TOKEN_ID = 50256
def __init__(self, device='cuda', pretrained='gpt2', batch_size=1): def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__() super().__init__()
assert isinstance(device, str)
assert isinstance(pretrained, str)
assert isinstance(batch_size, int)
if device: if device:
self.device = torch.device(device) self._device = torch.device(device)
else: else:
self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(pretrained).to(self.device)
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval() self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hardcoding this to gpt2 # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') self.tokenizer = transformers.AutoTokenizer.from_pretrained(
self.tokenizer.pad_token = "<|endoftext|>" pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
try:
self.max_length = self.gpt2.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparantly
self.max_length = self.gpt2.config.max_position_embeddings
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373] assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
# multithreading and batching self.vocab_size = self.tokenizer.vocab_size
gpus = torch.cuda.device_count()
batch_size_per_gpu = batch_size # todo: adaptive batch size
self.batch_size = batch_size_per_gpu * gpus if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu # TODO: fix multi-gpu
# gpus = torch.cuda.device_count()
# if gpus > 1: # if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2) # self.gpt2 = nn.DataParallel(self.gpt2)
@classmethod @property
def create_from_arg_string(cls, arg_string, additional_config={}): def eot_token_id(self):
args = utils.simple_parse_args_string(arg_string) # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
args2 = {k: v for k, v in additional_config.items() if v is not None} return self.tokenizer.eos_token_id
return cls(**args, **args2)
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in requests:
if context == "":
# end of text as context
context_enc = [self.EOT_TOKEN_ID]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
# TODO: Implement caching once we've confirmed the perplexity implementation
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
with torch.no_grad():
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tokenizer.encode(string),
prefix_token=self.EOT_TOKEN_ID,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
with torch.no_grad():
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch padded context length.
# this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return (-len(toks), tuple(toks))
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
contlens = []
inplens = []
padding_length = None
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works: @property
# CTX CONT def max_length(self):
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] try:
# gpt2 \ \ return self.gpt2.config.n_ctx
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice except AttributeError:
# cont_toks 4 5 6 7 8 9 # gptneoconfig doesn't have n_ctx apparently
return self.gpt2.config.max_position_embeddings
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length+1):][:-1]
, dtype=torch.long).to(self.device)
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = padding_length if padding_length is not None else inplen
# pad to length
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0))
contlens.append(cont)
inplens.append(inplen)
multi_logits = F.log_softmax(self._model_call(torch.cat(inps, dim=0)), dim=-1).cpu() # [batch, seq, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(chunk, multi_logits, inps, inplens, contlens):
contlen = len(cont_toks)
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
# cont_toks :: [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0)
max_equal = (greedy_tokens == cont_toks).all()
#last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
answer = (float(logits.sum()), bool(max_equal)) @property
def max_gen_toks(self):
return 256
# partial caching @property
if cache_key is not None: def batch_size(self):
self.cache_hook.add_partial("loglikelihood", cache_key, answer) # TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
res.append(answer) @property
def device(self):
# TODO: fix multi-gpu
return self._device
return reord.get_original(res) def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps): def _model_call(self, inps):
""" """
inps: a torch tensor of shape [batch, sequence] inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the returns: a torch tensor of shape [batch, sequence, vocab] with the
logits retuned from the model logits returned from the model
""" """
return self.gpt2(inps)[0][:, :, :50257] with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def greedy_until(self, requests): def _model_generate(self, context, max_length, eos_token_id):
# TODO: implement fully general `until` that handles untils that are return self.gpt2.generate(
# multiple tokens or that span multiple tokens correctly context,
res = [] max_length=max_length,
eos_token_id=eos_token_id,
def _collate(x): do_sample=False
toks = self.tokenizer.encode(x[0]) )
return (len(toks), x[0])
reord = utils.Reorderer(requests, _collate) # for backwards compatibility
GPT2LM = HFLM
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context)[self.MAX_GEN_TOKS - self.max_length:]]).to(self.device)
primary_until, = self.tokenizer.encode(until[0])
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
import os
import numpy as np
import transformers
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import time
def get_result(response, ctxlen):
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
is_greedy: bool
whether argmax matches given continuation exactly
"""
is_greedy = True is_greedy = True
logprobs = response["logprobs"]["token_logprobs"] logprobs = response["logprobs"]["token_logprobs"]
continuation_logprobs = sum(logprobs[ctxlen:]) continuation_logprobs = sum(logprobs[ctxlen:])
...@@ -24,8 +36,11 @@ def get_result(response, ctxlen): ...@@ -24,8 +36,11 @@ def get_result(response, ctxlen):
def oa_completion(**kwargs): def oa_completion(**kwargs):
import openai """ Query OpenAI API for completion.
Retry with back-off until they respond
"""
import openai
backoff_time = 3 backoff_time = 3
while True: while True:
try: try:
...@@ -35,11 +50,8 @@ def oa_completion(**kwargs): ...@@ -35,11 +50,8 @@ def oa_completion(**kwargs):
backoff_time *= 1.5 backoff_time *= 1.5
class GPT3LM(LM): class GPT3LM(BaseLM):
MAX_LENGTH = 2048
REQ_CHUNK_SIZE = 20 REQ_CHUNK_SIZE = 20
MAX_GEN_TOKS = 256
def __init__(self, engine, truncate=False): def __init__(self, engine, truncate=False):
""" """
...@@ -50,10 +62,12 @@ class GPT3LM(LM): ...@@ -50,10 +62,12 @@ class GPT3LM(LM):
Truncate input if too long (if False and input is too long, throw error) Truncate input if too long (if False and input is too long, throw error)
""" """
super().__init__() super().__init__()
import openai import openai
self.engine = engine self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away # to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>" self.tokenizer.pad_token = "<|endoftext|>"
...@@ -64,53 +78,36 @@ class GPT3LM(LM): ...@@ -64,53 +78,36 @@ class GPT3LM(LM):
# Read from environment variable OPENAI_API_SECRET_KEY # Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"] openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod @property
def create_from_arg_string(cls, arg_string, additional_config={}): def eot_token_id(self):
args = utils.simple_parse_args_string(arg_string) return self.tokenizer.eos_token_id
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2) @property
def max_length(self):
def loglikelihood(self, requests): # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
new_reqs = [] return 2048
for context, continuation in requests:
if context == "": @property
# end of text as context def max_gen_toks(self):
context_enc = [50256] return 256
else:
context_enc = self.tokenizer.encode(context) @property
def batch_size(self):
continuation_enc = self.tokenizer.encode(continuation) # Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
new_reqs.append(((context, continuation), context_enc, continuation_enc))
@property
return self._loglikelihood_tokens(new_reqs) def device(self):
# Isn't used because we override _loglikelihood_tokens
def loglikelihood_rolling(self, requests): raise NotImplementedError()
# TODO: switch implementation to use _loglikelihood_tokens rather than having it do its own thing
def tok_encode(self, string: str):
loglikelihoods = [] return self.tokenizer.encode(string, add_special_tokens=False)
for string, in tqdm(requests):
encoded = self.tokenizer.encode_plus(string)["input_ids"] def tok_decode(self, tokens):
rolling_token_windows = utils.get_rolling_token_windows( return self.tokenizer.decode(tokens)
token_list=encoded,
prefix_token=self.end_of_text_token_id, def _loglikelihood_tokens(self, requests, disable_tqdm=False):
max_seq_len=self.MAX_LENGTH,
context_len=1,
)
string_loglikelihoods = []
for input_tokens, pred_tokens in rolling_token_windows:
block_output = self.get_token_logprobs(
input_tokens=input_tokens,
pred_tokens=pred_tokens,
)
string_loglikelihoods.append(block_output["logprobs"])
string_loglikelihoods = np.concatenate(string_loglikelihoods).sum()
loglikelihoods.append(string_loglikelihoods)
return loglikelihoods
def _loglikelihood_tokens(self, requests):
import openai
res = [] res = []
def _collate(x): def _collate(x):
...@@ -118,16 +115,18 @@ class GPT3LM(LM): ...@@ -118,16 +115,18 @@ class GPT3LM(LM):
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't # we care about and so we need some kind of backup for when it isn't
toks = x[1] + x[2] toks = x[1] + x[2]
return (-len(toks), tuple(toks)) return -len(toks), tuple(toks)
reord = utils.Reorderer(requests, _collate) reord = utils.Reorderer(requests, _collate)
for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))): for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
inps = [] inps = []
ctxlens = [] ctxlens = []
for cache_key, context_enc, continuation_enc in chunk: for cache_key, context_enc, continuation_enc in chunk:
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:] # max_length+1 because the API takes up to 2049 tokens, including the first context token
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH) inp = (context_enc + continuation_enc)[-(self.max_length+1):]
# TODO: the logic is much simpler if we just look at the length of continuation tokens
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length+1))
inps.append(inp) inps.append(inp)
ctxlens.append(ctxlen) ctxlens.append(ctxlen)
...@@ -151,35 +150,14 @@ class GPT3LM(LM): ...@@ -151,35 +150,14 @@ class GPT3LM(LM):
return reord.get_original(res) return reord.get_original(res)
def get_token_logprobs(self, input_tokens, pred_tokens):
pred_start = len(input_tokens) - len(pred_tokens) + 1
# We're going to stitch together the input_tokens and pred_tokens
# In the longest case, this gets us to length = max_seq_len+1 (which the API works with)
assert input_tokens[pred_start:] == pred_tokens[:-1]
token_ids = input_tokens + [pred_tokens[-1]]
response = oa_completion(
engine=self.engine,
prompt=token_ids,
max_tokens=0,
temperature=0.0,
logprobs=0,
echo=True,
)
logprobs = np.array(response["choices"][0]["logprobs"]["token_logprobs"][pred_start:])
positions = np.arange(pred_start-1, pred_start-1 + len(token_ids[pred_start:]))
return {
"logprobs": logprobs,
"positions": positions,
}
def greedy_until(self, requests): def greedy_until(self, requests):
if not requests: return [] if not requests:
import openai return []
res = [] res = []
def _collate(x): def _collate(x):
toks = self.tokenizer.encode(x[0]) toks = self.tok_encode(x[0])
return (len(toks), x[0]) return len(toks), x[0]
reord = utils.Reorderer(requests, _collate) reord = utils.Reorderer(requests, _collate)
...@@ -193,34 +171,43 @@ class GPT3LM(LM): ...@@ -193,34 +171,43 @@ class GPT3LM(LM):
lastuntil = x[1] lastuntil = x[1]
ret.append(x) ret.append(x)
if ret: yield ret, lastuntil if ret:
yield ret, lastuntil
# todo: more intelligent batching for heterogenous `until` # todo: more intelligent batching for heterogeneous `until`
for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))): for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
inps = [] inps = []
for context, _ in chunk: for context, _ in chunk:
context_enc = self.tokenizer.encode(context) context_enc = self.tok_encode(context)
inp = context_enc[-(self.MAX_LENGTH - self.MAX_GEN_TOKS):] inp = context_enc[-(self.max_length - self.max_gen_toks):]
inps.append(inp) inps.append(inp)
response = oa_completion( response = oa_completion(
engine=self.engine, engine=self.engine,
prompt=inps, prompt=inps,
max_tokens=self.MAX_GEN_TOKS, max_tokens=self.max_gen_toks,
temperature=0., temperature=0.,
logprobs=10, logprobs=10,
stop=until stop=until,
) )
for resp, (context, until) in zip(response.choices, chunk): for resp, (context, until_) in zip(response.choices, chunk):
s = resp['text'] s = resp['text']
for term in until: for term in until_:
s = s.split(term)[0] s = s.split(term)[0]
# partial caching # partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s) self.cache_hook.add_partial("greedy_until", (context, until_), s)
res.append(s) res.append(s)
return reord.get_original(res) return reord.get_original(res)
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
...@@ -22,6 +22,7 @@ from . import lambada
from . import race
from . import piqa
from . import prost
from . import mc_taco
from . import triviaqa
from . import pubmedqa
from . import sciq
...@@ -42,6 +43,11 @@ from . import pile
from . import wikitext
from . import xquad
from . import mlqa
from . import lambada_multilingual
from . import mutual
from . import truthfulqa
from . import blimp
from . import asdiv
########################################
# Translation tasks
...@@ -99,12 +105,17 @@ TASK_REGISTRY = {
"drop": drop.DROP,
"lambada": lambada.LAMBADA,
"lambada_cloze": lambada_cloze.LAMBADA_cloze,
# multilingual lambada
**lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText,
# "cbt-cn": cbt.CBTCN, # disabled pending context length fix
# "cbt-ne": cbt.CBTNE, # disabled pending context length fix
"piqa": piqa.PiQA,
"prost": prost.PROST,
"mc_taco": mc_taco.MCTACO,
# Science related
"pubmedqa" : pubmedqa.Pubmed_QA,
...@@ -143,7 +154,9 @@ TASK_REGISTRY = {
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn,
"mathqa": mathqa.MathQA,
"webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273,
...@@ -159,6 +172,13 @@ TASK_REGISTRY = {
"ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
"ethics_virtue": hendrycks_ethics.EthicsVirtue,
"truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
"truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
# dialogue
"mutual": mutual.MuTual,
"mutual_plus": mutual.MuTualPlus,
# math
"math_algebra": hendrycks_math.MathAlgebra,
"math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
...@@ -167,6 +187,7 @@ TASK_REGISTRY = {
"math_num_theory": hendrycks_math.MathNumberTheory,
"math_prealgebra": hendrycks_math.MathPrealgebra,
"math_precalc": hendrycks_math.MathPrecalculus,
"math_asdiv": asdiv.Asdiv,
# arithmetic
"arithmetic_2da": arithmetic.Arithmetic2DPlus,
...@@ -220,6 +241,75 @@ TASK_REGISTRY = {
"pile_ubuntu-irc": pile.PileUbuntuIrc,
"pile_wikipedia": pile.PileWikipedia,
"pile_youtubesubtitles": pile.PileYoutubeSubtitles,
# BLiMP
"blimp_adjunct_island": blimp.BlimpAdjunctIsland,
"blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
"blimp_anaphor_number_agreement": blimp.BlimpAnaphorNumberAgreement,
"blimp_animate_subject_passive": blimp.BlimpAnimateSubjectPassive,
"blimp_animate_subject_trans": blimp.BlimpAnimateSubjectTrans,
"blimp_causative": blimp.BlimpCausative,
"blimp_complex_NP_island": blimp.BlimpComplex_NPIsland,
"blimp_coordinate_structure_constraint_complex_left_branch": blimp.BlimpCoordinateStructureConstraintComplexLeftBranch,
"blimp_coordinate_structure_constraint_object_extraction": blimp.BlimpCoordinateStructureConstraintObjectExtraction,
"blimp_determiner_noun_agreement_1": blimp.BlimpDeterminerNounAgreement_1,
"blimp_determiner_noun_agreement_2": blimp.BlimpDeterminerNounAgreement_2,
"blimp_determiner_noun_agreement_irregular_1": blimp.BlimpDeterminerNounAgreementIrregular_1,
"blimp_determiner_noun_agreement_irregular_2": blimp.BlimpDeterminerNounAgreementIrregular_2,
"blimp_determiner_noun_agreement_with_adj_2": blimp.BlimpDeterminerNounAgreementWithAdj_2,
"blimp_determiner_noun_agreement_with_adj_irregular_1": blimp.BlimpDeterminerNounAgreementWithAdjIrregular_1,
"blimp_determiner_noun_agreement_with_adj_irregular_2": blimp.BlimpDeterminerNounAgreementWithAdjIrregular_2,
"blimp_determiner_noun_agreement_with_adjective_1": blimp.BlimpDeterminerNounAgreementWithAdjective_1,
"blimp_distractor_agreement_relational_noun": blimp.BlimpDistractorAgreementRelationalNoun,
"blimp_distractor_agreement_relative_clause": blimp.BlimpDistractorAgreementRelativeClause,
"blimp_drop_argument": blimp.BlimpDropArgument,
"blimp_ellipsis_n_bar_1": blimp.BlimpEllipsisNBar_1,
"blimp_ellipsis_n_bar_2": blimp.BlimpEllipsisNBar_2,
"blimp_existential_there_object_raising": blimp.BlimpExistentialThereObjectRaising,
"blimp_existential_there_quantifiers_1": blimp.BlimpExistentialThereQuantifiers_1,
"blimp_existential_there_quantifiers_2": blimp.BlimpExistentialThereQuantifiers_2,
"blimp_existential_there_subject_raising": blimp.BlimpExistentialThereSubjectRaising,
"blimp_expletive_it_object_raising": blimp.BlimpExpletiveItObjectRaising,
"blimp_inchoative": blimp.BlimpInchoative,
"blimp_intransitive": blimp.BlimpIntransitive,
"blimp_irregular_past_participle_adjectives": blimp.BlimpIrregularPastParticipleAdjectives,
"blimp_irregular_past_participle_verbs": blimp.BlimpIrregularPastParticipleVerbs,
"blimp_irregular_plural_subject_verb_agreement_1": blimp.BlimpIrregularPluralSubjectVerbAgreement_1,
"blimp_irregular_plural_subject_verb_agreement_2": blimp.BlimpIrregularPluralSubjectVerbAgreement_2,
"blimp_left_branch_island_echo_question": blimp.BlimpLeftBranchIslandEchoQuestion,
"blimp_left_branch_island_simple_question": blimp.BlimpLeftBranchIslandSimpleQuestion,
"blimp_matrix_question_npi_licensor_present": blimp.BlimpMatrixQuestionNpiLicensorPresent,
"blimp_npi_present_1": blimp.BlimpNpiPresent_1,
"blimp_npi_present_2": blimp.BlimpNpiPresent_2,
"blimp_only_npi_licensor_present": blimp.BlimpOnlyNpiLicensorPresent,
"blimp_only_npi_scope": blimp.BlimpOnlyNpiScope,
"blimp_passive_1": blimp.BlimpPassive_1,
"blimp_passive_2": blimp.BlimpPassive_2,
"blimp_principle_A_c_command": blimp.BlimpPrinciple_ACCommand,
"blimp_principle_A_case_1": blimp.BlimpPrinciple_ACase_1,
"blimp_principle_A_case_2": blimp.BlimpPrinciple_ACase_2,
"blimp_principle_A_domain_1": blimp.BlimpPrinciple_ADomain_1,
"blimp_principle_A_domain_2": blimp.BlimpPrinciple_ADomain_2,
"blimp_principle_A_domain_3": blimp.BlimpPrinciple_ADomain_3,
"blimp_principle_A_reconstruction": blimp.BlimpPrinciple_AReconstruction,
"blimp_regular_plural_subject_verb_agreement_1": blimp.BlimpRegularPluralSubjectVerbAgreement_1,
"blimp_regular_plural_subject_verb_agreement_2": blimp.BlimpRegularPluralSubjectVerbAgreement_2,
"blimp_sentential_negation_npi_licensor_present": blimp.BlimpSententialNegationNpiLicensorPresent,
"blimp_sentential_negation_npi_scope": blimp.BlimpSententialNegationNpiScope,
"blimp_sentential_subject_island": blimp.BlimpSententialSubjectIsland,
"blimp_superlative_quantifiers_1": blimp.BlimpSuperlativeQuantifiers_1,
"blimp_superlative_quantifiers_2": blimp.BlimpSuperlativeQuantifiers_2,
"blimp_tough_vs_raising_1": blimp.BlimpToughVsRaising_1,
"blimp_tough_vs_raising_2": blimp.BlimpToughVsRaising_2,
"blimp_transitive": blimp.BlimpTransitive,
"blimp_wh_island": blimp.BlimpWhIsland,
"blimp_wh_questions_object_gap": blimp.BlimpWhQuestionsObjectGap,
"blimp_wh_questions_subject_gap": blimp.BlimpWhQuestionsSubjectGap,
"blimp_wh_questions_subject_gap_long_distance": blimp.BlimpWhQuestionsSubjectGapLongDistance,
"blimp_wh_vs_that_no_gap": blimp.BlimpWhVsThatNoGap,
"blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
"blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
"blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
} }
......
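With the entries above in place, each new task is addressed through the registry by its string name. A minimal lookup sketch (illustrative only; it assumes the base Task constructor calls download(), which the download() overrides in the files below rely on):
task_class = TASK_REGISTRY["math_asdiv"]    # -> asdiv.Asdiv, defined below
task = task_class()                         # fetches and unpacks the ASDiv zip on first use
docs = list(task.validation_docs())         # problems parsed from ASDiv.xml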
"""
ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers
https://arxiv.org/abs/2106.15772
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
from lm_eval.base import Task
from pathlib import Path
from best_download import download_file
import xml.etree.ElementTree as ET
from lm_eval.base import rf
from lm_eval.metrics import mean, perplexity
import numpy as np
from zipfile import ZipFile
import os
# Currently ignoring the formula field when generating answers.
# Given a split, the *_docs methods below yield the parsed problem docs.
class Asdiv(Task):
VERSION = 0
DATASET_PATH = Path("data/asdiv")
def download(self):
if self.DATASET_PATH.exists():
return
Path.mkdir(self.DATASET_PATH)
url = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
checksum = "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"
zip_path = self.DATASET_PATH / "55790e5270bb91ccfa5053194b25732534696b50.zip"
download_file(url, str(zip_path), checksum)
with ZipFile(zip_path, "r") as zip:
zip.extractall(self.DATASET_PATH)
os.remove(zip_path)
def _convert_standard(self, problem):
#TODO: include solution-type and formula
out_doc = {
"question" : problem.find('Question').text,
"body" : problem.find('Body').text,
"answer": problem.find('Answer').text
}
return out_doc
def load_docs(self, textfilename, tfds=False):
tree = ET.parse(textfilename)
root = tree.getroot()
for pid, problem in enumerate(root.iter('Problem')):
out_doc = self._convert_standard(problem)
yield out_doc
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
raise NotImplementedError("This dataset has no training docs")
def test_docs(self):
raise NotImplementedError("This dataset has no test docs")
def validation_docs(self):
data_xml_path = self.DATASET_PATH / "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50/dataset/ASDiv.xml"
return self.load_docs(data_xml_path)
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting."
return super().fewshot_context(doc, num_fewshot, provide_description, rnd)
def fewshot_description(self):
# TODO: add solution-type and formula
desc = "information containing the context of the question\nQuestion: Text of a question.\nAnswer: Answer to the question, based on the passage.\n"
return desc
def doc_to_text(self, doc):
# TODO: add solution-type
return doc['body'] + '\n' + 'Question:' + doc['question'] + '\n' + 'Answer:'
def doc_to_target(self, doc):
# TODO: add formula
answer = doc['answer'].split(' (')[0]
return " " + answer
def construct_requests(self, doc, ctx):
ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc))
return ll, is_greedy
def process_results(self, doc, results):
ll, is_greedy = results
return {
'acc': int(is_greedy)
}
def aggregation(self):
return {
'acc': mean
}
def higher_is_better(self):
return {
'acc': True
}
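# For reference, a sketch of how one problem is rendered by doc_to_text / doc_to_target
# above. The field values here are invented for illustration, not copied from the dataset.
#
#   doc = {"body": "Seven red apples and two green apples are in the basket.",
#          "question": "How many apples are in the basket?",
#          "answer": "9 (apples)"}
#
#   doc_to_text(doc)   -> "<body>\nQuestion:How many apples are in the basket?\nAnswer:"
#   doc_to_target(doc) -> " 9"   (the " (apples)" unit annotation is stripped)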
"""
BLiMP: A Benchmark of Linguistic Minimal Pairs for English
https://arxiv.org/abs/1912.00582
@article{warstadt2019blimp,
title={BLiMP: A Benchmark of Linguistic Minimal Pairs for English},
author={Warstadt, Alex and Parrish, Alicia and Liu, Haokun and Mohananey, Anhad and Peng, Wei and Wang, Sheng-Fu and Bowman, Samuel R},
journal={arXiv preprint arXiv:1912.00582},
year={2019}
}
"""
from lm_eval.base import rf
from lm_eval.metrics import mean
from .common import HFTask
class BlimpTask(HFTask):
VERSION = 0
DATASET_PATH = "blimp"
def download(self):
super().download()
# The HF dataset only contains a "train" dataset, but the harness expects a "validation"
# dataset. Let's use the training dataset, on the assumption that the model wasn't actually
# trained on this data.
self.data["validation"] = self.data["train"]
del self.data["train"]
def fewshot_context(self, doc, num_fewshot, provide_description, rnd):
assert num_fewshot == 0
assert not provide_description
return ""
def doc_to_text(self, doc):
# this method is invoked by tests only
return ""
def doc_to_target(self, doc):
# this method is invoked by tests only
return ""
def construct_requests(self, doc, ctx):
assert not ctx
# Calculate the loglikelihood for the good and the bad sentence.
# Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
return [
rf.loglikelihood("", doc["sentence_good"]),
rf.loglikelihood("", doc["sentence_bad"]),
]
def process_results(self, doc, results):
likelihood1, likelihood2 = results
# the model got this case right iff the good sentence scored higher than the bad sentence
acc = 1.0 if likelihood1 > likelihood2 else 0.0
return {
"acc": acc,
}
def higher_is_better(self):
return {
"acc": True,
}
def aggregation(self):
return {
"acc": mean,
}
class BlimpAdjunctIsland(BlimpTask):
DATASET_NAME = "adjunct_island"
class BlimpAnaphorGenderAgreement(BlimpTask):
DATASET_NAME = "anaphor_gender_agreement"
class BlimpAnaphorNumberAgreement(BlimpTask):
DATASET_NAME = "anaphor_number_agreement"
class BlimpAnimateSubjectPassive(BlimpTask):
DATASET_NAME = "animate_subject_passive"
class BlimpAnimateSubjectTrans(BlimpTask):
DATASET_NAME = "animate_subject_trans"
class BlimpCausative(BlimpTask):
DATASET_NAME = "causative"
class BlimpComplex_NPIsland(BlimpTask):
DATASET_NAME = "complex_NP_island"
class BlimpCoordinateStructureConstraintComplexLeftBranch(BlimpTask):
DATASET_NAME = "coordinate_structure_constraint_complex_left_branch"
class BlimpCoordinateStructureConstraintObjectExtraction(BlimpTask):
DATASET_NAME = "coordinate_structure_constraint_object_extraction"
class BlimpDeterminerNounAgreement_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_1"
class BlimpDeterminerNounAgreement_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_2"
class BlimpDeterminerNounAgreementIrregular_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_irregular_1"
class BlimpDeterminerNounAgreementIrregular_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_irregular_2"
class BlimpDeterminerNounAgreementWithAdj_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adj_2"
class BlimpDeterminerNounAgreementWithAdjIrregular_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adj_irregular_1"
class BlimpDeterminerNounAgreementWithAdjIrregular_2(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adj_irregular_2"
class BlimpDeterminerNounAgreementWithAdjective_1(BlimpTask):
DATASET_NAME = "determiner_noun_agreement_with_adjective_1"
class BlimpDistractorAgreementRelationalNoun(BlimpTask):
DATASET_NAME = "distractor_agreement_relational_noun"
class BlimpDistractorAgreementRelativeClause(BlimpTask):
DATASET_NAME = "distractor_agreement_relative_clause"
class BlimpDropArgument(BlimpTask):
DATASET_NAME = "drop_argument"
class BlimpEllipsisNBar_1(BlimpTask):
DATASET_NAME = "ellipsis_n_bar_1"
class BlimpEllipsisNBar_2(BlimpTask):
DATASET_NAME = "ellipsis_n_bar_2"
class BlimpExistentialThereObjectRaising(BlimpTask):
DATASET_NAME = "existential_there_object_raising"
class BlimpExistentialThereQuantifiers_1(BlimpTask):
DATASET_NAME = "existential_there_quantifiers_1"
class BlimpExistentialThereQuantifiers_2(BlimpTask):
DATASET_NAME = "existential_there_quantifiers_2"
class BlimpExistentialThereSubjectRaising(BlimpTask):
DATASET_NAME = "existential_there_subject_raising"
class BlimpExpletiveItObjectRaising(BlimpTask):
DATASET_NAME = "expletive_it_object_raising"
class BlimpInchoative(BlimpTask):
DATASET_NAME = "inchoative"
class BlimpIntransitive(BlimpTask):
DATASET_NAME = "intransitive"
class BlimpIrregularPastParticipleAdjectives(BlimpTask):
DATASET_NAME = "irregular_past_participle_adjectives"
class BlimpIrregularPastParticipleVerbs(BlimpTask):
DATASET_NAME = "irregular_past_participle_verbs"
class BlimpIrregularPluralSubjectVerbAgreement_1(BlimpTask):
DATASET_NAME = "irregular_plural_subject_verb_agreement_1"
class BlimpIrregularPluralSubjectVerbAgreement_2(BlimpTask):
DATASET_NAME = "irregular_plural_subject_verb_agreement_2"
class BlimpLeftBranchIslandEchoQuestion(BlimpTask):
DATASET_NAME = "left_branch_island_echo_question"
class BlimpLeftBranchIslandSimpleQuestion(BlimpTask):
DATASET_NAME = "left_branch_island_simple_question"
class BlimpMatrixQuestionNpiLicensorPresent(BlimpTask):
DATASET_NAME = "matrix_question_npi_licensor_present"
class BlimpNpiPresent_1(BlimpTask):
DATASET_NAME = "npi_present_1"
class BlimpNpiPresent_2(BlimpTask):
DATASET_NAME = "npi_present_2"
class BlimpOnlyNpiLicensorPresent(BlimpTask):
DATASET_NAME = "only_npi_licensor_present"
class BlimpOnlyNpiScope(BlimpTask):
DATASET_NAME = "only_npi_scope"
class BlimpPassive_1(BlimpTask):
DATASET_NAME = "passive_1"
class BlimpPassive_2(BlimpTask):
DATASET_NAME = "passive_2"
class BlimpPrinciple_ACCommand(BlimpTask):
DATASET_NAME = "principle_A_c_command"
class BlimpPrinciple_ACase_1(BlimpTask):
DATASET_NAME = "principle_A_case_1"
class BlimpPrinciple_ACase_2(BlimpTask):
DATASET_NAME = "principle_A_case_2"
class BlimpPrinciple_ADomain_1(BlimpTask):
DATASET_NAME = "principle_A_domain_1"
class BlimpPrinciple_ADomain_2(BlimpTask):
DATASET_NAME = "principle_A_domain_2"
class BlimpPrinciple_ADomain_3(BlimpTask):
DATASET_NAME = "principle_A_domain_3"
class BlimpPrinciple_AReconstruction(BlimpTask):
DATASET_NAME = "principle_A_reconstruction"
class BlimpRegularPluralSubjectVerbAgreement_1(BlimpTask):
DATASET_NAME = "regular_plural_subject_verb_agreement_1"
class BlimpRegularPluralSubjectVerbAgreement_2(BlimpTask):
DATASET_NAME = "regular_plural_subject_verb_agreement_2"
class BlimpSententialNegationNpiLicensorPresent(BlimpTask):
DATASET_NAME = "sentential_negation_npi_licensor_present"
class BlimpSententialNegationNpiScope(BlimpTask):
DATASET_NAME = "sentential_negation_npi_scope"
class BlimpSententialSubjectIsland(BlimpTask):
DATASET_NAME = "sentential_subject_island"
class BlimpSuperlativeQuantifiers_1(BlimpTask):
DATASET_NAME = "superlative_quantifiers_1"
class BlimpSuperlativeQuantifiers_2(BlimpTask):
DATASET_NAME = "superlative_quantifiers_2"
class BlimpToughVsRaising_1(BlimpTask):
DATASET_NAME = "tough_vs_raising_1"
class BlimpToughVsRaising_2(BlimpTask):
DATASET_NAME = "tough_vs_raising_2"
class BlimpTransitive(BlimpTask):
DATASET_NAME = "transitive"
class BlimpWhIsland(BlimpTask):
DATASET_NAME = "wh_island"
class BlimpWhQuestionsObjectGap(BlimpTask):
DATASET_NAME = "wh_questions_object_gap"
class BlimpWhQuestionsSubjectGap(BlimpTask):
DATASET_NAME = "wh_questions_subject_gap"
class BlimpWhQuestionsSubjectGapLongDistance(BlimpTask):
DATASET_NAME = "wh_questions_subject_gap_long_distance"
class BlimpWhVsThatNoGap(BlimpTask):
DATASET_NAME = "wh_vs_that_no_gap"
class BlimpWhVsThatNoGapLongDistance(BlimpTask):
DATASET_NAME = "wh_vs_that_no_gap_long_distance"
class BlimpWhVsThatWithGap(BlimpTask):
DATASET_NAME = "wh_vs_that_with_gap"
class BlimpWhVsThatWithGapLongDistance(BlimpTask):
DATASET_NAME = "wh_vs_that_with_gap_long_distance"
...@@ -14,15 +14,16 @@ Acknowledgement: This implementation is based on the official evaluation for `DR ...@@ -14,15 +14,16 @@ Acknowledgement: This implementation is based on the official evaluation for `DR
https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
""" """
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
class DROP(Task): class DROP(Task):
VERSION = 0 VERSION = 1
DATASET_PATH = Path("data/drop") DATASET_PATH = Path("data/drop")
def download(self): def download(self):
if self.DATASET_PATH.exists(): if self.DATASET_PATH.exists():
return return
Path.mkdir(self.DATASET_PATH) Path.mkdir(self.DATASET_PATH, parents=True)
url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip" url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6" checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
zip_path = self.DATASET_PATH / "drop_dataset.zip" zip_path = self.DATASET_PATH / "drop_dataset.zip"
...@@ -50,19 +51,34 @@ class DROP(Task): ...@@ -50,19 +51,34 @@ class DROP(Task):
"id": qa["query_id"], "id": qa["query_id"],
"passage": doc["passage"], "passage": doc["passage"],
"question": qa["question"], "question": qa["question"],
"answers": self.get_answers(qa["answer"]), "answers": self.get_answers(qa),
} }
@classmethod @classmethod
def get_answers(cls, answers): def get_answers(cls, qa):
# NOTE: We wrap every non-`list` answer into a list for uniformity. answers = []
if answers["number"] != "": answers_set = set()
return [str(answers["number"])]
if answers["spans"] != []: candidates = [qa["answer"]] + qa.get("validated_answers", [])
return answers["spans"] for candidate in candidates:
return [" ".join([answers["date"]["day"], answer = cls.parse_answer(candidate)
answers["date"]["month"], if answer in answers_set:
answers["date"]["year"]]).strip()] continue
answers_set.add(answer)
answers.append(answer)
return answers
@classmethod
def parse_answer(cls, answer):
# NOTE: Everything is returned as a tuple for uniformity and hashability.
if answer["number"] != "":
return (str(answer["number"]),)
if answer["spans"] != []:
return tuple(answer["spans"])
return (" ".join([answer["date"]["day"],
answer["date"]["month"],
answer["date"]["year"]]).strip(),)
def training_docs(self): def training_docs(self):
docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json")) docs = json.load(open(self.DATASET_PATH / "drop_dataset" / "drop_dataset_train.json"))
...@@ -76,7 +92,7 @@ class DROP(Task): ...@@ -76,7 +92,7 @@ class DROP(Task):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:" return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " " + ", ".join(doc["answers"]) return " " + ", ".join(doc["answers"][0])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
...@@ -89,9 +105,7 @@ class DROP(Task): ...@@ -89,9 +105,7 @@ class DROP(Task):
language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
part of the document for `doc`. part of the document for `doc`.
""" """
conts = [] conts = [rf.greedy_until(ctx, ["."])]
for _ in doc["answers"]:
conts.append(rf.greedy_until(ctx, ["."]))
return conts return conts
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -105,66 +119,96 @@ class DROP(Task): ...@@ -105,66 +119,96 @@ class DROP(Task):
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
preds, golds = results, doc["answers"] preds, golds = results, doc["answers"]
exact_match, f1_score = self.get_metrics(preds, golds) max_em = 0
max_f1 = 0
for gold_answer in golds:
exact_match, f1_score = self.get_metrics(preds, gold_answer)
if gold_answer[0].strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
return { return {
"em": exact_match, "em": max_em,
"f1": f1_score "f1": max_f1
} }
def get_metrics(self, preds, golds): def get_metrics(self, predicted, gold):
exact_match = self._exact_match(preds, golds) """
f1_score = self._f1_score(preds, golds) Takes a predicted answer and a gold answer (that are both either a string or a list of
return exact_match, f1_score strings), and returns exact match and the DROP F1 metric for the prediction. If you are
writing a script for evaluating objects in memory (say, the output of predictions during
def _exact_match(self, preds, golds): validation, or while training), this is the function you want to call, after using
""" Returns the exact match of normalized gold answers and predictions. """ :func:`answer_json_to_strings` when reading the gold answer from the released data file.
normalized_preds = [self._normalize(pred) for pred in preds]
normalized_golds = [self._normalize(gold) for gold in golds]
is_equal_sets = set(normalized_preds) == set(normalized_golds)
is_equal_length = len(normalized_preds) == len(normalized_golds)
return int(is_equal_sets and is_equal_length)
def _f1_score(self, preds, golds):
"""Returns the average F1-score over normalized gold answers and predictions.
From Section 5 of Dua et al. "DROP:...":
"When an answer has multiple spans, we first perform a one-to-one
alignment greedily based on bag-of-word overlap on the set of spans
and then compute average F1 over each span."
""" """
pred_bags = self._answer_to_bags(preds) predicted_bags = self._answer_to_bags(predicted)
gold_bags = self._answer_to_bags(golds) gold_bags = self._answer_to_bags(gold)
f1_per_bag = self._align_bags(pred_bags, gold_bags)
return np.mean(f1_per_bag) if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
exact_match = 1.0
def _answer_to_bags(self, answers): else:
return [set(self._normalize(answer).split()) for answer in answers] exact_match = 0.0
def _align_bags(self, pred_bags, gold_bags): f1_per_bag = self._align_bags(predicted_bags[1], gold_bags[1])
""" Returns the max metric value over all the answers. """ f1 = np.mean(f1_per_bag)
scores = np.zeros([len(gold_bags), len(pred_bags)]) f1 = round(f1, 2)
for gold_index, gold_bag in enumerate(gold_bags): return exact_match, f1
for pred_index, pred_bag in enumerate(pred_bags):
if self._is_number_match(pred_bag, gold_bag): def _answer_to_bags(self, answer):
scores[gold_index, pred_index] = self._bag_f1(pred_bag, gold_bag) if isinstance(answer, (list, tuple)):
raw_spans = answer
else:
raw_spans = [answer]
normalized_spans = []
token_bags = []
for raw_span in raw_spans:
normalized_span = self._normalize(raw_span)
normalized_spans.append(normalized_span)
token_bags.append(set(normalized_span.split()))
return normalized_spans, token_bags
def _align_bags(self, predicted, gold):
"""
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if self._match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
row_ind, col_ind = linear_sum_assignment(-scores) row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold_bags), len(pred_bags))])
max_scores = np.zeros([max(len(gold), len(predicted))])
for row, column in zip(row_ind, col_ind): for row, column in zip(row_ind, col_ind):
max_scores[row] = max(max_scores[row], scores[row, column]) max_scores[row] = max(max_scores[row], scores[row, column])
return max_scores return max_scores
def _bag_f1(self, pred_bag, gold_bag): def _compute_f1(self, predicted_bag, gold_bag):
intersection = len(gold_bag.intersection(pred_bag)) intersection = len(gold_bag.intersection(predicted_bag))
if intersection == 0: if not predicted_bag:
return 0.0 precision = 1.0
precision = intersection / float(len(pred_bag)) if pred_bag else 1.0 else:
recall = intersection / float(len(gold_bag)) if gold_bag else 1.0 precision = intersection / float(len(predicted_bag))
f1 = (2 * precision * recall) / (precision + recall) if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1 return f1
def _is_number_match(self, pred_bag, gold_bag): def _match_numbers_if_present(self, gold_bag, predicted_bag):
pred_numbers = set([word for word in pred_bag if self._is_number(word)]) gold_numbers = set()
gold_numbers = set([word for word in gold_bag if self._is_number(word)]) predicted_numbers = set()
if (not gold_numbers) or gold_numbers.intersection(pred_numbers): for word in gold_bag:
if self._is_number(word):
gold_numbers.add(word)
for word in predicted_bag:
if self._is_number(word):
predicted_numbers.add(word)
if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
return True return True
return False return False
...@@ -175,30 +219,29 @@ class DROP(Task): ...@@ -175,30 +219,29 @@ class DROP(Task):
except ValueError: except ValueError:
return False return False
def _normalize(self, answer): def _remove_articles(self, text):
def remove_articles(text): return _ARTICLES.sub(" ", text)
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def white_space_fix(text): def _white_space_fix(self, text):
return " ".join(text.split()) return " ".join(text.split())
def remove_punc(text): def _remove_punc(self, text):
exclude = set(string.punctuation) exclude = set(string.punctuation)
if not self._is_number(text): if not self._is_number(text):
return "".join(ch for ch in text if ch not in exclude) return "".join(ch for ch in text if ch not in exclude)
else: else:
return text return text
def fix_number(text): def _fix_number(self, text):
return str(float(text)) if self._is_number(text) else text return str(float(text)) if self._is_number(text) else text
def tokenize(text): def _tokenize(self, text):
return re.split(" |-", text) return re.split(" |-", text)
def _normalize(self, answer):
tokens = [ tokens = [
white_space_fix(remove_articles(fix_number(remove_punc(token.lower())))) self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
for token in tokenize(answer) for token in self._tokenize(answer)
] ]
tokens = [token for token in tokens if token.strip()] tokens = [token for token in tokens if token.strip()]
normalized = " ".join(tokens).strip() normalized = " ".join(tokens).strip()
......
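For concreteness, a worked example of the DROP span-alignment F1 above, using invented spans:
# predicted spans: ("touchdown pass", "field goal");  gold spans: ("field goal", "touchdown")
# After normalization each span becomes a token bag; linear_sum_assignment pairs
# "field goal" with "field goal" (F1 = 1.0) and "touchdown pass" with "touchdown"
# (F1 = 2/3), so get_metrics returns em = 0.0 (the normalized span sets differ) and
# f1 = mean(1.0, 2/3) ~= 0.83 (after the round(..., 2) above) for this prediction/gold pair.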
...@@ -227,7 +227,7 @@ class QNLI(HFTask): ...@@ -227,7 +227,7 @@ class QNLI(HFTask):
class WNLI(HFTask): class WNLI(HFTask):
VERSION = 0 VERSION = 1
DATASET_PATH = "glue" DATASET_PATH = "glue"
DATASET_NAME = "wnli" DATASET_NAME = "wnli"
...@@ -241,26 +241,25 @@ class WNLI(HFTask): ...@@ -241,26 +241,25 @@ class WNLI(HFTask):
return False return False
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format( return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
# True = entailment # True = entailment
# False = contradiction # False = not_entailment
# Neither = neutral return " {}".format({0: "False", 1: "True"}[doc["label"]])
return " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True") ll_true, _ = rf.loglikelihood(ctx, " True")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
ll_false, _ = rf.loglikelihood(ctx, " False") ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_neither, ll_false return ll_true, ll_false
def process_results(self, doc, results): def process_results(self, doc, results):
ll_true, ll_false = results
pred = ll_true > ll_false
gold = doc["label"] gold = doc["label"]
pred = np.argmax(results)
return { return {
"acc": pred == gold "acc": pred == gold
} }
......
...@@ -2,10 +2,9 @@ from . common import HFTask ...@@ -2,10 +2,9 @@ from . common import HFTask
from lm_eval.base import MultipleChoiceTask from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask): class HeadQABase(HFTask, MultipleChoiceTask):
VERSION = 0 VERSION = 0
DATASET_PATH = "head_qa" DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -31,3 +30,15 @@ class HeadQA(HFTask, MultipleChoiceTask): ...@@ -31,3 +30,15 @@ class HeadQA(HFTask, MultipleChoiceTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["query"] return doc["query"]
class HeadQAEn(HeadQABase):
DATASET_NAME = "en"
class HeadQAEs(HeadQABase):
DATASET_NAME = "es"
# for backwards compatibility
class HeadQAEsDeprecated(HeadQABase):
DATASET_NAME = "es"
print("WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info.")
\ No newline at end of file
...@@ -3,6 +3,7 @@ from lm_eval.base import Task, rf ...@@ -3,6 +3,7 @@ from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh from lm_eval.utils import sh
from best_download import download_file from best_download import download_file
import os
class LAMBADA(Task): class LAMBADA(Task):
...@@ -10,11 +11,12 @@ class LAMBADA(Task): ...@@ -10,11 +11,12 @@ class LAMBADA(Task):
def download(self): def download(self):
sh("mkdir -p data/lambada") sh("mkdir -p data/lambada")
try: try:
download_file( if not os.path.exists("data/lambada/lambada_test.jsonl"):
"http://eaidata.bmk.sh/data/lambada_test.jsonl", download_file(
"data/lambada/lambada_test.jsonl", "http://eaidata.bmk.sh/data/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226" "data/lambada/lambada_test.jsonl",
) "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
)
except: except:
# fallback - for some reason best_download doesnt work all the time here # fallback - for some reason best_download doesnt work all the time here
sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl") sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
......
from . import lambada
from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh
from best_download import download_file
import json
from functools import partial
import os
# This task is LAMBADA with machine-translated versions in several other languages
# (the English split is the original test set).
LANGS = ["en", "fr", "de", "it", "es"]
CHECKSUMS = {"en": "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226",
"fr": "941ec6a73dba7dc91c860bf493eb66a527cd430148827a4753a4535a046bf362",
"de": "51c6c1795894c46e88e4c104b5667f488efe79081fb34d746b82b8caa663865e",
"it": "86654237716702ab74f42855ae5a78455c1b0e50054a4593fb9c6fcf7fad0850",
"es": "ffd760026c647fb43c67ce1bc56fd527937304b348712dce33190ea6caba6f9c"
}
class MultilingualLAMBADA(lambada.LAMBADA):
VERSION = 0
def __init__(self, lang=None):
self.LANG = lang
super().__init__()
def download(self):
sh("mkdir -p data/lambada")
f = f"data/lambada/lambada_test_{self.LANG}.jsonl"
url = f"http://eaidata.bmk.sh/data/lambada_test_{self.LANG}.jsonl"
try:
if not os.path.exists(f):
download_file(
url,
f,
CHECKSUMS[self.LANG]
)
except:
# fallback - for some reason best_download doesn't work all the time here
sh(f"wget {url} -O {f}")
sh(f'echo "{CHECKSUMS[self.LANG]} {f}" | sha256sum --check')
def validation_docs(self):
with open(f"data/lambada/lambada_test_{self.LANG}.jsonl") as fh:
for line in fh:
yield json.loads(line)
class MultilingualLAMBADAEN(MultilingualLAMBADA):
def __init__(self):
super().__init__('en')
class MultilingualLAMBADAFR(MultilingualLAMBADA):
def __init__(self):
super().__init__('fr')
class MultilingualLAMBADADE(MultilingualLAMBADA):
def __init__(self):
super().__init__('de')
class MultilingualLAMBADAIT(MultilingualLAMBADA):
def __init__(self):
super().__init__('it')
class MultilingualLAMBADAES(MultilingualLAMBADA):
def __init__(self):
super().__init__('es')
LANG_CLASSES = [MultilingualLAMBADAEN, MultilingualLAMBADAFR, MultilingualLAMBADADE, MultilingualLAMBADAIT, MultilingualLAMBADAES]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"lambada_mt_{lang}"] = lang_class
return tasks
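# Expected wiring (a sketch; the exact module name imported in lm_eval/tasks/__init__.py
# is assumed here):
#
#   from . import lambada_multilingual
#   TASK_REGISTRY = {
#       ...,
#       **lambada_multilingual.construct_tasks(),   # lambada_mt_en, lambada_mt_fr, ...
#   }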
...@@ -10,7 +10,7 @@ class LogiQA(MultipleChoiceTask): ...@@ -10,7 +10,7 @@ class LogiQA(MultipleChoiceTask):
def download(self): def download(self):
if self.DATASET_PATH.exists(): if self.DATASET_PATH.exists():
return return
Path.mkdir(self.DATASET_PATH) Path.mkdir(self.DATASET_PATH, parents=True)
base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master" base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
splits = [ splits = [
{"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, {"name": "Train", "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"},
......
"""
“Going on a vacation” takes longer than “Going for a walk”:
A Study of Temporal Commonsense Understanding
https://arxiv.org/pdf/1909.03065.pdf
WARNING: Running this task with a `--limit` arg will give misleading results! The
corresponding dataset is structured such that each multiple-choice question gathered
by the authors is split into question-option pairs, where each such pair gets
siloed into an individual document for plausibility testing. Because the harness
shuffles these documents, setting `--limit` will likely "cut off" certain candidate
answers. This is a problem because the task's metrics require an exhaustive evaluation
of a question's options. See section 4 of the paper for details.
@inproceedings{ZKNR19,
author = {Ben Zhou and Daniel Khashabi and Qiang Ning and Dan Roth},
title = {“Going on a vacation” takes longer than “Going for a walk”: A Study of Temporal Commonsense Understanding},
booktitle = {EMNLP},
year = {2019},
}
"""
import numpy as np
from lm_eval.base import rf
from collections import defaultdict
from . common import HFTask
class MCTACO(HFTask):
VERSION = 0
DATASET_PATH = "mc_taco"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Determine whether the candidate answer is plausible (\"yes\") or not (\"no\")"
def doc_to_text(self, doc):
return f"{doc['sentence']}\nQuestion: {doc['question']}\n"\
f"Answer: {doc['answer']}\nPlausible:"
def doc_to_target(self, doc):
return " " + ["no", "yes"][doc['label']]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_no, _ = rf.loglikelihood(ctx, " no")
ll_yes, _ = rf.loglikelihood(ctx, " yes")
return ll_no, ll_yes
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
ll_no, ll_yes = results
gold = doc['label']
pred = int(ll_yes > ll_no)
question_id = self._question2id(doc)
items = (gold, pred, question_id)
return {
"em": items,
"f1": items
}
def _question2id(self, doc):
""" Returns an identifier for the question in the given document. """
return " ".join([doc['sentence'], doc['question']])
def aggregation(self):
return {
"f1": f1,
"em": exact_match,
}
def higher_is_better(self):
return {
"f1": True,
"em": True,
}
def exact_match(items):
"""
Counts a question as correct if the model accurately classifies the plausibility
of an answer for all candidate answers. See section 4 "Evaluation Metrics" in the paper.
"""
results = list(zip(*items))
accuracies = defaultdict(list)
for gold, pred, question in zip(results[0], results[1], results[2]):
accuracies[question].append(pred == gold)
return np.mean([int(all(accs)) for accs in accuracies.values()])
def f1(items):
""" See section 4 "Evaluation Metrics" in the paper about the F1 metric used. """
results = list(zip(*items))
# Group the positive ("yes" = 1) golds and predictions by question.
gold_positives, pred_positives = defaultdict(list), defaultdict(list)
for gold, pred, question in zip(results[0], results[1], results[2]):
gold_positives[question].append(gold)
pred_positives[question].append(pred)
f1 = []
for question in gold_positives.keys():
gp, pp = sum(gold_positives[question]), sum(pred_positives[question])
tp = sum(np.logical_and(gold_positives[question], pred_positives[question]))
p = tp / pp if pp > 0.0 else 1.0
r = tp / gp if gp > 0.0 else 1.0
if p + r > 0.0:
f1.append(2. * (p * r) / (p + r))
return np.mean(f1)
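# Tiny worked example of the question-level aggregation above (labels invented):
#
#   items = [(1, 1, "Q1"), (0, 0, "Q1"), (1, 0, "Q1"),   # one wrong candidate for Q1
#            (1, 1, "Q2"), (0, 0, "Q2")]                 # all candidates right for Q2
#   exact_match(items) -> 0.5    (Q1 fails, Q2 passes)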
"""
MuTual: A Dataset for Multi-Turn Dialogue Reasoning
https://www.aclweb.org/anthology/2020.acl-main.130/
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
"""
import json
import zipfile
import shutil
import numpy as np
from pathlib import Path
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file
class MuTualBase(Task):
VERSION = 1
BASE_PATH = Path("data/mutual")
DATASET_NAME = None
CHOICES = ['A', 'B', 'C', 'D']
def __init__(self):
super().__init__()
def download(self):
if self.BASE_PATH.exists():
return
Path.mkdir(self.BASE_PATH, parents=True)
master_zip = Path("data/master.zip")
download_file(
"https://github.com/Nealcly/MuTual/archive/master.zip",
str(master_zip),
"bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
with zipfile.ZipFile(master_zip, 'r') as zip:
zip.extractall("data")
Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
# Remove left over files and directories.
master_zip.unlink()
shutil.rmtree("data/MuTual-master")
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def _load_docs(self, path):
for file in sorted(path.iterdir()):
if file.suffix != ".txt":
continue
with open(file, 'r', encoding='utf-8') as f:
yield json.load(f)
def training_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "train")
def validation_docs(self):
return self._load_docs(self.BASE_PATH / self.DATASET_NAME / "dev")
def test_docs(self):
raise NotImplementedError("MuTual test docs are not used (answers are withheld)")
def fewshot_description(self):
# TODO: figure out fewshot description
return ""
def doc_to_text(self, doc):
return self.detokenize(doc["article"])
def doc_to_target(self, doc):
return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])])
def construct_requests(self, doc, ctx):
lls = []
for option in doc["options"]:
lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0])
return lls
def detokenize(self, text):
text = text.replace(" '", "'")
text = text.replace(" \n", "\n")
text = text.replace("\n ", "\n")
text = text.replace(" n't", "n't")
text = text.replace("`` ", '"')
text = text.replace("''", '"')
# punctuation
text = text.replace(" :", ":")
text = text.replace(" ;", ";")
text = text.replace(" !", "!")
text = text.replace(" ?", "?")
text = text.replace(" ,", ",")
text = text.replace(" .", ".")
return text
def process_results(self, doc, results):
gold = self.CHOICES.index(doc["answers"])
r4_1 = np.argmax(results) == gold # r4_1 = accuracy
ranks = sorted(results, reverse=True)
r4_2 = (ranks.index(results[gold]) == 1) + r4_1
mrr = 1. / (ranks.index(results[gold]) + 1) # `+ 1` for index offset
return {
"r@1": r4_1,
"r@2": r4_2,
"mrr": mrr
}
def aggregation(self):
return {
"r@1": mean,
"r@2": mean,
"mrr": mean
}
def higher_is_better(self):
return {
"r@1": True,
"r@2": True,
"mrr": True
}
class MuTual(MuTualBase):
DATASET_NAME = Path("mutual")
class MuTualPlus(MuTualBase):
DATASET_NAME = Path("mutual_plus")