"git@developer.sourcefind.cn:change/sglang.git" did not exist on "66283dbc0c052c6f32bde68451addc5b0d00cf3b"
Unverified commit cf074822, authored by Stella Biderman, committed by GitHub

Merge pull request #316 from jon-tow/master

Revert "Merge branch 'master' into master"
parents 5fe7e2c0 7585ec56
@@ -2,38 +2,25 @@ import collections
 import itertools
 import pathlib
 import random
 import lm_eval.metrics
 import lm_eval.models
 import lm_eval.tasks
 import lm_eval.base
-import promptsource
 import numpy as np
-from promptsource.templates import DatasetTemplates
 from lm_eval.utils import positional_deprecated, run_task_tests
 @positional_deprecated
-def simple_evaluate(
-    model,
-    model_args=None,
-    tasks=[],
-    num_fewshot=0,
-    batch_size=None,
-    device=None,
-    no_cache=False,
-    limit=None,
-    bootstrap_iters=100000,
-    description_dict=None,
-    check_integrity=False,
-):
+def simple_evaluate(model, model_args=None, tasks=[],
+                    num_fewshot=0, batch_size=None, device=None,
+                    no_cache=False, limit=None, bootstrap_iters=100000,
+                    description_dict=None, check_integrity=False):
"""Instantiate and evaluate a model on a list of tasks. """Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM] :param model: Union[str, LM]
Name of model or LM object, see lm_eval.models.get_model Name of model or LM object, see lm_eval.models.get_model
:param model_args: Optional[str] :param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string. String arguments for each model class, see LM.create_from_arg_string.
Ignored if `model` argument is a LM object. Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]] :param tasks: list[Union[str, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
...@@ -50,7 +37,7 @@ def simple_evaluate( ...@@ -50,7 +37,7 @@ def simple_evaluate(
:param bootstrap_iters: :param bootstrap_iters:
Number of iterations for bootstrap statistics Number of iterations for bootstrap statistics
:param description_dict: dict[str, str] :param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description` Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool :param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks Whether to run the relevant part of the test suite for the tasks
:return :return
...@@ -62,28 +49,20 @@ def simple_evaluate( ...@@ -62,28 +49,20 @@ def simple_evaluate(
assert tasks != [], "No tasks specified" assert tasks != [], "No tasks specified"
     if isinstance(model, str):
-        if model_args is None:
-            model_args = ""
-        lm = lm_eval.models.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "device": device}
-        )
+        if model_args is None: model_args = ""
+        lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, {
+            'batch_size': batch_size, 'device': device
+        })
     else:
         assert isinstance(model, lm_eval.base.LM)
         lm = model
-    # TODO: Hard-code turning off cache while testing. Remove once testing is completed.
-    no_cache = True
     if not no_cache:
         lm = lm_eval.base.CachingLM(
-            lm,
-            "lm_cache/"
-            + model
-            + "_"
-            + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
-            + ".db",
+            lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
         )
-    task_dict = lm_eval.tasks.get_task_dict_promptsource(tasks)
+    task_dict = lm_eval.tasks.get_task_dict(tasks)
     if check_integrity:
         run_task_tests(task_list=tasks)
@@ -93,7 +72,7 @@ def simple_evaluate(
         task_dict=task_dict,
         num_fewshot=num_fewshot,
         limit=limit,
-        description_dict=description_dict,
+        description_dict=description_dict
     )
     # add info about the model and few shot config
@@ -106,22 +85,14 @@ def simple_evaluate(
         "no_cache": no_cache,
         "limit": limit,
         "bootstrap_iters": bootstrap_iters,
-        "description_dict": description_dict,
+        "description_dict": description_dict
     }
     return results
 @positional_deprecated
-def evaluate(
-    lm,
-    task_dict,
-    provide_description=None,
-    num_fewshot=0,
-    limit=None,
-    bootstrap_iters=100000,
-    description_dict=None,
-):
+def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, bootstrap_iters=100000, description_dict=None):
     """Instantiate and evaluate a model on a list of tasks.
     :param lm: obj
@@ -137,7 +108,7 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param description_dict: dict[str, str]
         Dictionary of custom task descriptions of the form: `task_name: description`
     :return
         Dictionary of results
     """
@@ -147,14 +118,12 @@ def evaluate(
     assert not provide_description  # not implemented.
     if provide_description is not None:
         # nudge people to not specify it at all
-        print(
-            "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
-        )
+        print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
     task_dict_items = [
         (name, task)
         for name, task in task_dict.items()
-        if (task.has_validation_docs() or task.has_test_docs())
+        if(task.has_validation_docs() or task.has_test_docs())
     ]
     results = collections.defaultdict(dict)
@@ -172,8 +141,8 @@ def evaluate(
     docs = {}
     # get lists of each type of request
-    for task_prompt_name, task in task_dict_items:
-        versions[task_prompt_name] = task.VERSION
+    for task_name, task in task_dict_items:
+        versions[task_name] = task.VERSION
         # default to test doc, fall back to val doc if validation unavailable
         # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
         if task.has_test_docs():
@@ -184,39 +153,29 @@ def evaluate(
             raise RuntimeError("Task has neither test_docs nor validation_docs")
         # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
-        task_docs = list(enumerate(list(task_doc_func())))
+        task_docs = list(task_doc_func())
         rnd = random.Random()
         rnd.seed(42)
         rnd.shuffle(task_docs)
-        description = (
-            description_dict[task_prompt_name]
-            if description_dict and task_prompt_name in description_dict
-            else ""
-        )
+        description = description_dict[task_name] if description_dict and task_name in description_dict else ""
-        for doc_id, (original_doc_id, doc) in enumerate(
-            itertools.islice(task_docs, 0, limit)
-        ):
-            if task.invalid_doc_for_prompt(doc):
-                continue
-            docs[(task_prompt_name, doc_id)] = doc
-            ctx, fewshotex_logging_info = task.fewshot_context(
-                doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
-            )
-            fewshotex_logging_info["doc_id"] = original_doc_id
-            args = {"num_fewshot": num_fewshot}
-            reqs = task.construct_requests(doc, ctx, args)
+        for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
+            docs[(task_name, doc_id)] = doc
+            ctx = task.fewshot_context(
+                doc=doc,
+                num_fewshot=num_fewshot,
+                rnd=rnd,
+                description=description
+            )
+            reqs = task.construct_requests(doc, ctx)
             if not isinstance(reqs, (list, tuple)):
                 reqs = [reqs]
             for i, req in enumerate(reqs):
                 requests[req.request_type].append(req)
                 # i: index in requests for a single task instance
                 # doc_id: unique id that we can get back to a doc using `docs`
-                requests_origin[req.request_type].append(
-                    (i, task_prompt_name, doc, doc_id, fewshotex_logging_info)
-                )
+                requests_origin[req.request_type].append((i, task_name, doc, doc_id))
     # all responses for each (task, doc)
     process_res_queue = collections.defaultdict(list)
@@ -230,82 +189,42 @@ def evaluate(
         print("Running", reqtype, "requests")
         resps = getattr(lm, reqtype)([req.args for req in reqs])
-        resps = [
-            x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
-        ]
-        for resp, (i, task_prompt_name, doc, doc_id, fewshotex_logging_info) in zip(
-            resps, requests_origin[reqtype]
-        ):
-            process_res_queue[(task_prompt_name, doc_id)].append(
-                (i, resp, fewshotex_logging_info)
-            )
+        resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
+        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
+            process_res_queue[(task_name, doc_id)].append((i, resp))
     vals = collections.defaultdict(list)
     # unpack results and sort back in order and return control to Task
-    examples = []
-    for (task_prompt_name, doc_id), per_doc_requests in process_res_queue.items():
-        per_doc_requests.sort(key=lambda x: x[0])
-        per_doc_results = [x[1] for x in per_doc_requests]
-        fewshot_logging_info = [x[2] for x in per_doc_requests][0]
-        task = task_dict[task_prompt_name]
-        doc = docs[(task_prompt_name, doc_id)]
-        output = task.process_results(doc, per_doc_results)
-        if task.save_examples:
-            metrics, example = output
-            example.update(fewshot_logging_info)
-            example.update(task.get_logging_info())
-            examples.append(example)
-        else:
-            metrics = output
-            example = fewshot_logging_info
-            example.update(task.get_logging_info())
-            examples.append(example)
-        for metric, value in metrics.items():
-            vals[(task_prompt_name, metric)].append(value)
+    for (task_name, doc_id), requests in process_res_queue.items():
+        requests.sort(key=lambda x: x[0])
+        requests = [x[1] for x in requests]
+        task = task_dict[task_name]
+        doc = docs[(task_name, doc_id)]
+        metrics = task.process_results(doc, requests)
+        for metric, value in metrics.items():
+            vals[(task_name, metric)].append(value)
     # aggregate results
-    metric_results = []
-    for (task_prompt_name, metric), items in vals.items():
-        task_name, prompt_name = task_prompt_name.split("+")
-        results[task_prompt_name]["task_name"] = task_name
-        results[task_prompt_name]["prompt_name"] = prompt_name
-        task = task_dict[task_prompt_name]
-        results[task_prompt_name][metric] = task.aggregation()[metric](items)
-        _metric_results = {
-            "task_name": task_name,
-            "prompt_name": prompt_name,
-            metric: task.aggregation()[metric](items),
-            **task.get_logging_info(),
-        }
+    for (task_name, metric), items in vals.items():
+        task = task_dict[task_name]
+        results[task_name][metric] = task.aggregation()[metric](items)
         # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
         # so we run them less iterations. still looking for a cleaner way to do this
         stderr = lm_eval.metrics.stderr_for_metric(
             metric=task.aggregation()[metric],
-            bootstrap_iters=min(bootstrap_iters, 1000)
-            if metric in ["bleu", "chrf", "ter"]
-            else bootstrap_iters,
+            bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
         )
         if stderr is not None:
-            results[task_prompt_name][metric + "_stderr"] = stderr(items)
-            _metric_results[metric + "_stderr"] = stderr(items)
-        metric_results.append(_metric_results)
+            results[task_name][metric + "_stderr"] = stderr(items)
     return {
-        # List of results that tracks the averages per model and prompt.
-        "results": metric_results,
-        "versions": dict(versions),
-        # List of all prompt x doc examples with additional information in it.
-        "examples": examples,
-        # Original results used for generating the table when running this file.
-        "table_results": dict(results),
+        "results": dict(results),
+        "versions": dict(versions)
     }
@@ -315,50 +234,22 @@ def make_table(result_dict):
     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()
-    md_writer.headers = ["Task", "Prompt", "Version", "Metric", "Value", "", "Stderr"]
-    latex_writer.headers = [
-        "Task",
-        "Prompt",
-        "Version",
-        "Metric",
-        "Value",
-        "",
-        "Stderr",
-    ]
+    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
+    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
     values = []
-    for k, dic in result_dict["table_results"].items():
+    for k, dic in result_dict["results"].items():
         version = result_dict["versions"][k]
         for m, v in dic.items():
             if m.endswith("_stderr"):
                 continue
-            if "_name" in m:
-                continue
             if m + "_stderr" in dic:
                 se = dic[m + "_stderr"]
-                values.append(
-                    [
-                        dic["task_name"],
-                        dic["prompt_name"],
-                        version,
-                        m,
-                        "%.4f" % v,
-                        "±",
-                        "%.4f" % se,
-                    ]
-                )
+                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
             else:
-                values.append(
-                    [
-                        dic["task_name"],
-                        dic["prompt_name"],
-                        version,
-                        m,
-                        "%.4f" % v,
-                        "",
-                        "",
-                    ]
-                )
+                values.append([k, version, m, '%.4f' % v, '', ''])
             k = ""
             version = ""
     md_writer.value_matrix = values
...
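For orientation, a hedged sketch of how the restored `simple_evaluate` entry point is typically driven; this is illustrative only and not part of the commit, and the model name, `model_args` string, and task list are assumptions:

# Illustrative only -- model/task choices below are assumptions, not from this diff.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="gpt2",                    # resolved via lm_eval.models.get_model
    model_args="pretrained=gpt2",    # parsed by LM.create_from_arg_string
    tasks=["lambada", "hellaswag"],
    num_fewshot=0,
    no_cache=True,
)
print(evaluator.make_table(results))  # renders the "results"/"versions" dicts as a table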
-import typing
 import math
 from collections.abc import Iterable
 import numpy as np
 import sacrebleu
-from rouge_score import rouge_scorer
 import sklearn.metrics
 import random
@@ -186,74 +184,6 @@ def _sacreformat(refs, preds):
     return refs, preds
def rouge(
refs: typing.List[str],
pred: str,
rouge_types: typing.List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
):
""" ROUGE with multi-reference support
Implementation based on GEM-metrics:
https://github.com/GEM-benchmark/GEM-metrics/blob/431a8174bd6b3637e8d6118bfad2983e39e99733/gem_metrics/rouge.py
:param refs:
A `list` of reference `str`s.
:param pred:
A single prediction `str`.
"""
# Add newlines between sentences to correctly compute `rougeLsum`.
if "rougeLsum" in rouge_types:
# TODO: Adapt this to handle languages that do not support sentence endings by `.`.
# See GEM-metrics implementation with lang specific `nltk` tokenizers to
# split sentences.
pred = pred.replace(".", ".\n")
refs = [ref.replace(".", ".\n") for ref in refs]
scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
# ROUGE multi-ref jackknifing
if len(refs) > 1:
cur_scores = [scorer.score(ref, pred) for ref in refs]
# get best score for all leave-one-out sets
best_scores = []
for leave in range(len(refs)):
cur_scores_leave_one = [
cur_scores[s] for s in range(len(refs)) if s != leave
]
best_scores.append(
{
rouge_type: max(
[s[rouge_type] for s in cur_scores_leave_one],
key=lambda s: s.fmeasure,
)
for rouge_type in rouge_types
}
)
# average the leave-one-out bests to produce the final score
score = {
rouge_type: rouge_scorer.scoring.Score(
np.mean([b[rouge_type].precision for b in best_scores]),
np.mean([b[rouge_type].recall for b in best_scores]),
np.mean([b[rouge_type].fmeasure for b in best_scores]),
)
for rouge_type in rouge_types
}
else:
score = scorer.score(refs[0], pred)
# convert the named tuples to plain nested dicts
score = {
rouge_type: {
"precision": score[rouge_type].precision,
"recall": score[rouge_type].recall,
"fmeasure": score[rouge_type].fmeasure,
}
for rouge_type in rouge_types
}
return score
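A hedged usage sketch of the multi-reference rouge() helper removed above (requires the rouge_score package; the reference and prediction strings are invented examples):

refs = ["The cat sat on the mat.", "A cat was sitting on the mat."]
pred = "The cat is on the mat."
scores = rouge(refs, pred, rouge_types=["rouge1", "rougeL"])
print(scores["rouge1"]["fmeasure"])  # each entry is a dict with precision/recall/fmeasure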
 # stderr stuff
 class _bootstrap_internal:
...
 from . import gpt2
-from . import gptj
 from . import gpt3
-from . import t5
-from . import t0
 from . import dummy
 MODEL_REGISTRY = {
     "hf": gpt2.HFLM,
     "gpt2": gpt2.GPT2LM,
-    "gptj": gptj.GPTJLM,
     "gpt3": gpt3.GPT3LM,
-    "t5": t5.T5LM,
-    "mt5": t5.T5LM,
-    "t0": t0.T0LM,
-    "dummy": dummy.DummyLM,
+    "dummy": dummy.DummyLM,
 }
...
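A sketch of how the registry above is consumed (get_model is the lookup helper referenced in evaluator.py; the argument string and device are assumed examples):

import lm_eval.models

lm_class = lm_eval.models.get_model("gpt2")   # -> gpt2.GPT2LM from MODEL_REGISTRY
lm = lm_class.create_from_arg_string(
    "pretrained=gpt2", {"batch_size": 1, "device": "cpu"}
)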
@@ -4,16 +4,8 @@ from lm_eval.base import BaseLM
 class HFLM(BaseLM):
-    def __init__(
-        self,
-        device="cuda",
-        pretrained="gpt2",
-        revision="main",
-        subfolder=None,
-        tokenizer=None,
-        batch_size=1,
-        parallelize=False
-    ):
+    def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
         super().__init__()
         assert isinstance(device, str)
@@ -23,61 +15,36 @@ class HFLM(BaseLM):
         if device:
             self._device = torch.device(device)
         else:
-            self._device = (
-                torch.device("cuda")
-                if torch.cuda.is_available()
-                else torch.device("cpu")
-            )
+            self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
         # TODO: update this to be less of a hack once subfolder is fixed in HF
         self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
-            pretrained,
-            revision=revision + ("/" + subfolder if subfolder is not None else ""),
-        )
+            pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
+        ).to(self.device)
         self.gpt2.eval()
         # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            pretrained if tokenizer is None else tokenizer,
-            revision=revision,
-            subfolder=subfolder,
-        )
-        assert isinstance(
-            self.tokenizer,
-            (
-                transformers.GPT2Tokenizer,
-                transformers.GPT2TokenizerFast,
-                transformers.T5Tokenizer,
-                transformers.T5TokenizerFast,
-            ),
-        ), "this tokenizer has not been checked for compatibility yet!"
+            pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
+        assert isinstance(self.tokenizer, (
+            transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
+            transformers.T5Tokenizer, transformers.T5TokenizerFast,
+        )), "this tokenizer has not been checked for compatibility yet!"
         self.vocab_size = self.tokenizer.vocab_size
-        if isinstance(
-            self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
-        ):
-            assert self.tokenizer.encode("hello\n\nhello") == [
-                31373,
-                198,
-                198,
-                31373,
-            ], self.tokenizer.encode("hello\n\nhello")
+        if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
+            assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
+                self.tokenizer.encode('hello\n\nhello')
         # multithreading and batching
         self.batch_size_per_gpu = batch_size  # todo: adaptive batch size
         # TODO: fix multi-gpu
-        if parallelize:
-            self.gpt2.parallelize()
-            self._device = torch.device('cuda:0')
-        else:
-            self.gpt2.to(self._device)
-    @property
-    def eot_token(self):
-        return self.tokenizer.eos_token
+        # gpus = torch.cuda.device_count()
+        # if gpus > 1:
+        #     self.gpt2 = nn.DataParallel(self.gpt2)
     @property
     def eot_token_id(self):
@@ -108,7 +75,7 @@ class HFLM(BaseLM):
     def tok_encode(self, string: str):
         return self.tokenizer.encode(string, add_special_tokens=False)
     def tok_decode(self, tokens):
         return self.tokenizer.decode(tokens)
@@ -122,53 +89,15 @@ class HFLM(BaseLM):
         """
         with torch.no_grad():
             return self.gpt2(inps)[0][:, :, :50257]
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
max_length = max_length + context.size(1)
if num_fewshot == 0:
generations = self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.gpt2.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
# Remove the context from the generations
return generations[0, context.shape[1] :]
# for backwards compatibility # for backwards compatibility
GPT2LM = HFLM GPT2LM = HFLM
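The removed _get_stopping_criteria/_model_generate pair above implements multi-token stop sequences on top of Hugging Face generate(). A self-contained sketch of the same pattern, with the explanatory text as comments; the model name, prompt, and stop string are assumptions:

import torch
import transformers

class StopOnSequence(transformers.StoppingCriteria):
    # Stop once the decoded tail of the running sequence contains the stop string.
    def __init__(self, stop_ids, tokenizer):
        self.stop_text = tokenizer.decode(stop_ids)
        self.window = len(stop_ids) + 1
        self.tokenizer = tokenizer

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        tail = self.tokenizer.decode(input_ids[0, -self.window:])
        return self.stop_text in tail

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
context = tokenizer("Q: What is 2 + 2?\nA:", return_tensors="pt").input_ids
stop_ids = tokenizer.encode("\n\n")
out = model.generate(
    context,
    max_length=context.size(1) + 32,
    stopping_criteria=transformers.StoppingCriteriaList(
        [StopOnSequence(stop_ids, tokenizer)]
    ),
    do_sample=False,
)
print(tokenizer.decode(out[0, context.shape[1]:]))  # continuation only, as in the removed code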
import transformers
import torch
from lm_eval.base import BaseLM
class GPTJLM(BaseLM):
def __init__(
self,
device="cuda",
batch_size=1,
parallelize=False,
):
super().__init__()
assert isinstance(device, str)
assert isinstance(batch_size, int)
if device:
self._device = torch.device(device)
else:
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
pretrained = "EleutherAI/gpt-j-6B"
self.gptj = transformers.AutoModelForCausalLM.from_pretrained(pretrained).to(self.device)
self.gptj.eval()
# pretrained tokenizer for neo is broken for now so just hard-coding this to gptj
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
self.vocab_size = self.tokenizer.vocab_size
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
if parallelize:
self.gptj.parallelize()
self._device = torch.device('cuda:0')
else:
self.gptj.to(self._device)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
try:
return self.gptj.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gptj.config.max_position_embeddings
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.gptj(inps)[0][:, :, :50257]
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
max_length = max_length + context.size(1)
if num_fewshot == 0:
generations = self.gptj.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.gptj.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
# Remove the context from the generations
return generations[0, context.shape[1] :]
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
import math
class T0LM(BaseLM):
# MAX_GEN_TOKS = 256
# MAX_INP_LENGTH = 512
# VOCAB_SIZE = 32100
# EOT_TOKEN_ID = 1
def __init__(self, device='cuda', parallelize=False, pretrained='t0', batch_size=1):
super().__init__()
if device:
self._device = torch.device(device)
else:
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.t0 = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
self.t0.eval()
if parallelize == "True":
self.t0.parallelize()
self._device = torch.device('cuda:0')
else:
self.t0.to(self._device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
# self.max_length = self.MAX_INP_LENGTH
self.batch_size = int(batch_size)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
return self.tokenizer.model_max_length
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self._batch_size # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inputs_tok, targets_tok):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.t0(
**inputs_tok,
labels=targets_tok["input_ids"]
)
def loglikelihood(self, requests):
res = []
for chunk in tqdm(utils.chunks(requests, self.batch_size), total=math.ceil(len(requests)/self.batch_size)):
inputs, targets = zip(*chunk)
# Fill in empty encoder inputs with eos_token
inputs = (
f"{self.eot_token}"
if len(input_) == 0
else input_
for input_ in inputs
)
inputs_tok = self.tokenizer(
list(inputs),
max_length=self.max_length,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in inputs_tok:
inputs_tok[key] = inputs_tok[key][:, -(self.max_length - 1) :]
targets_tok = self.tokenizer(
list(targets),
max_length=self.max_gen_toks,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in targets_tok:
targets_tok[key] = targets_tok[key][:, -(self.max_length - 1) :]
outputs = self._model_call(inputs_tok, targets_tok)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
chunk,
log_softmaxes,
targets_tok["input_ids"],
targets_tok["attention_mask"],
)
for cache_key, log_softmax, target_tok, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tok = target_tok[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tok).all()
target_logits = torch.gather(
log_softmax, 1, target_tok.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return res
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
if num_fewshot == 0:
generations = self.t0.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.t0.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations[0]
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
import math
class T5LM(BaseLM):
# MAX_GEN_TOKS = 256
# MAX_INP_LENGTH = 512
# VOCAB_SIZE = 32128
# EOT_TOKEN_ID = 1
def __init__(
self,
device='cuda',
parallelize=False,
pretrained='t5',
batch_size=1
):
super().__init__()
if device:
self._device = torch.device(device)
else:
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.t5 = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
self.t5.eval()
if parallelize == "True":
self.t5.parallelize()
self._device = torch.device('cuda:0')
else:
self.t5.to(self._device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
# self.max_length = self.MAX_INP_LENGTH
self._batch_size = int(batch_size)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
return self.tokenizer.model_max_length
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self._batch_size # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inputs_tok, targets_tok):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.t5(
**inputs_tok,
labels=targets_tok["input_ids"]
)
def loglikelihood(self, requests):
res = []
for chunk in tqdm(utils.chunks(requests, self.batch_size), total=math.ceil(len(requests)/self.batch_size)):
inputs, targets = zip(*chunk)
# Fill in empty encoder inputs with eos_token
inputs = (
f"{self.eot_token}"
if len(input_) == 0
else input_
for input_ in inputs
)
inputs_tok = self.tokenizer(
list(inputs),
max_length=self.max_length,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in inputs_tok:
inputs_tok[key] = inputs_tok[key][:, -(self.max_length - 1) :]
targets_tok = self.tokenizer(
list(targets),
max_length=self.max_gen_toks,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in targets_tok:
targets_tok[key] = targets_tok[key][:, -(self.max_length - 1) :]
outputs = self._model_call(inputs_tok, targets_tok)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
chunk,
log_softmaxes,
targets_tok["input_ids"],
targets_tok["attention_mask"],
)
for cache_key, log_softmax, target_tok, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tok = target_tok[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tok).all()
target_logits = torch.gather(
log_softmax, 1, target_tok.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return res
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
if num_fewshot == 0:
generations = self.t5.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.t5.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations[0]
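A hedged sketch of driving the removed T5LM wrapper's loglikelihood path: requests are (encoder input, target) string pairs, and each result is a (summed target log-prob, greedy-match flag) tuple. The checkpoint name and prompt are assumptions, and the call downloads t5-small on first use:

lm = T5LM(device="cpu", pretrained="t5-small", batch_size=1)
pairs = [
    ("translate English to German: The house is wonderful.", "Das Haus ist wunderbar."),
]
print(lm.loglikelihood(pairs))  # -> [(summed log-prob of the target, is_greedy flag)]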
"""
A dataset of approximately 200K news headlines from the year 2012 to 2018 collected from HuffPost.
Homepage: https://www.kaggle.com/datasets/rmisra/news-category-dataset
"""
from lm_eval.base import PromptSourceTask
_CITATION = """\
@book{book,
author = {Misra, Rishabh and Grover, Jigyasa},
year = {2021},
month = {01},
pages = {},
title = {Sculpting Data for ML: The first act of Machine Learning},
isbn = {978-0-578-83125-1}
}
@dataset{dataset,
author = {Misra, Rishabh},
year = {2018},
month = {06},
pages = {},
title = {News Category Dataset},
doi = {10.13140/RG.2.2.20331.18729}
}
"""
class HuffPost(PromptSourceTask):
VERSION = 0
DATASET_PATH = "khalidalt/HuffPost"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
-from promptsource.templates import DatasetTemplates
 from pprint import pprint
 from typing import List, Union
 import sacrebleu
 import lm_eval.base
 from . import superglue
 from . import glue
 from . import arc
@@ -54,27 +52,15 @@ from . import blimp
 from . import asdiv
 from . import gsm8k
 from . import storycloze
-from . import hans
-from . import gem_webnlg
-from . import lama
-# from . import e2e_nlg_cleaned
-from . import gem_xsum
-from . import gem_mlsum
-from . import wino_bias
-from . import e2e_nlg_cleaned
-from . import gem_asset_turk
-from . import crows_pairs_multilingual
-from . import lama
-from . import HuffPost
 ########################################
 # Translation tasks
 ########################################
 # 6 total
 gpt3_translation_benchmarks = {
-    "wmt14": ["en-fr", "fr-en"], # French
-    "wmt16": ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian
+    "wmt14": ['en-fr', 'fr-en'], # French
+    "wmt16": ['en-ro', 'ro-en', 'de-en', 'en-de'], # German, Romanian
 }
@@ -82,7 +68,7 @@ gpt3_translation_benchmarks = {
 selected_translation_benchmarks = {
     **gpt3_translation_benchmarks,
     "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
-    "iwslt17": ["en-ar", "ar-en"], # Arabic
+    "iwslt17": ['en-ar', 'ar-en'] # Arabic
 }
 # 319 total
@@ -106,7 +92,7 @@ TASK_REGISTRY = {
     "rte": glue.RTE,
     "qnli": glue.QNLI,
     "qqp": glue.QQP,
-    # "stsb": glue.STSB, # not implemented yet
+    #"stsb": glue.STSB, # not implemented yet
     "sst": glue.SST,
     "wnli": glue.WNLI,
     # SuperGLUE
@@ -117,37 +103,38 @@ TASK_REGISTRY = {
     "record": superglue.ReCoRD,
     "wic": superglue.WordsInContext,
     "wsc": superglue.SGWinogradSchemaChallenge,
     # Order by benchmark/genre?
     "coqa": coqa.CoQA,
     "drop": drop.DROP,
     "lambada": lambada.LAMBADA,
     "lambada_cloze": lambada_cloze.LAMBADA_cloze,
-    **gem_webnlg.construct_tasks(),
     # multilingual lambada
-    **gem_asset_turk.construct_tasks(),
     **lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText, "wikitext": wikitext.WikiText,
# "cbt-cn": cbt.CBTCN, # disabled pending context length fix # "cbt-cn": cbt.CBTCN, # disabled pending context length fix
# "cbt-ne": cbt.CBTNE, # disabled pending context length fix # "cbt-ne": cbt.CBTNE, # disabled pending context length fix
"piqa": piqa.PiQA, "piqa": piqa.PiQA,
"prost": prost.PROST, "prost": prost.PROST,
"mc_taco": mc_taco.MCTACO, "mc_taco": mc_taco.MCTACO,
# Science related # Science related
"pubmedqa": pubmedqa.Pubmed_QA, "pubmedqa" : pubmedqa.Pubmed_QA,
"sciq": sciq.SciQ, "sciq" : sciq.SciQ,
"e2e_nlg_cleaned": e2e_nlg_cleaned.E2E_NLG_Cleaned,
"qasper": qasper.QASPER, "qasper": qasper.QASPER,
"qa4mre_2011": qa4mre.QA4MRE_2011,
"qa4mre_2012": qa4mre.QA4MRE_2012, "qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2013": qa4mre.QA4MRE_2013, "qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
"triviaqa": triviaqa.TriviaQA, "triviaqa": triviaqa.TriviaQA,
"arc_easy": arc.ARCEasy, "arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge, "arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet # "quac": quac.QuAC, # not implemented yet
"lama_trex": lama.Trex,
"lama_squad": lama.Squad,
"lama_google_re": lama.google_re,
"lama_concptnet": lama.Conceptnet,
"logiqa": logiqa.LogiQA, "logiqa": logiqa.LogiQA,
"hellaswag": hellaswag.HellaSwag, "hellaswag": hellaswag.HellaSwag,
"swag": swag.SWAG, "swag": swag.SWAG,
...@@ -155,7 +142,7 @@ TASK_REGISTRY = { ...@@ -155,7 +142,7 @@ TASK_REGISTRY = {
"squad2": squad.SQuAD2, "squad2": squad.SQuAD2,
"race": race.RACE, "race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet # "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs, "headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn, "headqa_en": headqa.HeadQAEn,
"mathqa": mathqa.MathQA, "mathqa": mathqa.MathQA,
...@@ -165,20 +152,21 @@ TASK_REGISTRY = { ...@@ -165,20 +152,21 @@ TASK_REGISTRY = {
"anli_r1": anli.ANLIRound1, "anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2, "anli_r2": anli.ANLIRound2,
"anli_r3": anli.ANLIRound3, "anli_r3": anli.ANLIRound3,
"hans": hans.HANS,
"ethics_cm": hendrycks_ethics.EthicsCM, "ethics_cm": hendrycks_ethics.EthicsCM,
"ethics_deontology": hendrycks_ethics.EthicsDeontology, "ethics_deontology": hendrycks_ethics.EthicsDeontology,
"ethics_justice": hendrycks_ethics.EthicsJustice, "ethics_justice": hendrycks_ethics.EthicsJustice,
"ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal, "ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal,
"ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism, "ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
"ethics_virtue": hendrycks_ethics.EthicsVirtue, "ethics_virtue": hendrycks_ethics.EthicsVirtue,
#"tydiqa_primary" : TyDiQA.Primary, not implemented yet
#"tydiqa_secondary" : TyDiQA.Secondary, not implemented yet "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
"truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice, "truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
"truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
     # dialogue
     "mutual": mutual.MuTual,
     "mutual_plus": mutual.MuTualPlus,
     # math
     "math_algebra": hendrycks_math.MathAlgebra,
     "math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
@@ -189,6 +177,7 @@ TASK_REGISTRY = {
     "math_precalc": hendrycks_math.MathPrecalculus,
     "math_asdiv": asdiv.Asdiv,
     "gsm8k": gsm8k.GradeSchoolMath8K,
     # arithmetic
     "arithmetic_2da": arithmetic.Arithmetic2DPlus,
     "arithmetic_2ds": arithmetic.Arithmetic2DMinus,
@@ -202,18 +191,22 @@ TASK_REGISTRY = {
     "arithmetic_1dc": arithmetic.Arithmetic1DComposite,
     # TODO Perhaps make these groups of tasks
     # e.g. anli, arithmetic, openai_translations, harness_translations
     # hendrycksTest (57 tasks)
     **hendrycks_test.create_all_tasks(),
     # e.g. wmt14-fr-en
     **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
     # chef's selection, mostly wmt20
     **translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
     # Word Scrambling and Manipulation Tasks
     "anagrams1": unscramble.Anagrams1,
     "anagrams2": unscramble.Anagrams2,
     "cycle_letters": unscramble.CycleLetters,
     "random_insertion": unscramble.RandomInsertion,
     "reversed_words": unscramble.ReversedWords,
     # Pile
     "pile_arxiv": pile.PileArxiv,
     "pile_books3": pile.PileBooks3,
@@ -237,6 +230,7 @@ TASK_REGISTRY = {
     "pile_ubuntu-irc": pile.PileUbuntuIrc,
     "pile_wikipedia": pile.PileWikipedia,
     "pile_youtubesubtitles": pile.PileYoutubeSubtitles,
     # BLiMP
     "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
     "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
@@ -305,45 +299,11 @@ TASK_REGISTRY = {
     "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
     "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
     "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
-    #GEM/mlsum
-    "mlsum_es":gem_mlsum.GEMMLSUMEs,
-    "mlsum_de":gem_mlsum.GEMMLSUMDe,
-    "mlsum_es_covid_challenge_set":gem_mlsum.GEMMLSUMEsChallgeTestCovid,
-    "mlsum_de_covid_challenge_set":gem_mlsum.GEMMLSUMDeChallgeTestCovid,
     # Requires manual download of data.
     # "storycloze_2016": storycloze.StoryCloze2016,
     # "storycloze_2018": storycloze.StoryCloze2018,
     # "sat": sat.SATAnalogies,
-    #GEM/xum
-    "gem_xsum": gem_xsum.GEMXSUM,
-    "gem_xsum_challenge_sample": gem_xsum.GEMXSUMChallgeSample,
-    "gem_xsum_challenge_test_backtranslation": gem_xsum.GEMXSUMChallgeTestBacktranslation,
-    "gem_xsum_challenge_test_bfp_02": gem_xsum.GEMXSUMChallgeTestBFP02,
-    "gem_xsum_challenge_test_bfp_05": gem_xsum.GEMXSUMChallgeTestBFP05,
-    "gem_xsum_challenge_test_nopunc": gem_xsum.GEMXSUMChallgeTestNopunc,
-    "gem_xsum_challenge_test_covid": gem_xsum.GEMXSUMChallgeTestCovid,
-    #LAMA
-    "lama-trex": lama.Trex,
-    "lama-squad": lama.Squad,
-    "lama-google_re": lama.google_re,
-    "lama-concptnet": lama.Conceptnet,
-    "bigscience-lama":lama.BigScienceLAMA,
-    # WinoBias
-    "wino_bias_type1_pro": wino_bias.WinoBiasType1Pro,
-    "wino_bias_type1_anti": wino_bias.WinoBiasType1Anti,
-    "wino_bias_type2_pro": wino_bias.WinoBiasType2Pro,
-    "wino_bias_type2_anti": wino_bias.WinoBiasType2Anti,
-    # Crows-Pairs
-    "crows_pairs_english": crows_pairs_multilingual.CrowsPairsEnglish,
-    "crows_pairs_french": crows_pairs_multilingual.CrowsPairsFrench,
-    # News
-    "huffpost": HuffPost.HuffPost,
 }
@@ -363,51 +323,19 @@ def get_task_name_from_object(task_object):
     for name, class_ in TASK_REGISTRY.items():
         if class_ is task_object:
             return name
     # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
-    return (
-        task_object.EVAL_HARNESS_NAME
-        if hasattr(task_object, "EVAL_HARNESS_NAME")
-        else type(task_object).__name__
-    )
+    return task_object.EVAL_HARNESS_NAME if hasattr(task_object, "EVAL_HARNESS_NAME") else type(task_object).__name__
 def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
     task_name_dict = {
         task_name: get_task(task_name)()
-        for task_name in task_name_list
-        if isinstance(task_name, str)
+        for task_name in task_name_list if isinstance(task_name, str)
     }
     task_name_from_object_dict = {
         get_task_name_from_object(task_object): task_object
-        for task_object in task_name_list
-        if not isinstance(task_object, str)
+        for task_object in task_name_list if not isinstance(task_object, str)
     }
     assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
     return {**task_name_dict, **task_name_from_object_dict}
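A hedged sketch of the restored get_task_dict: registered task names map to freshly constructed Task instances (constructing a task triggers its dataset download via HF datasets); the task names below are assumed examples:

import lm_eval.tasks

task_dict = lm_eval.tasks.get_task_dict(["lambada", "arc_easy"])
print(sorted(task_dict.keys()))  # ['arc_easy', 'lambada']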
def get_task_dict_promptsource(task_name_list: List[str]):
"""Loads a task instance for each prompt written for that task."""
task_name_dict = {}
for task_name in task_name_list:
assert isinstance(task_name, str)
# Static version of the Task Use this to get HF dataset path / name.
static_task_obj = get_task(task_name)
# Create the proper task name arg for DatasetTemplates.
sub_task = (
f"/{static_task_obj.DATASET_NAME}" if static_task_obj.DATASET_NAME else ""
)
ps_task_name = f"{static_task_obj.DATASET_PATH}{sub_task}"
task_prompts = DatasetTemplates(ps_task_name)
for prompt_name in task_prompts.all_template_names:
prompt = task_prompts[prompt_name]
# NOTE: We choose a sep that can be easily split.
task_name_dict[f"{task_name}+{prompt_name}"] = get_task(task_name)(
prompt=prompt
)
return task_name_dict
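For reference, a hedged sketch of what the promptsource-based lookup above resolves to; the dataset path/name "super_glue"/"boolq" is an assumed example:

from promptsource.templates import DatasetTemplates

templates = DatasetTemplates("super_glue/boolq")
for prompt_name in templates.all_template_names:
    # get_task_dict_promptsource keys each task instance as "<task_name>+<prompt_name>"
    print(f"boolq+{prompt_name}")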
@@ -10,7 +10,7 @@ provided explanations.
 Homepage: "https://github.com/facebookresearch/anli"
 """
 import numpy as np
-from lm_eval.base import rf, PromptSourceTask
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -30,7 +30,7 @@ _CITATION = """
 """
-class ANLIBase(PromptSourceTask):
+class ANLIBase(Task):
     VERSION = 0
     DATASET_PATH = "anli"
     DATASET_NAME = None
@@ -59,6 +59,51 @@ class ANLIBase(Task):
         if self.has_test_docs():
             return self.dataset["test_r" + str(self.SPLIT)]
def doc_to_text(self, doc):
# OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " " + ["True", "Neither", "False"][doc['label']]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = doc["label"]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self): def aggregation(self):
""" """
:returns: {str: [float] -> float} :returns: {str: [float] -> float}
...
@@ -58,11 +58,10 @@ class Arithmetic(Task):
     def construct_requests(self, doc, ctx):
         ll, is_prediction = rf.loglikelihood(ctx, doc["completion"])
-        return ll, is_prediction
+        return is_prediction
     def process_results(self, doc, results):
-        print(results)
-        results = results
+        is_prediction, = results
         return {
             "acc": is_prediction
         }
...
@@ -10,7 +10,7 @@ grammars.
 Homepage: https://github.com/alexwarstadt/blimp
 """
-from lm_eval.base import rf, PromptSourceTask
+from lm_eval.base import rf, Task
 from lm_eval.metrics import mean
@@ -31,7 +31,7 @@ _CITATION = """
 """
-class BlimpTask(PromptSourceTask):
+class BlimpTask(Task):
     VERSION = 0
     DATASET_PATH = "blimp"
@@ -50,6 +50,58 @@ class BlimpTask(Task):
         # trained on this data.
         return self.dataset["train"]
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0
assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
return ""
def doc_to_text(self, doc):
# this method is invoked by tests only
return ""
def doc_to_target(self, doc):
# this method is invoked by tests only
return ""
def construct_requests(self, doc, ctx):
assert not ctx
# Calculate the loglikelihood for the good and the bad sentence.
# Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
return [
rf.loglikelihood("", doc["sentence_good"]),
rf.loglikelihood("", doc["sentence_bad"]),
]
def process_results(self, doc, results):
likelihood1, likelihood2 = results
# the model got this case right iff the good sentence scored higher than the bad sentence
acc = 1.0 if likelihood1 > likelihood2 else 0.0
return {
"acc": acc,
}
def higher_is_better(self):
return {
"acc": True,
}
def aggregation(self):
return {
"acc": mean,
}
class BlimpAdjunctIsland(BlimpTask): class BlimpAdjunctIsland(BlimpTask):
DATASET_NAME = "adjunct_island" DATASET_NAME = "adjunct_island"
......
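A self-contained sketch of the minimal-pair rule above; the loglikelihood values are invented, standing in for what rf.loglikelihood("", sentence) would return for the good and bad sentences:

# The model is counted as correct iff the grammatical sentence scores higher.
ll_good, ll_bad = -42.3, -45.9  # hypothetical sentence loglikelihoods
acc = 1.0 if ll_good > ll_bad else 0.0
print({"acc": acc})  # -> {'acc': 1.0}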
...@@ -12,7 +12,7 @@ Homepage: https://stanfordnlp.github.io/coqa/ ...@@ -12,7 +12,7 @@ Homepage: https://stanfordnlp.github.io/coqa/
import inspect import inspect
import transformers.data.metrics.squad_metrics as squad_metrics import transformers.data.metrics.squad_metrics as squad_metrics
import lm_eval.datasets.coqa.coqa import lm_eval.datasets.coqa.coqa
from lm_eval.base import PromptSourceTask, Task, rf, mean from lm_eval.base import Task, rf, mean
from itertools import zip_longest from itertools import zip_longest
...@@ -28,9 +28,9 @@ _CITATION = """ ...@@ -28,9 +28,9 @@ _CITATION = """
""" """
class CoQA(PromptSourceTask): class CoQA(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "coqa" DATASET_PATH = inspect.getfile(lm_eval.datasets.coqa.coqa)
DATASET_NAME = None DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
...@@ -51,21 +51,44 @@ class CoQA(PromptSourceTask): ...@@ -51,21 +51,44 @@ class CoQA(PromptSourceTask):
def test_docs(self): def test_docs(self):
pass pass
# @classmethod def doc_to_text(self, doc):
# def get_answers(cls, doc, turn_id): # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers). # and a question qi, the task is to predict the answer ai
# answers = [] doc_text = doc["story"] + '\n\n'
# answer_forturn = doc["answers"]["input_text"][turn_id - 1] for (q, a) in zip_longest(doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]): # omit target answer ai
# answers.append(answer_forturn) question = f"Q: {q}\n\n"
# additional_answers = doc.get("additional_answers") answer = f"A: {a}\n\n" if a is not None else "A:"
# if additional_answers: doc_text += question + answer
# for key in additional_answers: return doc_text
# additional_answer_for_turn = additional_answers[key]["input_text"][
# turn_id - 1 @classmethod
# ] def get_answers(cls, doc, turn_id):
# if additional_answer_for_turn.lower() not in map(str.lower, answers): # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
# answers.append(additional_answer_for_turn) answers = []
# return answers answer_forturn = doc["answers"]["input_text"][turn_id - 1]
answers.append(answer_forturn)
additional_answers = doc.get("additional_answers")
if additional_answers:
for key in additional_answers:
additional_answer_for_turn = additional_answers[key]["input_text"][turn_id - 1]
if additional_answer_for_turn.lower() not in map(str.lower, answers):
answers.append(additional_answer_for_turn)
return answers
@classmethod
def get_answer_choice(self, raw_text):
# Function maps answers to CoQA answer categories
# ~ 1/5 of the CoQA answers are Yes/No
# ~ 2/3 of the CoQA answers are span-based
# (answers overlap with the passage ignoring punctuation and case mismatch)
if raw_text == "unknown":
return '0'
if squad_metrics.normalize_answer(raw_text) == "yes":
return '1'
if squad_metrics.normalize_answer(raw_text) == "no":
return '2'
return '3' # Not a yes/no question
@staticmethod @staticmethod
def compute_scores(gold_list, pred): def compute_scores(gold_list, pred):
...@@ -75,40 +98,40 @@ class CoQA(PromptSourceTask): ...@@ -75,40 +98,40 @@ class CoQA(PromptSourceTask):
em_sum = 0.0 em_sum = 0.0
if len(gold_list) > 1: if len(gold_list) > 1:
for i in range(len(gold_list)): for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :] gold_answers = gold_list[0:i] + gold_list[i + 1:]
# predictions compared against (n) golds and take maximum # predictions compared against (n) golds and take maximum
em_sum += max( em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
squad_metrics.compute_exact(a, pred) for a in gold_answers
)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
else: else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list) em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list) f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)
return { return {'em': em_sum / max(1, len(gold_list)), 'f1': f1_sum / max(1, len(gold_list))}
"em": em_sum / max(1, len(gold_list)),
"f1": f1_sum / max(1, len(gold_list)),
}
# def stopping_criteria(self): def doc_to_target(self, doc, turnid=None):
# return "\n\n" # Default to prediction of last turn.
if turnid is None:
turnid = len(doc["questions"]["input_text"])
raw_text = doc['answers']["input_text"][turnid - 1]
return " " + raw_text
# def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of """ Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM. Requests which will be sent to the LM.
# :param doc: :param doc:
# The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str :param ctx: str
# The context string, generated by fewshot_context. This includes the natural The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
# part of the document for `doc`. part of the document for `doc`.
# """ """
# return cont_request cont_request = rf.greedy_until(ctx, ['\nQ:'])
return cont_request
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of dict where keys are the names of submetrics and values are the values of
the metric for that one document the metric for that one document
:param doc: :param doc:
...@@ -116,19 +139,16 @@ class CoQA(PromptSourceTask): ...@@ -116,19 +139,16 @@ class CoQA(PromptSourceTask):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
target = self.doc_to_target(doc).strip() turn_id = len(doc["questions"]["input_text"])
pred = results[0].strip().split("\n")[0] gold_list = self.get_answers(doc, turn_id)
scores = self.compute_scores([target], pred) pred = results[0].strip().split('\n')[0]
out = { scores = self.compute_scores(gold_list, pred)
"f1": scores["f1"],
"em": scores["em"],
}
if self.save_examples: return {
example = {"target": target, "pred": pred} "f1": scores['f1'],
return out, example "em": scores['em'],
return out }
def higher_is_better(self): def higher_is_better(self):
return { return {
......
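A toy, self-contained version of the leave-one-out exact-match logic in compute_scores, with a whitespace-based matcher standing in for squad_metrics.compute_exact:

def toy_exact(a, b):
    # Crude stand-in for squad_metrics.compute_exact: lowercase, whitespace tokenize.
    return float(a.lower().split() == b.lower().split())

def em_over_golds(gold_list, pred):
    # With multiple golds, each gold is held out and pred is scored against the rest,
    # then the per-gold maxima are averaged, mirroring compute_scores above.
    if len(gold_list) > 1:
        total = 0.0
        for i in range(len(gold_list)):
            others = gold_list[:i] + gold_list[i + 1:]
            total += max(toy_exact(g, pred) for g in others)
        return total / len(gold_list)
    return max(toy_exact(g, pred) for g in gold_list)

print(em_over_golds(["the cat", "a cat", "cat"], "a cat"))  # -> 0.666...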
"""
French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than English
https://hal.inria.fr/hal-03629677/file/ACLFinal.pdf
Measuring social biases in masked language models in English and French.
https://gitlab.inria.fr/french-crows-pairs/acl-2022-paper-data-and-code/-/tree/main
"""
from lm_eval.base import PromptSourceTask
_CITATION = """\
@inproceedings{neveol2022french,
title={French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than English},
author={N{\'e}v{\'e}ol, Aur{\'e}lie and Dupont, Yoann and Bezan{\c{c}}on, Julien and Fort, Kar{\"e}n},
booktitle={ACL 2022-60th Annual Meeting of the Association for Computational Linguistics},
year={2022}
"""
class CrowsPairsEnglish(PromptSourceTask):
VERSION = 0
DATASET_PATH = "oskarvanderwal/crows_pairs_multilingual"
DATASET_NAME = "english"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class CrowsPairsFrench(PromptSourceTask):
VERSION = 0
DATASET_PATH = "oskarvanderwal/crows_pairs_multilingual"
DATASET_NAME = "french"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
...@@ -18,7 +18,7 @@ import re ...@@ -18,7 +18,7 @@ import re
import string import string
import lm_eval.datasets.drop.drop import lm_eval.datasets.drop.drop
from scipy.optimize import linear_sum_assignment from scipy.optimize import linear_sum_assignment
from lm_eval.base import PromptSourceTask, rf from lm_eval.base import Task, rf
from lm_eval.metrics import mean from lm_eval.metrics import mean
...@@ -37,9 +37,9 @@ _CITATION = """ ...@@ -37,9 +37,9 @@ _CITATION = """
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE) _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
class DROP(PromptSourceTask): class DROP(Task):
VERSION = 1 VERSION = 1
DATASET_PATH = "drop" # inspect.getfile(lm_eval.datasets.drop.drop) DATASET_PATH = inspect.getfile(lm_eval.datasets.drop.drop)
DATASET_NAME = None DATASET_NAME = None
def has_training_docs(self): def has_training_docs(self):
...@@ -52,13 +52,46 @@ class DROP(PromptSourceTask): ...@@ -52,13 +52,46 @@ class DROP(PromptSourceTask):
return False return False
def training_docs(self): def training_docs(self):
# if self._training_docs is None: if self._training_docs is None:
# self._training_docs = list() self._training_docs = list(map(self._process_doc, self.dataset["train"]))
# return self._training_docs return self._training_docs
return self.dataset["train"]
def validation_docs(self): def validation_docs(self):
return self.dataset["validation"] return map(self._process_doc, self.dataset["validation"])
def _process_doc(self, doc):
return {
"id": doc["query_id"],
"passage": doc["passage"],
"question": doc["question"],
"answers": self.get_answers(doc),
}
@classmethod
def get_answers(cls, qa):
def _flatten_validated_answers(validated_answers):
""" Flattens a dict of lists of validated answers.
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
vas = []
for i in range(len(validated_answers["number"])):
vas.append({
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
})
return vas
answers = []
answers_set = set()
candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"])
for candidate in candidates:
answer = cls.parse_answer(candidate)
if answer in answers_set:
continue
answers_set.add(answer)
answers.append(answer)
return answers
@classmethod @classmethod
def parse_answer(cls, answer): def parse_answer(cls, answer):
...@@ -67,31 +100,29 @@ class DROP(PromptSourceTask): ...@@ -67,31 +100,29 @@ class DROP(PromptSourceTask):
return (str(answer["number"]),) return (str(answer["number"]),)
if answer["spans"] != []: if answer["spans"] != []:
return tuple(answer["spans"]) return tuple(answer["spans"])
return ( return (" ".join([answer["date"]["day"],
" ".join( answer["date"]["month"],
[answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]] answer["date"]["year"]]).strip(),)
).strip(),
)
# def doc_to_text(self, doc): def doc_to_text(self, doc):
# return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:" return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
# def doc_to_target(self, doc): def doc_to_target(self, doc):
# return " " + ", ".join(doc["answers"][0]) return " " + ", ".join(doc["answers"][0])
# def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of """Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM. Requests which will be sent to the LM.
# :param doc: :param doc:
# The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str :param ctx: str
# The context string, generated by fewshot_context. This includes the natural The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question language description, as well as the few shot examples, and the question
# part of the document for `doc`. part of the document for `doc`.
# """ """
# conts = [rf.greedy_until(ctx, ["."])] conts = [rf.greedy_until(ctx, ["."])]
# return conts return conts
def process_results(self, doc, results): def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a """Take a single document and the LM results and evaluates, returning a
...@@ -103,21 +134,7 @@ class DROP(PromptSourceTask): ...@@ -103,21 +134,7 @@ class DROP(PromptSourceTask):
:param results: :param results:
The results of the requests created in construct_requests. The results of the requests created in construct_requests.
""" """
preds, golds = results, doc["answers"]
pred = results[0].strip()
target = self.doc_to_target(doc).strip()
print("*" * 80)
print(f"DOC: {doc}")
print(f"PS: {self.prompt.apply(doc)}")
print(f"TEXT: {self.doc_to_text(doc)}")
print(f"TARGET: {target} END TARGET")
print(f"PRED: {pred} END PRED")
print("*" * 80)
preds = [pred]
golds = [target]
max_em = 0 max_em = 0
max_f1 = 0 max_f1 = 0
for gold_answer in golds: for gold_answer in golds:
...@@ -125,7 +142,10 @@ class DROP(PromptSourceTask): ...@@ -125,7 +142,10 @@ class DROP(PromptSourceTask):
if gold_answer[0].strip(): if gold_answer[0].strip():
max_em = max(max_em, exact_match) max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score) max_f1 = max(max_f1, f1_score)
return {"em": max_em, "f1": max_f1} return {
"em": max_em,
"f1": max_f1
}
def get_metrics(self, predicted, gold): def get_metrics(self, predicted, gold):
""" """
...@@ -138,9 +158,7 @@ class DROP(PromptSourceTask): ...@@ -138,9 +158,7 @@ class DROP(PromptSourceTask):
predicted_bags = self._answer_to_bags(predicted) predicted_bags = self._answer_to_bags(predicted)
gold_bags = self._answer_to_bags(gold) gold_bags = self._answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len( if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
predicted_bags[0]
) == len(gold_bags[0]):
exact_match = 1.0 exact_match = 1.0
else: else:
exact_match = 0.0 exact_match = 0.0
...@@ -172,9 +190,7 @@ class DROP(PromptSourceTask): ...@@ -172,9 +190,7 @@ class DROP(PromptSourceTask):
for gold_index, gold_item in enumerate(gold): for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted): for pred_index, pred_item in enumerate(predicted):
if self._match_numbers_if_present(gold_item, pred_item): if self._match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = self._compute_f1( scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
pred_item, gold_item
)
row_ind, col_ind = linear_sum_assignment(-scores) row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))]) max_scores = np.zeros([max(len(gold), len(predicted))])
...@@ -240,11 +256,7 @@ class DROP(PromptSourceTask): ...@@ -240,11 +256,7 @@ class DROP(PromptSourceTask):
def _normalize(self, answer): def _normalize(self, answer):
tokens = [ tokens = [
self._white_space_fix( self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
self._remove_articles(
self._fix_number(self._remove_punc(token.lower()))
)
)
for token in self._tokenize(answer) for token in self._tokenize(answer)
] ]
tokens = [token for token in tokens if token.strip()] tokens = [token for token in tokens if token.strip()]
...@@ -257,7 +269,10 @@ class DROP(PromptSourceTask): ...@@ -257,7 +269,10 @@ class DROP(PromptSourceTask):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metrics
""" """
return {"em": mean, "f1": mean} return {
"em": mean,
"f1": mean
}
def higher_is_better(self): def higher_is_better(self):
""" """
...@@ -265,4 +280,7 @@ class DROP(PromptSourceTask): ...@@ -265,4 +280,7 @@ class DROP(PromptSourceTask):
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better whether a higher value of the submetric is better
""" """
return {"em": True, "f1": True} return {
"em": True,
"f1": True
}
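A standalone sketch of the column-to-row flattening described in get_answers above, using a made-up validated_answers record:

# DROP stores validated answers column-wise; the harness re-rows them so each
# candidate answer can be parsed and deduplicated independently.
validated = {
    "number": ["1", "8"],
    "date": [{"day": "", "month": "", "year": ""}, {"day": "", "month": "", "year": ""}],
    "spans": [[], []],
}
flattened = [
    {"number": validated["number"][i],
     "date": validated["date"][i],
     "spans": validated["spans"][i]}
    for i in range(len(validated["number"]))
]
print([v["number"] for v in flattened])  # -> ['1', '8']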
"""
Semantic Noise Matters for Neural Natural Language Generation
http://arxiv.org/abs/1911.03905
A cleaned version of the dataset from the E2E NLG Challenge.
The dataset contains MR with restaurant attributes and corresponding descriptions.
Homepage: https://github.com/tuetschek/e2e-cleaning
"""
from lm_eval.base import PromptSourceTask, rf
from lm_eval import metrics
_CITATION = """
@inproceedings{dusek-etal-2019-semantic,
title = "Semantic Noise Matters for Neural Natural Language Generation",
author = "Du{\v{s}}ek, Ond{\v{r}}ej and
Howcroft, David M. and
Rieser, Verena",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8652",
doi = "10.18653/v1/W19-8652",
pages = "421--426",
}
"""
# Work in progress
class E2E_NLG_Cleaned(PromptSourceTask):
VERSION = 0
DATASET_PATH = "e2e_nlg_cleaned"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
# We cache training documents in `self._training_docs` for faster
# few-shot processing. If the data is too large to fit in memory,
# return the training data as a generator instead of a list.
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
def max_generation_length(self):
return 64
def invalid_doc_for_prompt(self, doc) -> bool:
"""The QA prompts are not applicable to all the examples, we want to filter these out."""
return self.prompt.name.endswith("_qa") or self.prompt.name == "family_friendly_yes_no"
def doc_to_text(self, doc) -> str:
# If the response is not defined in the promptsource template, apply() returns a single-element list containing an empty string.
text = self.prompt.apply(doc)[0]
return text
def construct_requests(self, doc, ctx, args):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
_requests = []
# NOTE: In the future, target will be a list of strings.
request_args = {
"stopping_criteria": self.stopping_criteria(),
"max_generation_length": self.max_generation_length(),
"num_fewshot": args["num_fewshot"],
}
# Skip examples for which the templates are not applicable
if ctx != "":
cont_request = rf.greedy_until(ctx, request_args)
_requests.append(cont_request)
return _requests
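A minimal sketch of the "skip inapplicable templates" check in construct_requests, with a stub standing in for the promptsource template's apply and invented restaurant records:

def apply_template_stub(doc):
    # Stand-in for self.prompt.apply(doc)[0]: returns "" when the template
    # does not apply to this example (e.g. a family-friendly QA prompt).
    return f"Describe: {doc['name']}" if doc.get("family_friendly") else ""

requests = []
for doc in [{"name": "Blue Spice"}, {"name": "Aromi", "family_friendly": "yes"}]:
    ctx = apply_template_stub(doc)
    if ctx != "":  # mirrors the ctx != "" guard above
        requests.append(("greedy_until", ctx))
print(len(requests))  # -> 1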
"""
ASSET: ASSET (Alva-Manchego et al., 2020) is multi-reference dataset
for the evaluation of sentence simplification in English. The dataset
uses the same 2,359 sentences from TurkCorpus (Xu et al., 2016)
and each sentence is associated with 10 crowdsourced simplifications.
Unlike previous simplification datasets, which contain a single
transformation (e.g., lexical paraphrasing in TurkCorpus or sentence
splitting in HSplit), the simplifications in ASSET encompass a variety
of rewriting transformations.
https://aclanthology.org/2020.acl-main.424.pdf
TurkCorpus: TURKCorpus is a multi-reference dataset for the evaluation of
sentence simplification in English. The dataset consists of 2,359 sentences
from the Parallel Wikipedia Simplification (PWKP) corpus. Each sentence is
associated with 8 crowdsourced simplifications that focus on only lexical
paraphrasing (no sentence splitting or deletion).
https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@article{DBLP:journals/corr/abs-2005-00481,
author = {Fernando Alva{-}Manchego and
Louis Martin and
Antoine Bordes and
Carolina Scarton and
Beno{\^{\i}}t Sagot and
Lucia Specia},
title = {{ASSET:} {A} Dataset for Tuning and Evaluation of Sentence Simplification
Models with Multiple Rewriting Transformations},
journal = {CoRR},
volume = {abs/2005.00481},
year = {2020},
url = {https://arxiv.org/abs/2005.00481},
eprinttype = {arXiv},
eprint = {2005.00481},
timestamp = {Thu, 14 Oct 2021 16:38:25 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2005-00481.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}"""
""""@article{Xu-EtAl:2016:TACL,
author = {Wei Xu and Courtney Napoles and Ellie Pavlick and Quanze Chen and Chris Callison-Burch},
title = {Optimizing Statistical Machine Translation for Text Simplification},
journal = {Transactions of the Association for Computational Linguistics},
volume = {4},
year = {2016},
url = {https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf},
pages = {401--415}
}"""
class AssetTurk(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/wiki_auto_asset_turk"
DATASET_NAME = None
SPLIT = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
return self.dataset[str(self.SPLIT)]
def max_generation_length(self):
return 200
class AssetTest(AssetTurk):
SPLIT = "test_asset"
class TurkTest(AssetTurk):
SPLIT = "test_turk"
class AssetTest1(AssetTurk):
SPLIT = "challenge_test_asset_backtranslation"
class AssetTest2(AssetTurk):
SPLIT = "challenge_test_asset_bfp02"
class AssetTest3(AssetTurk):
SPLIT = "challenge_test_asset_bfp05"
class AssetTest4(AssetTurk):
SPLIT = "challenge_test_asset_nopunc"
class TurkTest1(AssetTurk):
SPLIT = "challenge_test_turk_backtranslation"
class TurkTest2(AssetTurk):
SPLIT = "challenge_test_turk_bfp02"
class TurkTest3(AssetTurk):
SPLIT = "challenge_test_turk_bfp05"
class TurkTest4(AssetTurk):
SPLIT = "challenge_test_turk_nopunc"
ASSET_TURK_CLASSES = [
AssetTest,
TurkTest,
TurkTest1,
TurkTest2,
TurkTest3,
TurkTest4,
AssetTest1,
AssetTest2,
AssetTest3,
AssetTest4,
]
def construct_tasks():
tasks = {}
for asset_turk_class in ASSET_TURK_CLASSES:
tasks[f"GEM/wiki_auto_asset_turk_{asset_turk_class.SPLIT}"] = asset_turk_class
return tasks
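A compact sketch of the registry pattern that construct_tasks implements, with placeholder classes standing in for the split-specific subclasses above:

class _Base:
    SPLIT = None

class _TestAsset(_Base):
    SPLIT = "test_asset"

class _TestTurk(_Base):
    SPLIT = "test_turk"

# One task name per challenge split, keyed by the class-level SPLIT attribute.
registry = {f"GEM/wiki_auto_asset_turk_{cls.SPLIT}": cls for cls in (_TestAsset, _TestTurk)}
print(sorted(registry))
# -> ['GEM/wiki_auto_asset_turk_test_asset', 'GEM/wiki_auto_asset_turk_test_turk']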
"""
MLSUM: The Multilingual Summarization Corpus
https://aclanthology.org/2020.emnlp-main.647/
This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset.
Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.
Together with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.
We report cross-lingual comparative analyses based on state-of-the-art systems.
These highlight existing biases which motivate the use of a multi-lingual dataset.
Homepage: https://gitlab.lip6.fr/scialom/mlsum_data/-/raw/master/MLSUM/
"""
from numpy import True_
from lm_eval.base import PromptSourceTask
_CITATION = """
@article{scialom2020mlsum,
title={MLSUM: The Multilingual Summarization Corpus},
author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo},
journal={arXiv preprint arXiv:2004.14900},
year={2020}
}
"""
class GEMMLSUMEsBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/mlsum"
DATASET_NAME = "es"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class GEMMLSUMEs(GEMMLSUMEsBase):
'''This is for the standard train/validation/test splits.'''
SPLIT = ''
class GEMMLSUMEsChallgeTestCovid(GEMMLSUMEsBase):
'''This is for the challenge_test_covid split.'''
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMMLSUMDeBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/mlsum"
DATASET_NAME = "de"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class GEMMLSUMDe(GEMMLSUMDeBase):
'''This is for the standard train/validation/test splits.'''
SPLIT = ''
class GEMMLSUMDeChallgeTestCovid(GEMMLSUMDeBase):
'''This is for the challenge_test_covid split.'''
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
"""
The 2020 Bilingual, Bi-Directional WebNLG+ Shared Task:
Overview and Evaluation Results (WebNLG+ 2020)
https://aclanthology.org/2020.webnlg-1.7/
WebNLG+ offers two challenges: (i) mapping sets of RDF triples
to English or Russian text (generation) and (ii) converting
English or Russian text to sets of RDF triples (semantic parsing).
Compared to the eponymous WebNLG challenge, WebNLG+ provides an
extended dataset that enables the training, evaluation, and
comparison of microplanners and semantic parsers. In this paper,
we present the results of the generation and semantic parsing
task for both English and Russian and provide a brief
description of the participating systems.
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@inproceedings{castro-ferreira-etal-2020-2020,
title = "The 2020 Bilingual, Bi-Directional {W}eb{NLG}+ Shared Task: Overview and Evaluation Results ({W}eb{NLG}+ 2020)",
author = "Castro Ferreira, Thiago and
Gardent, Claire and
Ilinykh, Nikolai and
van der Lee, Chris and
Mille, Simon and
Moussallem, Diego and
Shimorina, Anastasia",
booktitle = "Proceedings of the 3rd International Workshop on Natural Language Generation from the Semantic Web (WebNLG+)",
month = "12",
year = "2020",
address = "Dublin, Ireland (Virtual)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.webnlg-1.7",
pages = "55--76",
abstract = "WebNLG+ offers two challenges: (i) mapping sets of RDF triples to English or Russian text (generation) and (ii) converting English or Russian text to sets of RDF triples (semantic parsing). Compared to the eponymous WebNLG challenge, WebNLG+ provides an extended dataset that enable the training, evaluation, and comparison of microplanners and semantic parsers. In this paper, we present the results of the generation and semantic parsing task for both English and Russian and provide a brief description of the participating systems.",
}
"""
class WebNLG(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/web_nlg"
DATASET_NAME = "en"
SPLIT = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
if self.SPLIT is not None:
return self.dataset[str(self.SPLIT)]
else:
return self.dataset["test"]
def max_generation_length(self):
return 250
class WebNLGRu(WebNLG):
DATASET_NAME = "ru"
## En Challenge Sets
class WebNLGEn1(WebNLG):
SPLIT = "challenge_validation_sample"
class WebNLGEn2(WebNLG):
SPLIT = "challenge_test_scramble"
class WebNLGEn3(WebNLG):
SPLIT = "challenge_test_numbers"
## Ru Challenge Sets
class WebNLGRu1(WebNLG):
DATASET_NAME = "ru"
SPLIT = "challenge_validation_sample"
class WebNLGRu2(WebNLG):
DATASET_NAME = "ru"
SPLIT = "challenge_test_scramble"
WEBNLG_CLASSES = [
WebNLG,
WebNLGRu,
WebNLGEn1,
WebNLGEn2,
WebNLGEn3,
WebNLGRu1,
WebNLGRu2,
]
def construct_tasks():
tasks = {}
for webnlg_class in WEBNLG_CLASSES:
if webnlg_class.SPLIT is None:
tasks[f"GEM/web_nlg_{webnlg_class.DATASET_NAME}"] = webnlg_class
else:
tasks[
f"GEM/web_nlg_{webnlg_class.DATASET_NAME}_{webnlg_class.SPLIT}"
] = webnlg_class
return tasks
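A small sketch of the naming scheme used by construct_tasks above: base tasks are keyed by language only, while challenge sets get the split appended. The helper name here is hypothetical:

def webnlg_task_name(dataset_name, split):
    # Mirrors the two branches in construct_tasks: no suffix for the default
    # test split, otherwise "<language>_<challenge split>".
    if split is None:
        return f"GEM/web_nlg_{dataset_name}"
    return f"GEM/web_nlg_{dataset_name}_{split}"

print(webnlg_task_name("en", None))                       # -> GEM/web_nlg_en
print(webnlg_task_name("ru", "challenge_test_scramble"))  # -> GEM/web_nlg_ru_challenge_test_scramble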