Unverified commit cf074822, authored by Stella Biderman and committed by GitHub

Merge pull request #316 from jon-tow/master

Revert "Merge branch 'master' into master"
parents 5fe7e2c0 7585ec56
import abc
from typing import Iterable, Optional
import promptsource
from typing import Iterable
import numpy as np
import random
import re
......@@ -14,7 +12,6 @@ from tqdm import tqdm
import torch
import torch.nn.functional as F
from lm_eval import metrics
from lm_eval.metrics import mean, weighted_perplexity, weighted_mean, bits_per_byte
from lm_eval import utils
from abc import abstractmethod
......@@ -27,17 +24,17 @@ class LM(abc.ABC):
@abstractmethod
def loglikelihood(self, requests):
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
LM calls whenever possible.
:param requests: list
A list of pairs (context, continuation)
context: str
Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str
The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation.
For example, context="hello" continuation=" world" is correct.
:return: list
A list of pairs (logprob, isgreedy)
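# Illustrative sketch of the request/response shape described in the docstring
# above; all values below are invented.
example_requests = [
    ("The capital of France is", " Paris"),  # (context, continuation)
    ("", "Hello world"),                     # empty context must be supported
]
# A conforming implementation would return one (logprob, isgreedy) pair per
# request, e.g. [(-1.9, True), (-41.7, False)].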
......@@ -100,7 +97,7 @@ class LM(abc.ABC):
context: str
Context string
until: [str]
The string sequences to generate until. These string sequences
may each span across multiple tokens, or may be part of one token.
:return: list
A list of continuation strings
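# Illustrative sketch of the greedy_until request format described above;
# the context and stop strings are invented.
example_requests = [
    ("Q: What is 2 + 2?\nA:", ["\n"]),  # (context, until)
]
# A conforming implementation would return one continuation string per
# request, e.g. [" 4"].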
......@@ -121,10 +118,6 @@ class LM(abc.ABC):
class BaseLM(LM):
@property
@abstractmethod
def eot_token(self):
pass
@property
@abstractmethod
......@@ -152,16 +145,13 @@ class BaseLM(LM):
pass
@abstractmethod
def tok_encode(self, string: str):
pass
def tok_encode(self, string: str): pass
@abstractmethod
def tok_decode(self, tokens: Iterable[int]):
pass
def tok_decode(self, tokens: Iterable[int]): pass
@abstractmethod
def _model_generate(self, context, max_length, eos_token_id):
pass
def _model_generate(self, context, max_length, eos_token_id): pass
@abstractmethod
def _model_call(self, inps):
......@@ -197,30 +187,23 @@ class BaseLM(LM):
# TODO: automatic batch size detection for vectorization
loglikelihoods = []
for (string,) in tqdm(requests):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
for string, in tqdm(requests):
rolling_token_windows = list(map(utils.make_disjoint_window, utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
)))
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: extract out this call so it only gets called once and also somehow figure out partial caching for
# that
string_nll = self._loglikelihood_tokens(
rolling_token_windows, disable_tqdm=True
)
string_nll = self._loglikelihood_tokens(rolling_token_windows, disable_tqdm=True)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
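# Minimal sketch of the disjoint rolling-window decomposition used above
# (an illustration with context_len=1, not the exact utils implementation):
# every token is predicted exactly once, conditioned on at most one preceding
# token, or on prefix_token for the first window.
def rolling_windows(tokens, prefix_token, max_seq_len):
    windows = []
    for start in range(0, len(tokens), max_seq_len):
        pred = tokens[start:start + max_seq_len]
        context = [prefix_token] if start == 0 else [tokens[start - 1]]
        windows.append((context, pred))
    return windows

# rolling_windows(list(range(10)), prefix_token=-1, max_seq_len=4)
# -> [([-1], [0, 1, 2, 3]), ([3], [4, 5, 6, 7]), ([7], [8, 9])]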
......@@ -240,12 +223,10 @@ class BaseLM(LM):
toks = x[1] + x[2]
return -len(toks), tuple(toks)
# TODO: automatic (variable) batch size detection for vectorization
reord = utils.Reorderer(requests, _collate)
for chunk in utils.chunks(
tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size
):
for chunk in utils.chunks(tqdm(reord.get_reordered(), disable=disable_tqdm), self.batch_size):
inps = []
cont_toks_list = []
inplens = []
......@@ -271,60 +252,44 @@ class BaseLM(LM):
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
(context_enc + continuation_enc)[-(self.max_length+1):][:-1],
dtype=torch.long
).to(self.device)
(inplen,) = inp.shape
inplen, = inp.shape
cont = continuation_enc
# since in _collate we make sure length is descending, the longest is always the first one.
padding_length = (
padding_length if padding_length is not None else inplen
)
padding_length = padding_length if padding_length is not None else inplen
# pad length from seq to padding_length
inp = torch.cat(
[
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(
inp.device
), # [padding_length - seq]
],
dim=0,
)
inp = torch.cat([
inp, # [seq]
torch.zeros(padding_length - inplen, dtype=torch.long).to(inp.device) # [padding_length - seq]
], dim=0)
inps.append(inp.unsqueeze(0)) # [1, padding_length]
cont_toks_list.append(cont)
inplens.append(inplen)
batched_inps = torch.cat(inps, dim=0) # [batch, padding_length]
multi_logits = F.log_softmax(
self._model_call(batched_inps), dim=-1
).cpu() # [batch, padding_length, vocab]
multi_logits = F.log_softmax(self._model_call(batched_inps), dim=-1).cpu() # [batch, padding_length, vocab]
for (cache_key, _, _), logits, inp, inplen, cont_toks in zip(
chunk, multi_logits, inps, inplens, cont_toks_list
):
for (cache_key, _, _), logits, inp, inplen, cont_toks \
in zip(chunk, multi_logits, inps, inplens, cont_toks_list):
# Slice to original seq length
contlen = len(cont_toks)
logits = logits[inplen - contlen : inplen].unsqueeze(
0
) # [1, seq, vocab]
logits = logits[inplen-contlen:inplen].unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
0
) # [1, seq]
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
......@@ -336,9 +301,9 @@ class BaseLM(LM):
res.append(answer)
return reord.get_original(res)
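# Self-contained sketch of the continuation-scoring step above, with a tiny
# fake vocabulary and random logits standing in for a real model call.
import torch
import torch.nn.functional as F

cont_toks = torch.tensor([[2, 4, 1]])                     # [1, seq]
fake_logits = torch.randn(1, cont_toks.shape[1], 5)       # [1, seq, vocab]
logprobs = F.log_softmax(fake_logits, dim=-1)             # [1, seq, vocab]
is_greedy = bool((logprobs.argmax(dim=-1) == cont_toks).all())
token_logprobs = torch.gather(logprobs, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
answer = (float(token_logprobs.sum()), is_greedy)         # (logprob, is-exact-match)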
def greedy_until(self, requests):
# TODO: implement fully general `until` that handles untils that are
# multiple tokens or that span multiple tokens correctly
# TODO: extract to TokenizedLM?
......@@ -347,55 +312,29 @@ class BaseLM(LM):
def _collate(x):
toks = self.tok_encode(x[0])
return len(toks), x[0]
reord = utils.Reorderer(requests, _collate)
for context, request_args in tqdm(reord.get_reordered()):
stopping_criteria = request_args["stopping_criteria"]
max_generation_length = request_args["max_generation_length"]
num_fewshot = request_args["num_fewshot"]
for context, until in tqdm(reord.get_reordered()):
if isinstance(until, str):
until = [until]
assert isinstance(stopping_criteria, str) or stopping_criteria is None
assert (
isinstance(max_generation_length, int) or max_generation_length is None
)
assert isinstance(num_fewshot, int) or num_fewshot is None
primary_until, = self.tok_encode(until[0])
context_enc = torch.tensor([self.tok_encode(context)[self.max_gen_toks - self.max_length:]]).to(self.device)
if stopping_criteria is None:
until = [self.eot_token]
else:
until = [stopping_criteria]
primary_until = self.tok_encode(until[0])
if len(primary_until) == 0:
primary_until = torch.tensor([self.eot_token_id])
cont = self._model_generate(context_enc, context_enc.shape[1] + self.max_gen_toks, primary_until)
context_enc = torch.tensor(
[self.tok_encode(context)[self.max_gen_toks - self.max_length :]]
).to(self.device)
if max_generation_length is None:
max_length = self.max_gen_toks
else:
max_length = max_generation_length
cont = self._model_generate(
context_enc,
max_length,
torch.tensor(primary_until),
num_fewshot,
)
s = self.tok_decode(cont.tolist())
s = self.tok_decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
# partial caching
self.cache_hook.add_partial("greedy_until", (context, until), s)
res.append(s)
return reord.get_original(res)
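# Minimal sketch of the stop-sequence truncation performed above: the raw
# generation is cut at the first occurrence of any `until` string.
def truncate_at_stops(generation, until):
    for term in until:
        generation = generation.split(term)[0]
    return generation

# truncate_at_stops("Paris.\n###\nQ: next question", ["\n###\n"]) -> "Paris."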
......@@ -444,7 +383,7 @@ class Task(abc.ABC):
self._fewshot_docs = None
def download(self, data_dir=None, cache_dir=None, download_mode=None):
"""Downloads and returns the task dataset.
""" Downloads and returns the task dataset.
Override this method to download the dataset from a custom API.
:param data_dir: str
......@@ -473,7 +412,7 @@ class Task(abc.ABC):
name=self.DATASET_NAME,
data_dir=data_dir,
cache_dir=cache_dir,
download_mode=download_mode,
download_mode=download_mode
)
@abstractmethod
......@@ -538,25 +477,23 @@ class Task(abc.ABC):
pass
@abstractmethod
def construct_requests(self, doc, ctx, args):
"""Uses RequestFactory to construct Requests and returns an iterable of
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
:param args: dict
The specifics of the context, including number of few shots.
"""
pass
@abstractmethod
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
......@@ -570,7 +507,7 @@ class Task(abc.ABC):
def aggregation(self):
"""
:returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metric scores
"""
pass
......@@ -579,243 +516,22 @@ class Task(abc.ABC):
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
pass
def fewshot_description(self):
import warnings
warnings.warn(
"`fewshot_description` will be removed in futures versions. Pass "
"any custom descriptions to the `evaluate` function instead.",
DeprecationWarning,
)
DeprecationWarning)
return ""
class PromptSourceTask(Task):
"""These are the metrics from promptsource that we have
added default behavior for. If you want to add default behavior for a new metric,
update the functions below. If you want to use one of the following metrics,
*and* add additional custom processing, override `process_results`, `higher_is_better`, and `aggregation`.
"""
CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy"])
CONFIGURED_GENERATION_PS_METRICS = set(["BLEU", "ROUGE"])
SPLIT = None
def __init__(
self,
data_dir=None,
cache_dir=None,
download_mode=None,
prompt=None,
save_examples=True,
):
super().__init__(data_dir, cache_dir, download_mode)
self.prompt = prompt
self.save_examples = save_examples
def stopping_criteria(self) -> Optional[str]:
"""
Denote where the generation should end based on the few-shot example
separator: "\n###\n".
TODO: Handle other separators in the future.
"""
return "\n###\n"
def max_generation_length(self) -> Optional[int]:
"""Denote where the max length of the generation if it is obvious from the task."""
return None
def invalid_doc_for_prompt(self, doc) -> bool:
"""Some prompts may not work for some documents."""
if (
# generate_paraphrase for mrpc
# This generation prompt assumes a positive example. We filter out the negative examples.
# https://github.com/bigscience-workshop/promptsource/blob/ba8c9eccbe82f2409208c655896f1dd131171ece/promptsource/templates/glue/mrpc/templates.yaml#L7
# https://github.com/bigscience-workshop/promptsource/blob/ba8c9eccbe82f2409208c655896f1dd131171ece/promptsource/templates/glue/mrpc/templates.yaml#L88
(
self.prompt.id == "3b88d2c4-0aeb-4c6d-9ccc-653a388250a5"
or self.prompt.id == "d830d7a5-abc0-4275-ac62-974e0088876f"
)
and doc["label"] == 0
):
return True
return False
def doc_to_target(self, doc) -> str:
"""NOTE: In the future, this may return Union[str, List[str]]."""
_, target = self.prompt.apply(doc)
return f" {target}"
def doc_to_text(self, doc) -> str:
text, _ = self.prompt.apply(doc)
return text
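# Hedged usage sketch of the promptsource calls above; the dataset, template
# and document below are illustrative, and apply() is assumed to return the
# rendered input followed by the target.
from promptsource.templates import DatasetTemplates

templates = DatasetTemplates("glue/mrpc")
prompt = templates[templates.all_template_names[0]]
doc = {"sentence1": "A storm hit.", "sentence2": "A storm struck.", "label": 1, "idx": 0}
applied = prompt.apply(doc)             # typically [input_text, target_text]
text, target = applied[0], applied[-1]
print(text)
print(" " + target)                     # doc_to_target prepends the leading space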
def construct_requests(self, doc, ctx, args):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
:param args: dict
The specifics of the context, including number of few shots.
"""
_requests = []
answer_choices_list = self.prompt.get_answer_choices_list(doc)
if answer_choices_list:
# If answer_choices_list, then this is a ranked choice prompt.
for answer_choice in answer_choices_list:
ll_answer_choice, _ = rf.loglikelihood(ctx, f" {answer_choice}")
_requests.append(ll_answer_choice)
else:
# If not, then this is a generation prompt.
# NOTE: In the future, target will be a list of strings.
request_args = {
"stopping_criteria": self.stopping_criteria(),
"max_generation_length": self.max_generation_length(),
"num_fewshot": args["num_fewshot"],
}
cont_request = rf.greedy_until(ctx, request_args)
_requests.append(cont_request)
return _requests
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
target = self.doc_to_target(doc).strip()
answer_choices_list = self.prompt.get_answer_choices_list(doc)
if answer_choices_list:
# If answer_choices_list, then this is a ranked choice prompt.
# NOTE: In the future, target will be a list of strings.
# For now, we can assume there will be only 1 target, but it's possible
# that this is not the case, so we should check for that.
pred = answer_choices_list[np.argmax(results)]
out = {}
for metric in self.prompt.metadata.metrics:
assert (
metric in self.CONFIGURED_RANKED_CHOICE_PS_METRICS
), "Unexpected metric. Add it, or use a task-specific solution."
if metric == "Accuracy":
out["acc"] = pred == target
# TODO: Add metrics here.
else:
# If not, then this is a generation prompt.
# NOTE: In the future, target will be a list of strings.
pred = results[0].strip()
out = {}
for metric in self.prompt.metadata.metrics:
assert (
metric in self.CONFIGURED_GENERATION_PS_METRICS
), "Unexpected metric. Add it, or use a task-specific solution."
if metric == "BLEU":
out["bleu"] = (target, pred)
elif metric == "ROUGE":
# TODO: This computes all rouge sub-metrics. Find a generic
# way to handle user specified rouge sub-metrics to avoid extra
# compute.
rouge_scores = metrics.rouge(target, pred)
# Flatten rouge score dict.
rouge_scores = utils.flatten(rouge_scores)
# Merge all the rouge-type scores into the `out` dict.
out = {**out, **rouge_scores}
# TODO: Wrap process_results so that overriding implementations do not
# override the example-saving behavior.
if self.save_examples:
example = {
"pred": pred,
"target": target,
"answer_choices_list": answer_choices_list,
}
return out, example
return out
def higher_is_better(self):
out = {}
for metric in self.prompt.metadata.metrics:
if metric == "Accuracy":
out["acc"] = True
if metric == "BLEU":
out["bleu"] = True
if metric == "ROUGE":
# TODO: Find a generic way to handle user specified rouge metrics.
out["rouge1_precision"] = True
out["rouge1_recall"] = True
out["rouge1_fmeasure"] = True
out["rouge2_precision"] = True
out["rouge2_recall"] = True
out["rouge2_fmeasure"] = True
out["rougeL_precision"] = True
out["rougeL_recall"] = True
out["rougeL_fmeasure"] = True
out["rougeLsum_precision"] = True
out["rougeLsum_recall"] = True
out["rougeLsum_fmeasure"] = True
return out
def aggregation(self):
out = {}
for metric in self.prompt.metadata.metrics:
if metric == "Accuracy":
out["acc"] = mean
if metric == "BLEU":
out["bleu"] = metrics.bleu
if metric == "ROUGE":
# TODO: Find a generic way to handle user specified rouge metrics.
out["rouge1_precision"] = mean
out["rouge1_recall"] = mean
out["rouge1_fmeasure"] = mean
out["rouge2_precision"] = mean
out["rouge2_recall"] = mean
out["rouge2_fmeasure"] = mean
out["rougeL_precision"] = mean
out["rougeL_recall"] = mean
out["rougeL_fmeasure"] = mean
out["rougeLsum_precision"] = mean
out["rougeLsum_recall"] = mean
out["rougeLsum_fmeasure"] = mean
return out
def fewshot_examples(self, k, rnd):
if self._training_docs is None:
self._training_docs = list(self.training_docs())
return self._get_fewshot_examples(self._training_docs, k, rnd)
def _get_fewshot_examples(self, docs, k, rnd):
fewshot_idx = rnd.sample(list(np.arange(len(docs))), k)
return [docs[idx] for idx in fewshot_idx], [int(idx) for idx in fewshot_idx]
@utils.positional_deprecated
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
"""Returns a fewshot context string that is made up of a prepended description
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
""" Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
:param doc: str
......@@ -832,9 +548,7 @@ class PromptSourceTask(Task):
:returns: str
The fewshot context.
"""
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`"
assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
......@@ -842,93 +556,44 @@ class PromptSourceTask(Task):
)
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
description = description + "\n\n" if description else ""
if num_fewshot == 0:
labeled_examples = ""
fewshotex, fewshotidx, self.fewshotsource = [], [], None
else:
# for sets with no training docs, draw from other set *but ensure no overlap with current doc*
if self.has_training_docs():
fewshotex, fewshotidx = self.fewshot_examples(k=num_fewshot, rnd=rnd)
self.fewshotsource = "train"
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(
self.validation_docs()
if self.has_validation_docs()
else self.test_docs()
self.validation_docs() if self.has_validation_docs() else self.test_docs()
)
if self.has_validation_docs():
self.fewshotsource = "val"
elif self.test_docs():
self.fewshotsource = "test"
fewshotex, fewshotidx = self._get_fewshot_examples(
self._fewshot_docs, k=num_fewshot + 1, rnd=rnd
)
fewshotex, fewshotidx = zip(*[
(shot, idx)
for shot, idx in zip(fewshotex, fewshotidx)
if shot != doc
])
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
fewshotex, fewshotidx = (
fewshotex[:num_fewshot],
fewshotidx[:num_fewshot],
)
# See Webson & Pavlick (2022) https://arxiv.org/pdf/2109.01247.pdf
# for justification of this separator.
example_separator = "\n###\n"
labeled_examples = (
example_separator.join(
[
self.doc_to_text(doc) + self.doc_to_target(doc)
for doc in fewshotex
]
)
+ example_separator
)
fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
example = self.doc_to_text(doc)
ctx = description + labeled_examples + example
return (
ctx,
{
"fewshot_idx": fewshotidx,
"fewshot_source": self.fewshotsource,
"fewshot_num": num_fewshot,
"ctx": ctx,
},
)
labeled_examples = "\n\n".join(
[self.doc_to_text(doc) + self.doc_to_target(doc) for doc in fewshotex]
) + "\n\n"
def get_logging_info(self):
return {
"fixed_answer_choice_list": self.prompt.get_fixed_answer_choices_list(),
"dataset_path": self.DATASET_PATH,
"dataset_name": self.DATASET_NAME,
"subset": self.SPLIT,
"prompt_name": self.prompt.get_name(),
"prompt_id": self.prompt.get_id(),
"prompt_jinja": self.prompt.jinja,
"prompt_original_task": self.prompt.metadata.original_task,
# Placeholder for comment in post-processing.
"comment": "",
}
example = self.doc_to_text(doc)
return description + labeled_examples + example
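# Minimal sketch of the context assembly above, assuming doc_to_text /
# doc_to_target return plain strings. PromptSourceTask joins examples with
# "\n###\n"; the plain Task shown here uses "\n\n".
def build_context(description, shots, query_text, sep="\n###\n"):
    labeled = sep.join(text + target for text, target in shots)
    return description + (labeled + sep if shots else "") + query_text

# build_context("", [("Q: 2+2?", " 4")], "Q: 5+5?")
# -> "Q: 2+2? 4\n###\nQ: 5+5?"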
class MultipleChoiceTask(Task):
def doc_to_target(self, doc):
return " " + doc["choices"][doc["gold"]]
return " " + doc['choices'][doc['gold']]
def construct_requests(self, doc, ctx):
lls = [
rf.loglikelihood(ctx, " {}".format(choice))[0] for choice in doc["choices"]
rf.loglikelihood(ctx, " {}".format(choice))[0]
for choice in doc['choices']
]
return lls
......@@ -936,21 +601,21 @@ class MultipleChoiceTask(Task):
def process_results(self, doc, results):
gold = doc["gold"]
acc = 1.0 if np.argmax(results) == gold else 0.0
acc = 1. if np.argmax(results) == gold else 0.
completion_len = np.array([float(len(i)) for i in doc["choices"]])
acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.
return {
"acc": acc,
"acc_norm": acc_norm,
}
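# Worked example of the acc / acc_norm computation above: acc_norm divides
# each choice's loglikelihood by its character length before the argmax.
import numpy as np

results = np.array([-9.1, -8.7, -12.0])          # loglikelihood per choice
choices = ["yes", "no", "maybe not"]
gold = 1
completion_len = np.array([float(len(c)) for c in choices])
acc = 1.0 if np.argmax(results) == gold else 0.0                        # -> 1.0
acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0  # -> 0.0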
def higher_is_better(self):
return {
"acc": True,
"acc_norm": True,
}
def aggregation(self):
return {
"acc": mean,
......@@ -959,6 +624,7 @@ class MultipleChoiceTask(Task):
class PerplexityTask(Task, abc.ABC):
def has_training_docs(self):
return False
......@@ -966,15 +632,9 @@ class PerplexityTask(Task, abc.ABC):
assert k == 0
return []
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
assert (
num_fewshot == 0
), "The number of fewshot examples must be 0 for perplexity tasks."
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`."
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0, "The number of fewshot examples must be 0 for perplexity tasks."
assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`."
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
......@@ -982,9 +642,7 @@ class PerplexityTask(Task, abc.ABC):
)
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
return ""
......@@ -1007,7 +665,7 @@ class PerplexityTask(Task, abc.ABC):
return req
def process_results(self, doc, results):
(loglikelihood,) = results
loglikelihood, = results
words = self.count_words(doc)
bytes_ = self.count_bytes(doc)
return {
......@@ -1029,23 +687,23 @@ class PerplexityTask(Task, abc.ABC):
@classmethod
def count_words(cls, doc):
"""Downstream tasks with custom word boundaries should override this!"""
""" Downstream tasks with custom word boundaries should override this! """
return len(re.split(r"\s+", doc))
def hash_args(attr, args):
dat = json.dumps([attr] + list(args))
return hashlib.sha256(dat.encode("utf-8")).hexdigest()
return hashlib.sha256(dat.encode('utf-8')).hexdigest()
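# Usage sketch for hash_args above: the cache key is the SHA-256 hex digest of
# the JSON-serialized (attribute, arguments) pair, so identical requests map
# to the same database key.
key = hash_args("loglikelihood", ("The capital of France is", " Paris"))
# key is a 64-character hex string; its exact value depends on the input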
class CacheHook:
def __init__(self, cachinglm):
if cachinglm is None:
self.dbdict = None
return
self.dbdict = cachinglm.dbdict
def add_partial(self, attr, req, res):
if self.dbdict is None:
return
......@@ -1075,7 +733,7 @@ class CachingLM:
def fn(requests):
res = []
remaining_reqs = []
# figure out which ones are cached and which ones are new
for req in requests:
hsh = hash_args(attr, req)
......@@ -1088,7 +746,7 @@ class CachingLM:
else:
res.append(None)
remaining_reqs.append(req)
# actually run the LM on the requests that do not have cached results
rem_res = getattr(self.lm, attr)(remaining_reqs)
......@@ -1106,48 +764,41 @@ class CachingLM:
self.dbdict.commit()
return res
return fn
def get_cache_hook(self):
return CacheHook(self)
REQUEST_RETURN_LENGTHS = {
"loglikelihood": 2,
"greedy_until": None,
"loglikelihood_rolling": None,
'loglikelihood': 2,
'greedy_until': None,
'loglikelihood_rolling': None,
}
class Request:
def __init__(self, request_type, args, index=None):
if request_type not in REQUEST_RETURN_LENGTHS.keys():
raise NotImplementedError(
"The request type {} is not implemented!".format(request_type)
)
raise NotImplementedError('The request type {} is not implemented!'.format(request_type))
self.request_type = request_type
self.args = args
self.index = index
def __iter__(self):
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError("This request type does not return multiple arguments!")
raise IndexError('This request type does not return multiple arguments!')
for i in range(REQUEST_RETURN_LENGTHS[self.request_type]):
yield Request(self.request_type, self.args, i)
def __getitem__(self, i):
if REQUEST_RETURN_LENGTHS[self.request_type] is None:
raise IndexError("This request type does not return multiple arguments!")
raise IndexError('This request type does not return multiple arguments!')
return Request(self.request_type, self.args, i)
def __eq__(self, other):
return (
self.request_type == other.request_type
and self.args == other.args
and self.index == other.index
)
return self.request_type == other.request_type and self.args == other.args and self.index == other.index
def __repr__(self):
return f"Req_{self.request_type}{self.args}[{self.index}]\n"
......@@ -1157,7 +808,6 @@ class RequestFactory:
def __getattr__(self, attr):
def fn(*args):
return Request(attr, args)
return fn
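# Usage sketch of RequestFactory / Request above: attribute access builds a
# Request of that type, and unpacking yields one sub-request per return value
# (loglikelihood returns two: the logprob and the is-greedy flag).
rf = RequestFactory()
ll_req, greedy_req = rf.loglikelihood("The capital of France is", " Paris")
# ll_req.index == 0, greedy_req.index == 1; both share the same args.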
......
......@@ -2,38 +2,25 @@ import collections
import itertools
import pathlib
import random
import lm_eval.metrics
import lm_eval.models
import lm_eval.tasks
import lm_eval.base
import promptsource
import numpy as np
from promptsource.templates import DatasetTemplates
from lm_eval.utils import positional_deprecated, run_task_tests
@positional_deprecated
def simple_evaluate(
model,
model_args=None,
tasks=[],
num_fewshot=0,
batch_size=None,
device=None,
no_cache=False,
limit=None,
bootstrap_iters=100000,
description_dict=None,
check_integrity=False,
):
def simple_evaluate(model, model_args=None, tasks=[],
num_fewshot=0, batch_size=None, device=None,
no_cache=False, limit=None, bootstrap_iters=100000,
description_dict=None, check_integrity=False):
"""Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM]
Name of model or LM object, see lm_eval.models.get_model
:param model_args: Optional[str]
String arguments for each model class, see LM.create_from_arg_string.
Ignored if `model` argument is a LM object.
:param tasks: list[Union[str, Task]]
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
......@@ -50,7 +37,7 @@ def simple_evaluate(
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:return
......@@ -62,28 +49,20 @@ def simple_evaluate(
assert tasks != [], "No tasks specified"
if isinstance(model, str):
if model_args is None:
model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(
model_args, {"batch_size": batch_size, "device": device}
)
if model_args is None: model_args = ""
lm = lm_eval.models.get_model(model).create_from_arg_string(model_args, {
'batch_size': batch_size, 'device': device
})
else:
assert isinstance(model, lm_eval.base.LM)
lm = model
# TODO: Hard-code turning off cache while testing. Remove once testing is completed.
no_cache = True
if not no_cache:
lm = lm_eval.base.CachingLM(
lm,
"lm_cache/"
+ model
+ "_"
+ model_args.replace("=", "-").replace(",", "_").replace("/", "-")
+ ".db",
lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db'
)
task_dict = lm_eval.tasks.get_task_dict_promptsource(tasks)
task_dict = lm_eval.tasks.get_task_dict(tasks)
if check_integrity:
run_task_tests(task_list=tasks)
......@@ -93,7 +72,7 @@ def simple_evaluate(
task_dict=task_dict,
num_fewshot=num_fewshot,
limit=limit,
description_dict=description_dict,
description_dict=description_dict
)
# add info about the model and few shot config
......@@ -106,22 +85,14 @@ def simple_evaluate(
"no_cache": no_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"description_dict": description_dict,
"description_dict": description_dict
}
return results
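# Hedged usage sketch of simple_evaluate above; the model string, task name
# and argument values are illustrative and depend on what is registered in
# this branch of the harness.
from lm_eval.evaluator import make_table, simple_evaluate

results = simple_evaluate(
    model="gpt2",
    model_args="pretrained=gpt2",
    tasks=["lambada"],
    num_fewshot=0,
    batch_size=1,
    device="cpu",
    limit=8,              # evaluate only a few documents as a smoke test
)
print(make_table(results))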
@positional_deprecated
def evaluate(
lm,
task_dict,
provide_description=None,
num_fewshot=0,
limit=None,
bootstrap_iters=100000,
description_dict=None,
):
def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None, bootstrap_iters=100000, description_dict=None):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
......@@ -137,7 +108,7 @@ def evaluate(
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param description_dict: dict[str, str]
Dictionary of custom task descriptions of the form: `task_name: description`
Dictionary of custom task descriptions of the form: `task_name: description`
:return
Dictionary of results
"""
......@@ -147,14 +118,12 @@ def evaluate(
assert not provide_description # not implemented.
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
task_dict_items = [
(name, task)
for name, task in task_dict.items()
if (task.has_validation_docs() or task.has_test_docs())
if(task.has_validation_docs() or task.has_test_docs())
]
results = collections.defaultdict(dict)
......@@ -172,8 +141,8 @@ def evaluate(
docs = {}
# get lists of each type of request
for task_prompt_name, task in task_dict_items:
versions[task_prompt_name] = task.VERSION
for task_name, task in task_dict_items:
versions[task_name] = task.VERSION
# default to test doc, fall back to val doc if validation unavailable
# TODO: the test-fallback-to-val system isn't final, we should revisit it at some point
if task.has_test_docs():
......@@ -184,39 +153,29 @@ def evaluate(
raise RuntimeError("Task has neither test_docs nor validation_docs")
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
task_docs = list(enumerate(list(task_doc_func())))
task_docs = list(task_doc_func())
rnd = random.Random()
rnd.seed(42)
rnd.shuffle(task_docs)
description = (
description_dict[task_prompt_name]
if description_dict and task_prompt_name in description_dict
else ""
)
for doc_id, (original_doc_id, doc) in enumerate(
itertools.islice(task_docs, 0, limit)
):
if task.invalid_doc_for_prompt(doc):
continue
description = description_dict[task_name] if description_dict and task_name in description_dict else ""
docs[(task_prompt_name, doc_id)] = doc
ctx, fewshotex_logging_info = task.fewshot_context(
doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description
for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)):
docs[(task_name, doc_id)] = doc
ctx = task.fewshot_context(
doc=doc,
num_fewshot=num_fewshot,
rnd=rnd,
description=description
)
fewshotex_logging_info["doc_id"] = original_doc_id
args = {"num_fewshot": num_fewshot}
reqs = task.construct_requests(doc, ctx, args)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)):
reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.request_type].append(req)
# i: index in requests for a single task instance
# doc_id: unique id that we can get back to a doc using `docs`
requests_origin[req.request_type].append(
(i, task_prompt_name, doc, doc_id, fewshotex_logging_info)
)
requests_origin[req.request_type].append((i, task_name, doc, doc_id))
# all responses for each (task, doc)
process_res_queue = collections.defaultdict(list)
......@@ -230,82 +189,42 @@ def evaluate(
print("Running", reqtype, "requests")
resps = getattr(lm, reqtype)([req.args for req in reqs])
resps = [
x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
]
for resp, (i, task_prompt_name, doc, doc_id, fewshotex_logging_info) in zip(
resps, requests_origin[reqtype]
):
process_res_queue[(task_prompt_name, doc_id)].append(
(i, resp, fewshotex_logging_info)
)
resps = [x if req.index is None else x[req.index] for x, req in zip(resps, reqs)]
for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
process_res_queue[(task_name, doc_id)].append((i, resp))
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
examples = []
for (task_prompt_name, doc_id), per_doc_requests in process_res_queue.items():
per_doc_requests.sort(key=lambda x: x[0])
per_doc_results = [x[1] for x in per_doc_requests]
fewshot_logging_info = [x[2] for x in per_doc_requests][0]
task = task_dict[task_prompt_name]
doc = docs[(task_prompt_name, doc_id)]
output = task.process_results(doc, per_doc_results)
if task.save_examples:
metrics, example = output
example.update(fewshot_logging_info)
example.update(task.get_logging_info())
examples.append(example)
else:
metrics = output
example = fewshot_logging_info
example.update(task.get_logging_info())
examples.append(example)
for (task_name, doc_id), requests in process_res_queue.items():
requests.sort(key=lambda x: x[0])
requests = [x[1] for x in requests]
for metric, value in metrics.items():
vals[(task_prompt_name, metric)].append(value)
task = task_dict[task_name]
doc = docs[(task_name, doc_id)]
metrics = task.process_results(doc, requests)
for metric, value in metrics.items():
vals[(task_name, metric)].append(value)
# aggregate results
metric_results = []
for (task_prompt_name, metric), items in vals.items():
task_name, prompt_name = task_prompt_name.split("+")
results[task_prompt_name]["task_name"] = task_name
results[task_prompt_name]["prompt_name"] = prompt_name
task = task_dict[task_prompt_name]
results[task_prompt_name][metric] = task.aggregation()[metric](items)
_metric_results = {
"task_name": task_name,
"prompt_name": prompt_name,
metric: task.aggregation()[metric](items),
**task.get_logging_info(),
}
for (task_name, metric), items in vals.items():
task = task_dict[task_name]
results[task_name][metric] = task.aggregation()[metric](items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
stderr = lm_eval.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters,
)
if stderr is not None:
results[task_prompt_name][metric + "_stderr"] = stderr(items)
_metric_results[metric + "_stderr"] = stderr(items)
metric_results.append(_metric_results)
results[task_name][metric + "_stderr"] = stderr(items)
return {
# List of results that tracks the averages per model and prompt.
"results": metric_results,
"versions": dict(versions),
# List of all prompt x doc examples with additional information in it.
"examples": examples,
# Original results used for generating the table when running this file.
"table_results": dict(results),
"results": dict(results),
"versions": dict(versions)
}
......@@ -315,50 +234,22 @@ def make_table(result_dict):
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Prompt", "Version", "Metric", "Value", "", "Stderr"]
latex_writer.headers = [
"Task",
"Prompt",
"Version",
"Metric",
"Value",
"",
"Stderr",
]
md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
values = []
for k, dic in result_dict["table_results"].items():
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
for m, v in dic.items():
if m.endswith("_stderr"):
continue
if "_name" in m:
continue
if m + "_stderr" in dic:
se = dic[m + "_stderr"]
values.append(
[
dic["task_name"],
dic["prompt_name"],
version,
m,
"%.4f" % v,
"±",
"%.4f" % se,
]
)
values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
else:
values.append(
[
dic["task_name"],
dic["prompt_name"],
version,
m,
"%.4f" % v,
"",
"",
]
)
values.append([k, version, m, '%.4f' % v, '', ''])
k = ""
version = ""
md_writer.value_matrix = values
......
import typing
import math
from collections.abc import Iterable
import numpy as np
import sacrebleu
from rouge_score import rouge_scorer
import sklearn.metrics
import random
......@@ -186,74 +184,6 @@ def _sacreformat(refs, preds):
return refs, preds
def rouge(
refs: typing.List[str],
pred: str,
rouge_types: typing.List[str] = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
):
""" ROUGE with multi-reference support
Implementation based on GEM-metrics:
https://github.com/GEM-benchmark/GEM-metrics/blob/431a8174bd6b3637e8d6118bfad2983e39e99733/gem_metrics/rouge.py
:param refs:
A `list` of reference `str`s.
:param pred:
A single prediction `str`.
"""
# Add newlines between sentences to correctly compute `rougeLsum`.
if "rougeLsum" in rouge_types:
# TODO: Adapt this to handle languages that do not mark sentence endings with `.`.
# See GEM-metrics implementation with lang specific `nltk` tokenizers to
# split sentences.
pred = pred.replace(".", ".\n")
refs = [ref.replace(".", ".\n") for ref in refs]
scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
# ROUGE multi-ref jackknifing
if len(refs) > 1:
cur_scores = [scorer.score(ref, pred) for ref in refs]
# get best score for all leave-one-out sets
best_scores = []
for leave in range(len(refs)):
cur_scores_leave_one = [
cur_scores[s] for s in range(len(refs)) if s != leave
]
best_scores.append(
{
rouge_type: max(
[s[rouge_type] for s in cur_scores_leave_one],
key=lambda s: s.fmeasure,
)
for rouge_type in rouge_types
}
)
# average the leave-one-out bests to produce the final score
score = {
rouge_type: rouge_scorer.scoring.Score(
np.mean([b[rouge_type].precision for b in best_scores]),
np.mean([b[rouge_type].recall for b in best_scores]),
np.mean([b[rouge_type].fmeasure for b in best_scores]),
)
for rouge_type in rouge_types
}
else:
score = scorer.score(refs[0], pred)
# convert the named tuples to plain nested dicts
score = {
rouge_type: {
"precision": score[rouge_type].precision,
"recall": score[rouge_type].recall,
"fmeasure": score[rouge_type].fmeasure,
}
for rouge_type in rouge_types
}
return score
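# Usage sketch for the multi-reference rouge() above: pass a list of reference
# strings and one prediction; the result is a nested dict of precision /
# recall / fmeasure per rouge type.
scores = rouge(
    refs=["The cat sat on the mat.", "A cat was sitting on the mat."],
    pred="The cat is sitting on the mat.",
)
# e.g. scores["rougeL"]["fmeasure"] is a float in [0, 1]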
# stderr stuff
class _bootstrap_internal:
......
from . import gpt2
from . import gptj
from . import gpt3
from . import t5
from . import t0
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gptj": gptj.GPTJLM,
"gpt3": gpt3.GPT3LM,
"t5": t5.T5LM,
"mt5": t5.T5LM,
"t0": t0.T0LM,
"dummy": dummy.DummyLM,
}
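# Usage sketch of the registry above, assuming the module also exposes the
# get_model() lookup referenced by simple_evaluate earlier in this file set.
from lm_eval import models

lm_class = models.MODEL_REGISTRY["gpt2"]   # -> gpt2.GPT2LM
lm = lm_class.create_from_arg_string(
    "pretrained=gpt2", {"batch_size": 1, "device": "cpu"}
)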
......
......@@ -4,16 +4,8 @@ from lm_eval.base import BaseLM
class HFLM(BaseLM):
def __init__(
self,
device="cuda",
pretrained="gpt2",
revision="main",
subfolder=None,
tokenizer=None,
batch_size=1,
parallelize=False
):
def __init__(self, device='cuda', pretrained='gpt2', revision='main', subfolder=None, tokenizer=None, batch_size=1):
super().__init__()
assert isinstance(device, str)
......@@ -23,61 +15,36 @@ class HFLM(BaseLM):
if device:
self._device = torch.device(device)
else:
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# TODO: update this to be less of a hack once subfolder is fixed in HF
self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained(
pretrained,
revision=revision + ("/" + subfolder if subfolder is not None else ""),
)
pretrained, revision=revision + ("/" + subfolder if subfolder is not None else "")
).to(self.device)
self.gpt2.eval()
# pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
subfolder=subfolder,
)
pretrained if tokenizer is None else tokenizer, revision=revision, subfolder=subfolder)
assert isinstance(
self.tokenizer,
(
transformers.GPT2Tokenizer,
transformers.GPT2TokenizerFast,
transformers.T5Tokenizer,
transformers.T5TokenizerFast,
),
), "this tokenizer has not been checked for compatibility yet!"
assert isinstance(self.tokenizer, (
transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast,
transformers.T5Tokenizer, transformers.T5TokenizerFast,
)), "this tokenizer has not been checked for compatibility yet!"
self.vocab_size = self.tokenizer.vocab_size
if isinstance(
self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
):
assert self.tokenizer.encode("hello\n\nhello") == [
31373,
198,
198,
31373,
], self.tokenizer.encode("hello\n\nhello")
if isinstance(self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)):
assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373], \
self.tokenizer.encode('hello\n\nhello')
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
if parallelize:
self.gpt2.parallelize()
self._device = torch.device('cuda:0')
else:
self.gpt2.to(self._device)
@property
def eot_token(self):
return self.tokenizer.eos_token
# gpus = torch.cuda.device_count()
# if gpus > 1:
# self.gpt2 = nn.DataParallel(self.gpt2)
@property
def eot_token_id(self):
......@@ -108,7 +75,7 @@ class HFLM(BaseLM):
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
......@@ -122,53 +89,15 @@ class HFLM(BaseLM):
"""
with torch.no_grad():
return self.gpt2(inps)[0][:, :, :50257]
def _model_generate(self, context, max_length, eos_token_id):
return self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=eos_token_id,
do_sample=False
)
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
max_length = max_length + context.size(1)
if num_fewshot == 0:
generations = self.gpt2.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.gpt2.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
# Remove the context from the generations
return generations[0, context.shape[1] :]
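# Hedged, self-contained sketch of the multi-token stopping criterion used
# above (model and stop string are illustrative; loading gpt2 needs network):
import torch
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
stop_str = "\n###\n"
stop_len = len(tok.encode(stop_str)) + 1

class StopOnSequence(transformers.StoppingCriteria):
    def __call__(self, input_ids, scores, **kwargs):
        # stop once the decoded tail of the sequence contains the stop string
        return stop_str in tok.decode(input_ids[0, -stop_len:])

context = torch.tensor([tok.encode("Q: What is 2 + 2?\nA:")])
out = model.generate(
    context,
    max_length=context.shape[1] + 32,
    stopping_criteria=transformers.StoppingCriteriaList([StopOnSequence()]),
    do_sample=False,
)
print(tok.decode(out[0, context.shape[1]:]))   # continuation only, as above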
# for backwards compatibility
GPT2LM = HFLM
import transformers
import torch
from lm_eval.base import BaseLM
class GPTJLM(BaseLM):
def __init__(
self,
device="cuda",
batch_size=1,
parallelize=False,
):
super().__init__()
assert isinstance(device, str)
assert isinstance(batch_size, int)
if device:
self._device = torch.device(device)
else:
self._device = (
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
)
pretrained = "EleutherAI/gpt-j-6B"
self.gptj = transformers.AutoModelForCausalLM.from_pretrained(pretrained).to(self.device)
self.gptj.eval()
# pretrained tokenizer for neo is broken for now so just hard-coding this to gptj
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
self.vocab_size = self.tokenizer.vocab_size
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
# TODO: fix multi-gpu
if parallelize:
self.gptj.parallelize()
self._device = torch.device('cuda:0')
else:
self.gptj.to(self._device)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
try:
return self.gptj.config.n_ctx
except AttributeError:
# gptneoconfig doesn't have n_ctx apparently
return self.gptj.config.max_position_embeddings
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self.batch_size_per_gpu # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inps):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.gptj(inps)[0][:, :, :50257]
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
max_length = max_length + context.size(1)
if num_fewshot == 0:
generations = self.gptj.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.gptj.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
# Remove the context from the generations
return generations[0, context.shape[1] :]
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
import math
class T0LM(BaseLM):
# MAX_GEN_TOKS = 256
# MAX_INP_LENGTH = 512
# VOCAB_SIZE = 32100
# EOT_TOKEN_ID = 1
def __init__(self, device='cuda', parallelize=False, pretrained='t0', batch_size=1):
super().__init__()
if device:
self._device = torch.device(device)
else:
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.t0 = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
self.t0.eval()
if parallelize == "True":
self.t0.parallelize()
self._device = torch.device('cuda:0')
else:
self.t0.to(self._device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
# self.max_length = self.MAX_INP_LENGTH
self.batch_size = int(batch_size)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
return self.tokenizer.model_max_length
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self._batch_size # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inputs_tok, targets_tok):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.t0(
**inputs_tok,
labels=targets_tok["input_ids"]
)
def loglikelihood(self, requests):
res = []
for chunk in tqdm(utils.chunks(requests, self.batch_size), total=math.ceil(len(requests)/self.batch_size)):
inputs, targets = zip(*chunk)
# Fill in empty encoder inputs with eos_token
inputs = (
f"{self.eot_token}"
if len(input_) == 0
else input_
for input_ in inputs
)
inputs_tok = self.tokenizer(
list(inputs),
max_length=self.max_length,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in inputs_tok:
inputs_tok[key] = inputs_tok[key][:, -(self.max_length - 1) :]
targets_tok = self.tokenizer(
list(targets),
max_length=self.max_gen_toks,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in targets_tok:
targets_tok[key] = targets_tok[key][:, -(self.max_length - 1) :]
outputs = self._model_call(inputs_tok, targets_tok)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
chunk,
log_softmaxes,
targets_tok["input_ids"],
targets_tok["attention_mask"],
)
for cache_key, log_softmax, target_tok, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tok = target_tok[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tok).all()
target_logits = torch.gather(
log_softmax, 1, target_tok.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return res
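# Self-contained sketch of the encoder-decoder scoring used above: the target
# tokens are passed as `labels`, and the returned logits are scored token by
# token (model name is illustrative; loading it needs network access).
import torch
import torch.nn.functional as F
import transformers

tok = transformers.AutoTokenizer.from_pretrained("t5-small")
model = transformers.AutoModelForSeq2SeqLM.from_pretrained("t5-small")

inputs = tok(["translate English to German: Hello"], return_tensors="pt")
targets = tok(["Hallo"], return_tensors="pt", add_special_tokens=False)
with torch.no_grad():
    out = model(**inputs, labels=targets["input_ids"])
logprobs = F.log_softmax(out.logits, dim=-1)              # [batch, tgt_len, vocab]
tok_lp = torch.gather(logprobs, 2, targets["input_ids"].unsqueeze(-1)).squeeze(-1)
print(float(tok_lp.sum()))                                # summed target loglikelihood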
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
if num_fewshot == 0:
generations = self.t0.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.t0.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations[0]
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import numpy as np
import math
class T5LM(BaseLM):
# MAX_GEN_TOKS = 256
# MAX_INP_LENGTH = 512
# VOCAB_SIZE = 32128
# EOT_TOKEN_ID = 1
def __init__(
self,
device='cuda',
parallelize=False,
pretrained='t5',
batch_size=1
):
super().__init__()
if device:
self._device = torch.device(device)
else:
self._device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
self.t5 = transformers.AutoModelForSeq2SeqLM.from_pretrained(pretrained)
self.t5.eval()
if parallelize == "True":
self.t5.parallelize()
self._device = torch.device('cuda:0')
else:
self.t5.to(self._device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained)
# self.max_length = self.MAX_INP_LENGTH
self._batch_size = int(batch_size)
@classmethod
def create_from_arg_string(cls, arg_string, additional_config={}):
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
return cls(**args, **args2)
@property
def eot_token(self):
return self.tokenizer.eos_token
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
return self.tokenizer.model_max_length
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# TODO: fix multi-gpu
return self._batch_size # * gpus
@property
def device(self):
# TODO: fix multi-gpu
return self._device
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
def _model_call(self, inputs_tok, targets_tok):
"""
inps: a torch tensor of shape [batch, sequence]
the size of sequence may vary from call to call
returns: a torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model
"""
with torch.no_grad():
return self.t5(
**inputs_tok,
labels=targets_tok["input_ids"]
)
def loglikelihood(self, requests):
res = []
for chunk in tqdm(utils.chunks(requests, self.batch_size), total=math.ceil(len(requests)/self.batch_size)):
inputs, targets = zip(*chunk)
# Fill in empty encoder inputs with eos_token
inputs = (
f"{self.eot_token}"
if len(input_) == 0
else input_
for input_ in inputs
)
inputs_tok = self.tokenizer(
list(inputs),
max_length=self.max_length,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in inputs_tok:
inputs_tok[key] = inputs_tok[key][:, -(self.max_length - 1) :]
targets_tok = self.tokenizer(
list(targets),
max_length=self.max_gen_toks,
padding=True,
# truncation=True,
add_special_tokens=False,
return_tensors="pt"
).to(self.device)
for key in targets_tok:
targets_tok[key] = targets_tok[key][:, -(self.max_length - 1) :]
outputs = self._model_call(inputs_tok, targets_tok)
log_softmaxes = F.log_softmax(outputs.logits, dim=-1)
output_iterator = zip(
chunk,
log_softmaxes,
targets_tok["input_ids"],
targets_tok["attention_mask"],
)
for cache_key, log_softmax, target_tok, target_mask in output_iterator:
length = target_mask.sum()
log_softmax = log_softmax[:length]
target_tok = target_tok[:length]
greedy_tokens = log_softmax.argmax(dim=-1)
max_equal = (greedy_tokens == target_tok).all()
target_logits = torch.gather(
log_softmax, 1, target_tok.unsqueeze(-1)
).squeeze(-1)
answer = (float(target_logits.sum()), bool(max_equal))
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
res.append(answer)
return res
def _get_stopping_criteria(self, stopping_criteria_ids):
class MultitokenEOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_seq_id: torch.LongTensor, tokenizer):
self.eos_seq = tokenizer.decode(eos_seq_id)
self.eos_seq_id = eos_seq_id
self.eos_seq_len = len(eos_seq_id) + 1
self.tokenizer = tokenizer
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
last_token_id = input_ids[0, -self.eos_seq_len:]
last_tokens = self.tokenizer.decode(last_token_id)
is_stopped = self.eos_seq in last_tokens
return is_stopped
class EOSCriteria(transformers.StoppingCriteria):
def __init__(self, eos_token_id: torch.LongTensor):
self.eos_token_id = eos_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
return input_ids[0,-1] == self.eos_token_id
return transformers.StoppingCriteriaList([
MultitokenEOSCriteria(stopping_criteria_ids, self.tokenizer),
EOSCriteria(self.tokenizer.eos_token_id)
])
def _model_generate(self, context, max_length, stopping_criteria_ids, num_fewshot):
stopping_criteria = self._get_stopping_criteria(stopping_criteria_ids)
if num_fewshot == 0:
generations = self.t5.generate(
context,
max_length=max_length,
eos_token_id=self.eot_token_id,
do_sample=False,
)
else:
generations = self.t5.generate(
context,
max_length=max_length,
stopping_criteria=stopping_criteria,
do_sample=False,
)
return generations[0]
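A minimal, hedged usage sketch for the T5LM wrapper above (editor illustration, not part of the diff). The "t5-small" checkpoint name and the example request pair are assumptions; any checkpoint loadable with AutoModelForSeq2SeqLM should behave the same way.
# Illustrative only: build the wrapper on CPU and score one (input, target) pair.
lm = T5LM(device="cpu", pretrained="t5-small", batch_size=1)
# Each request is an (encoder input, decoder target) pair; empty inputs are
# replaced by the EOS token inside loglikelihood().
requests = [("translate English to German: The house is small.", "Das Haus ist klein.")]
(logprob, is_greedy), = lm.loglikelihood(requests)
print(logprob, is_greedy)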
"""
A dataset of approximately 200K news headlines collected from HuffPost between 2012 and 2018.
Homepage: https://www.kaggle.com/datasets/rmisra/news-category-dataset
"""
from lm_eval.base import PromptSourceTask
_CITATION = """\
@book{book,
author = {Misra, Rishabh and Grover, Jigyasa},
year = {2021},
month = {01},
pages = {},
title = {Sculpting Data for ML: The first act of Machine Learning},
isbn = {978-0-578-83125-1}
}
@dataset{dataset,
author = {Misra, Rishabh},
year = {2018},
month = {06},
pages = {},
title = {News Category Dataset},
doi = {10.13140/RG.2.2.20331.18729}
}
"""
class HuffPost(PromptSourceTask):
VERSION = 0
DATASET_PATH = "khalidalt/HuffPost"
DATASET_NAME = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
from promptsource.templates import DatasetTemplates
from pprint import pprint
from typing import List, Union
import sacrebleu
import lm_eval.base
from . import superglue
from . import glue
from . import arc
......@@ -54,27 +52,15 @@ from . import blimp
from . import asdiv
from . import gsm8k
from . import storycloze
from . import hans
from . import gem_webnlg
from . import lama
# from . import e2e_nlg_cleaned
from . import gem_xsum
from . import gem_mlsum
from . import wino_bias
from . import e2e_nlg_cleaned
from . import gem_asset_turk
from . import crows_pairs_multilingual
from . import lama
from . import HuffPost
########################################
# Translation tasks
########################################
# 6 total
gpt3_translation_benchmarks = {
"wmt14": ["en-fr", "fr-en"], # French
"wmt16": ["en-ro", "ro-en", "de-en", "en-de"], # German, Romanian
"wmt14": ['en-fr', 'fr-en'], # French
"wmt16": ['en-ro', 'ro-en', 'de-en', 'en-de'], # German, Romanian
}
......@@ -82,7 +68,7 @@ gpt3_translation_benchmarks = {
selected_translation_benchmarks = {
**gpt3_translation_benchmarks,
"wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
"iwslt17": ["en-ar", "ar-en"], # Arabic
"iwslt17": ['en-ar', 'ar-en'] # Arabic
}
# 319 total
......@@ -106,7 +92,7 @@ TASK_REGISTRY = {
"rte": glue.RTE,
"qnli": glue.QNLI,
"qqp": glue.QQP,
# "stsb": glue.STSB, # not implemented yet
#"stsb": glue.STSB, # not implemented yet
"sst": glue.SST,
"wnli": glue.WNLI,
# SuperGLUE
......@@ -117,37 +103,38 @@ TASK_REGISTRY = {
"record": superglue.ReCoRD,
"wic": superglue.WordsInContext,
"wsc": superglue.SGWinogradSchemaChallenge,
# Order by benchmark/genre?
"coqa": coqa.CoQA,
"drop": drop.DROP,
"lambada": lambada.LAMBADA,
"lambada_cloze": lambada_cloze.LAMBADA_cloze,
**gem_webnlg.construct_tasks(),
# multilingual lambada
**gem_asset_turk.construct_tasks(),
**lambada_multilingual.construct_tasks(),
"wikitext": wikitext.WikiText,
# "cbt-cn": cbt.CBTCN, # disabled pending context length fix
# "cbt-ne": cbt.CBTNE, # disabled pending context length fix
"piqa": piqa.PiQA,
"prost": prost.PROST,
"mc_taco": mc_taco.MCTACO,
# Science related
"pubmedqa": pubmedqa.Pubmed_QA,
"sciq": sciq.SciQ,
"e2e_nlg_cleaned": e2e_nlg_cleaned.E2E_NLG_Cleaned,
"pubmedqa" : pubmedqa.Pubmed_QA,
"sciq" : sciq.SciQ,
"qasper": qasper.QASPER,
"qa4mre_2011": qa4mre.QA4MRE_2011,
"qa4mre_2012": qa4mre.QA4MRE_2012,
"qa4mre_2013": qa4mre.QA4MRE_2013,
"qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
"triviaqa": triviaqa.TriviaQA,
"arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet
"lama_trex": lama.Trex,
"lama_squad": lama.Squad,
"lama_google_re": lama.google_re,
"lama_concptnet": lama.Conceptnet,
"logiqa": logiqa.LogiQA,
"hellaswag": hellaswag.HellaSwag,
"swag": swag.SWAG,
......@@ -155,7 +142,7 @@ TASK_REGISTRY = {
"squad2": squad.SQuAD2,
"race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
"headqa_es": headqa.HeadQAEs,
"headqa_en": headqa.HeadQAEn,
"mathqa": mathqa.MathQA,
......@@ -165,20 +152,21 @@ TASK_REGISTRY = {
"anli_r1": anli.ANLIRound1,
"anli_r2": anli.ANLIRound2,
"anli_r3": anli.ANLIRound3,
"hans": hans.HANS,
"ethics_cm": hendrycks_ethics.EthicsCM,
"ethics_deontology": hendrycks_ethics.EthicsDeontology,
"ethics_justice": hendrycks_ethics.EthicsJustice,
"ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal,
"ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
"ethics_virtue": hendrycks_ethics.EthicsVirtue,
#"tydiqa_primary" : TyDiQA.Primary, not implemented yet
#"tydiqa_secondary" : TyDiQA.Secondary, not implemented yet
"truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
"truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
"truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
"truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
# dialogue
"mutual": mutual.MuTual,
"mutual_plus": mutual.MuTualPlus,
# math
"math_algebra": hendrycks_math.MathAlgebra,
"math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
......@@ -189,6 +177,7 @@ TASK_REGISTRY = {
"math_precalc": hendrycks_math.MathPrecalculus,
"math_asdiv": asdiv.Asdiv,
"gsm8k": gsm8k.GradeSchoolMath8K,
# arithmetic
"arithmetic_2da": arithmetic.Arithmetic2DPlus,
"arithmetic_2ds": arithmetic.Arithmetic2DMinus,
......@@ -202,18 +191,22 @@ TASK_REGISTRY = {
"arithmetic_1dc": arithmetic.Arithmetic1DComposite,
# TODO Perhaps make these groups of tasks
# e.g. anli, arithmetic, openai_translations, harness_translations
# hendrycksTest (57 tasks)
**hendrycks_test.create_all_tasks(),
# e.g. wmt14-fr-en
**translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
# chef's selection, mostly wmt20
**translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
# Word Scrambling and Manipulation Tasks
"anagrams1": unscramble.Anagrams1,
"anagrams2": unscramble.Anagrams2,
"cycle_letters": unscramble.CycleLetters,
"random_insertion": unscramble.RandomInsertion,
"reversed_words": unscramble.ReversedWords,
# Pile
"pile_arxiv": pile.PileArxiv,
"pile_books3": pile.PileBooks3,
......@@ -237,6 +230,7 @@ TASK_REGISTRY = {
"pile_ubuntu-irc": pile.PileUbuntuIrc,
"pile_wikipedia": pile.PileWikipedia,
"pile_youtubesubtitles": pile.PileYoutubeSubtitles,
# BLiMP
"blimp_adjunct_island": blimp.BlimpAdjunctIsland,
"blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
......@@ -305,45 +299,11 @@ TASK_REGISTRY = {
"blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
"blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
"blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
#GEM/mlsum
"mlsum_es":gem_mlsum.GEMMLSUMEs,
"mlsum_de":gem_mlsum.GEMMLSUMDe,
"mlsum_es_covid_challenge_set":gem_mlsum.GEMMLSUMEsChallgeTestCovid,
"mlsum_de_covid_challenge_set":gem_mlsum.GEMMLSUMDeChallgeTestCovid,
# Requires manual download of data.
# "storycloze_2016": storycloze.StoryCloze2016,
# "storycloze_2018": storycloze.StoryCloze2018,
# "sat": sat.SATAnalogies,
#GEM/xum
"gem_xsum": gem_xsum.GEMXSUM,
"gem_xsum_challenge_sample": gem_xsum.GEMXSUMChallgeSample,
"gem_xsum_challenge_test_backtranslation": gem_xsum.GEMXSUMChallgeTestBacktranslation,
"gem_xsum_challenge_test_bfp_02": gem_xsum.GEMXSUMChallgeTestBFP02,
"gem_xsum_challenge_test_bfp_05": gem_xsum.GEMXSUMChallgeTestBFP05,
"gem_xsum_challenge_test_nopunc": gem_xsum.GEMXSUMChallgeTestNopunc,
"gem_xsum_challenge_test_covid": gem_xsum.GEMXSUMChallgeTestCovid,
#LAMA
"lama-trex": lama.Trex,
"lama-squad": lama.Squad,
"lama-google_re": lama.google_re,
"lama-concptnet": lama.Conceptnet,
"bigscience-lama":lama.BigScienceLAMA,
# WinoBias
"wino_bias_type1_pro": wino_bias.WinoBiasType1Pro,
"wino_bias_type1_anti": wino_bias.WinoBiasType1Anti,
"wino_bias_type2_pro": wino_bias.WinoBiasType2Pro,
"wino_bias_type2_anti": wino_bias.WinoBiasType2Anti,
# Crows-Pairs
"crows_pairs_english": crows_pairs_multilingual.CrowsPairsEnglish,
"crows_pairs_french": crows_pairs_multilingual.CrowsPairsFrench,
# News
"huffpost": HuffPost.HuffPost,
}
......@@ -363,51 +323,19 @@ def get_task_name_from_object(task_object):
for name, class_ in TASK_REGISTRY.items():
if class_ is task_object:
return name
# this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
return (
task_object.EVAL_HARNESS_NAME
if hasattr(task_object, "EVAL_HARNESS_NAME")
else type(task_object).__name__
)
return task_object.EVAL_HARNESS_NAME if hasattr(task_object, "EVAL_HARNESS_NAME") else type(task_object).__name__
def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
task_name_dict = {
task_name: get_task(task_name)()
for task_name in task_name_list
if isinstance(task_name, str)
for task_name in task_name_list if isinstance(task_name, str)
}
task_name_from_object_dict = {
get_task_name_from_object(task_object): task_object
for task_object in task_name_list
if not isinstance(task_object, str)
for task_object in task_name_list if not isinstance(task_object, str)
}
assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
return {**task_name_dict, **task_name_from_object_dict}
def get_task_dict_promptsource(task_name_list: List[str]):
"""Loads a task instance for each prompt written for that task."""
task_name_dict = {}
for task_name in task_name_list:
assert isinstance(task_name, str)
# Static version of the Task. Use this to get the HF dataset path / name.
static_task_obj = get_task(task_name)
# Create the proper task name arg for DatasetTemplates.
sub_task = (
f"/{static_task_obj.DATASET_NAME}" if static_task_obj.DATASET_NAME else ""
)
ps_task_name = f"{static_task_obj.DATASET_PATH}{sub_task}"
task_prompts = DatasetTemplates(ps_task_name)
for prompt_name in task_prompts.all_template_names:
prompt = task_prompts[prompt_name]
# NOTE: We choose a sep that can be easily split.
task_name_dict[f"{task_name}+{prompt_name}"] = get_task(task_name)(
prompt=prompt
)
return task_name_dict
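For orientation, a hedged sketch of typical use of the two helpers above. The task names are assumptions (any key registered in TASK_REGISTRY works the same way), datasets are downloaded on first use, and the promptsource call assumes templates exist for that task's dataset.
# Plain tasks, keyed by their registry name.
task_dict = get_task_dict(["lambada", "arc_easy"])
print(sorted(task_dict))  # -> ['arc_easy', 'lambada']
# One instance per promptsource template, keyed "<task name>+<prompt name>".
ps_task_dict = get_task_dict_promptsource(["huffpost"])
print(sorted(ps_task_dict))  # e.g. ['huffpost+<prompt name>', ...]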
......@@ -10,7 +10,7 @@ provided explanations.
Homepage: "https://github.com/facebookresearch/anli"
"""
import numpy as np
from lm_eval.base import rf, PromptSourceTask
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
......@@ -30,7 +30,7 @@ _CITATION = """
"""
class ANLIBase(PromptSourceTask):
class ANLIBase(Task):
VERSION = 0
DATASET_PATH = "anli"
DATASET_NAME = None
......@@ -59,6 +59,51 @@ class ANLIBase(PromptSourceTask):
if self.has_test_docs():
return self.dataset["test_r" + str(self.SPLIT)]
def doc_to_text(self, doc):
# OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return " " + ["True", "Neither", "False"][doc['label']]
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_true, _ = rf.loglikelihood(ctx, " True")
ll_neither, _ = rf.loglikelihood(ctx, " Neither")
ll_false, _ = rf.loglikelihood(ctx, " False")
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = doc["label"]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
......
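A worked illustration of the ANLI scoring above, with invented log-likelihood values: construct_requests returns the scores for " True", " Neither", " False" in label order, so process_results reduces to an argmax compared against the gold label.
import numpy as np

# Hypothetical per-document log-likelihoods for " True", " Neither", " False".
ll_true, ll_neither, ll_false = -1.2, -3.0, -2.4
gold = 0  # label 0 = entailment, verbalized as "True"
pred = int(np.argmax((ll_true, ll_neither, ll_false)))
print({"acc": pred == gold})  # -> {'acc': True}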
......@@ -58,11 +58,10 @@ class Arithmetic(Task):
def construct_requests(self, doc, ctx):
ll, is_prediction = rf.loglikelihood(ctx, doc["completion"])
return ll, is_prediction
return is_prediction
def process_results(self, doc, results):
print(results)
results = results
is_prediction, = results
return {
"acc": is_prediction
}
......
......@@ -10,7 +10,7 @@ grammars.
Homepage: https://github.com/alexwarstadt/blimp
"""
from lm_eval.base import rf, PromptSourceTask
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
......@@ -31,7 +31,7 @@ _CITATION = """
"""
class BlimpTask(PromptSourceTask):
class BlimpTask(Task):
VERSION = 0
DATASET_PATH = "blimp"
......@@ -50,6 +50,58 @@ class BlimpTask(PromptSourceTask):
# trained on this data.
return self.dataset["train"]
def fewshot_context(self, doc, num_fewshot, provide_description=None, rnd=None, description=None):
assert num_fewshot == 0
assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print("WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict")
return ""
def doc_to_text(self, doc):
# this method is invoked by tests only
return ""
def doc_to_target(self, doc):
# this method is invoked by tests only
return ""
def construct_requests(self, doc, ctx):
assert not ctx
# Calculate the loglikelihood for the good and the bad sentence.
# Note that loglikelihood translates the "" prefix to the "<|endoftext|>" token
return [
rf.loglikelihood("", doc["sentence_good"]),
rf.loglikelihood("", doc["sentence_bad"]),
]
def process_results(self, doc, results):
likelihood1, likelihood2 = results
# the model got this case right iff the good sentence scored higher than the bad sentence
acc = 1.0 if likelihood1 > likelihood2 else 0.0
return {
"acc": acc,
}
def higher_is_better(self):
return {
"acc": True,
}
def aggregation(self):
return {
"acc": mean,
}
class BlimpAdjunctIsland(BlimpTask):
DATASET_NAME = "adjunct_island"
......
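A small hedged sketch of the BLiMP comparison above, with invented scores. Each loglikelihood result is a (logprob, is_greedy) pair, and tuples compare on the log-probability first, so the check below mirrors process_results.
# Hypothetical results for the good and the bad sentence.
likelihood_good = (-42.7, False)   # (logprob, is_greedy)
likelihood_bad = (-45.1, False)
acc = 1.0 if likelihood_good > likelihood_bad else 0.0
print({"acc": acc})  # -> {'acc': 1.0}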
......@@ -12,7 +12,7 @@ Homepage: https://stanfordnlp.github.io/coqa/
import inspect
import transformers.data.metrics.squad_metrics as squad_metrics
import lm_eval.datasets.coqa.coqa
from lm_eval.base import PromptSourceTask, Task, rf, mean
from lm_eval.base import Task, rf, mean
from itertools import zip_longest
......@@ -28,9 +28,9 @@ _CITATION = """
"""
class CoQA(PromptSourceTask):
class CoQA(Task):
VERSION = 1
DATASET_PATH = "coqa"
DATASET_PATH = inspect.getfile(lm_eval.datasets.coqa.coqa)
DATASET_NAME = None
def has_training_docs(self):
......@@ -51,21 +51,44 @@ class CoQA(PromptSourceTask):
def test_docs(self):
pass
# @classmethod
# def get_answers(cls, doc, turn_id):
# # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
# answers = []
# answer_forturn = doc["answers"]["input_text"][turn_id - 1]
# answers.append(answer_forturn)
# additional_answers = doc.get("additional_answers")
# if additional_answers:
# for key in additional_answers:
# additional_answer_for_turn = additional_answers[key]["input_text"][
# turn_id - 1
# ]
# if additional_answer_for_turn.lower() not in map(str.lower, answers):
# answers.append(additional_answer_for_turn)
# return answers
def doc_to_text(self, doc):
# Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# and a question qi, the task is to predict the answer ai
doc_text = doc["story"] + '\n\n'
for (q, a) in zip_longest(doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]): # omit target answer ai
question = f"Q: {q}\n\n"
answer = f"A: {a}\n\n" if a is not None else "A:"
doc_text += question + answer
return doc_text
@classmethod
def get_answers(cls, doc, turn_id):
# Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
answers = []
answer_forturn = doc["answers"]["input_text"][turn_id - 1]
answers.append(answer_forturn)
additional_answers = doc.get("additional_answers")
if additional_answers:
for key in additional_answers:
additional_answer_for_turn = additional_answers[key]["input_text"][turn_id - 1]
if additional_answer_for_turn.lower() not in map(str.lower, answers):
answers.append(additional_answer_for_turn)
return answers
@classmethod
def get_answer_choice(self, raw_text):
# Function maps answers to CoQA answer categories
# ~ 1/5 of the CoQA answers are Yes/No
# ~ 2/3 of the CoQA answers are span-based
# (answers overlap with the passage ignoring punctuation and case mismatch)
if raw_text == "unknown":
return '0'
if squad_metrics.normalize_answer(raw_text) == "yes":
return '1'
if squad_metrics.normalize_answer(raw_text) == "no":
return '2'
return '3' # Not a yes/no question
@staticmethod
def compute_scores(gold_list, pred):
......@@ -75,40 +98,40 @@ class CoQA(PromptSourceTask):
em_sum = 0.0
if len(gold_list) > 1:
for i in range(len(gold_list)):
gold_answers = gold_list[0:i] + gold_list[i + 1 :]
gold_answers = gold_list[0:i] + gold_list[i + 1:]
# predictions compared against (n) golds and take maximum
em_sum += max(
squad_metrics.compute_exact(a, pred) for a in gold_answers
)
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_answers)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_answers)
else:
em_sum += max(squad_metrics.compute_exact(a, pred) for a in gold_list)
f1_sum += max(squad_metrics.compute_f1(a, pred) for a in gold_list)
return {
"em": em_sum / max(1, len(gold_list)),
"f1": f1_sum / max(1, len(gold_list)),
}
return {'em': em_sum / max(1, len(gold_list)), 'f1': f1_sum / max(1, len(gold_list))}
# def stopping_criteria(self):
# return "\n\n"
def doc_to_target(self, doc, turnid=None):
# Default to prediction of last turn.
if turnid is None:
turnid = len(doc["questions"]["input_text"])
raw_text = doc['answers']["input_text"][turnid - 1]
return " " + raw_text
# def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM.
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
# :param doc:
# The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str
# The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question
# part of the document for `doc`.
# """
# return cont_request
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
cont_request = rf.greedy_until(ctx, ['\nQ:'])
return cont_request
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
......@@ -116,19 +139,16 @@ class CoQA(PromptSourceTask):
:param results:
The results of the requests created in construct_requests.
"""
target = self.doc_to_target(doc).strip()
pred = results[0].strip().split("\n")[0]
scores = self.compute_scores([target], pred)
turn_id = len(doc["questions"]["input_text"])
gold_list = self.get_answers(doc, turn_id)
pred = results[0].strip().split('\n')[0]
out = {
"f1": scores["f1"],
"em": scores["em"],
}
scores = self.compute_scores(gold_list, pred)
if self.save_examples:
example = {"target": target, "pred": pred}
return out, example
return out
return {
"f1": scores['f1'],
"em": scores['em'],
}
def higher_is_better(self):
return {
......
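A hedged check of the leave-one-out scoring in compute_scores above; the import path and strings are assumptions, and the result relies on SQuAD-style normalization (lower-casing, article and punctuation stripping).
from lm_eval.tasks.coqa import CoQA  # assumed import path

# With multiple references, each reference is held out in turn and the
# prediction is scored against the remaining golds, taking the maximum.
gold_list = ["the red dog", "a red dog", "red dog"]
pred = "red dog"
print(CoQA.compute_scores(gold_list, pred))  # -> {'em': 1.0, 'f1': 1.0}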
"""
French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than English
https://hal.inria.fr/hal-03629677/file/ACLFinal.pdf
Measuring social biases in masked language models in English and French.
https://gitlab.inria.fr/french-crows-pairs/acl-2022-paper-data-and-code/-/tree/main
"""
from lm_eval.base import PromptSourceTask
_CITATION = """\
@inproceedings{neveol2022french,
title={French CrowS-Pairs: Extending a challenge dataset for measuring social bias in masked language models to a language other than English},
author={N{\'e}v{\'e}ol, Aur{\'e}lie and Dupont, Yoann and Bezan{\c{c}}on, Julien and Fort, Kar{\"e}n},
booktitle={ACL 2022-60th Annual Meeting of the Association for Computational Linguistics},
year={2022}
"""
class CrowsPairsEnglish(PromptSourceTask):
VERSION = 0
DATASET_PATH = "oskarvanderwal/crows_pairs_multilingual"
DATASET_NAME = "english"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class CrowsPairsFrench(PromptSourceTask):
VERSION = 0
DATASET_PATH = "oskarvanderwal/crows_pairs_multilingual"
DATASET_NAME = "french"
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
......@@ -18,7 +18,7 @@ import re
import string
import lm_eval.datasets.drop.drop
from scipy.optimize import linear_sum_assignment
from lm_eval.base import PromptSourceTask, rf
from lm_eval.base import Task, rf
from lm_eval.metrics import mean
......@@ -37,9 +37,9 @@ _CITATION = """
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
class DROP(PromptSourceTask):
class DROP(Task):
VERSION = 1
DATASET_PATH = "drop" # inspect.getfile(lm_eval.datasets.drop.drop)
DATASET_PATH = inspect.getfile(lm_eval.datasets.drop.drop)
DATASET_NAME = None
def has_training_docs(self):
......@@ -52,13 +52,46 @@ class DROP(PromptSourceTask):
return False
def training_docs(self):
# if self._training_docs is None:
# self._training_docs = list()
# return self._training_docs
return self.dataset["train"]
if self._training_docs is None:
self._training_docs = list(map(self._process_doc, self.dataset["train"]))
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
return map(self._process_doc, self.dataset["validation"])
def _process_doc(self, doc):
return {
"id": doc["query_id"],
"passage": doc["passage"],
"question": doc["question"],
"answers": self.get_answers(doc),
}
@classmethod
def get_answers(cls, qa):
def _flatten_validated_answers(validated_answers):
""" Flattens a dict of lists of validated answers.
{"number": ['1', '8'], ...}
-> [{"number": ['1'], ...}, {"number": ['8'], ...}]
"""
vas = []
for i in range(len(validated_answers["number"])):
vas.append({
"number": validated_answers["number"][i],
"date": validated_answers["date"][i],
"spans": validated_answers["spans"][i],
})
return vas
answers = []
answers_set = set()
candidates = [qa["answer"]] + _flatten_validated_answers(qa["validated_answers"])
for candidate in candidates:
answer = cls.parse_answer(candidate)
if answer in answers_set:
continue
answers_set.add(answer)
answers.append(answer)
return answers
@classmethod
def parse_answer(cls, answer):
......@@ -67,31 +100,29 @@ class DROP(PromptSourceTask):
return (str(answer["number"]),)
if answer["spans"] != []:
return tuple(answer["spans"])
return (
" ".join(
[answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
).strip(),
)
return (" ".join([answer["date"]["day"],
answer["date"]["month"],
answer["date"]["year"]]).strip(),)
# def doc_to_text(self, doc):
# return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
def doc_to_text(self, doc):
return f"Passage: {doc['passage']}\nQuestion: {doc['question']}\nAnswer:"
# def doc_to_target(self, doc):
# return " " + ", ".join(doc["answers"][0])
def doc_to_target(self, doc):
return " " + ", ".join(doc["answers"][0])
# def construct_requests(self, doc, ctx):
# """Uses RequestFactory to construct Requests and returns an iterable of
# Requests which will be sent to the LM.
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
# :param doc:
# The document as returned from training_docs, validation_docs, or test_docs.
# :param ctx: str
# The context string, generated by fewshot_context. This includes the natural
# language description, as well as the few shot examples, and the question
# part of the document for `doc`.
# """
# conts = [rf.greedy_until(ctx, ["."])]
# return conts
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
conts = [rf.greedy_until(ctx, ["."])]
return conts
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
......@@ -103,21 +134,7 @@ class DROP(PromptSourceTask):
:param results:
The results of the requests created in construct_requests.
"""
pred = results[0].strip()
target = self.doc_to_target(doc).strip()
print("*" * 80)
print(f"DOC: {doc}")
print(f"PS: {self.prompt.apply(doc)}")
print(f"TEXT: {self.doc_to_text(doc)}")
print(f"TARGET: {target} END TARGET")
print(f"PRED: {pred} END PRED")
print("*" * 80)
preds = [pred]
golds = [target]
preds, golds = results, doc["answers"]
max_em = 0
max_f1 = 0
for gold_answer in golds:
......@@ -125,7 +142,10 @@ class DROP(PromptSourceTask):
if gold_answer[0].strip():
max_em = max(max_em, exact_match)
max_f1 = max(max_f1, f1_score)
return {"em": max_em, "f1": max_f1}
return {
"em": max_em,
"f1": max_f1
}
def get_metrics(self, predicted, gold):
"""
......@@ -138,9 +158,7 @@ class DROP(PromptSourceTask):
predicted_bags = self._answer_to_bags(predicted)
gold_bags = self._answer_to_bags(gold)
if set(predicted_bags[0]) == set(gold_bags[0]) and len(
predicted_bags[0]
) == len(gold_bags[0]):
if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(gold_bags[0]):
exact_match = 1.0
else:
exact_match = 0.0
......@@ -172,9 +190,7 @@ class DROP(PromptSourceTask):
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
if self._match_numbers_if_present(gold_item, pred_item):
scores[gold_index, pred_index] = self._compute_f1(
pred_item, gold_item
)
scores[gold_index, pred_index] = self._compute_f1(pred_item, gold_item)
row_ind, col_ind = linear_sum_assignment(-scores)
max_scores = np.zeros([max(len(gold), len(predicted))])
......@@ -240,11 +256,7 @@ class DROP(PromptSourceTask):
def _normalize(self, answer):
tokens = [
self._white_space_fix(
self._remove_articles(
self._fix_number(self._remove_punc(token.lower()))
)
)
self._white_space_fix(self._remove_articles(self._fix_number(self._remove_punc(token.lower()))))
for token in self._tokenize(answer)
]
tokens = [token for token in tokens if token.strip()]
......@@ -257,7 +269,10 @@ class DROP(PromptSourceTask):
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"em": mean, "f1": mean}
return {
"em": mean,
"f1": mean
}
def higher_is_better(self):
"""
......@@ -265,4 +280,7 @@ class DROP(PromptSourceTask):
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"em": True, "f1": True}
return {
"em": True,
"f1": True
}
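A hedged sketch of the DROP answer parsing above (the import path and the sample records are assumptions): parse_answer prefers the number field, then spans, then falls back to joining the date fields.
from lm_eval.tasks.drop import DROP  # assumed import path

number_answer = {"number": "3", "spans": [], "date": {"day": "", "month": "", "year": ""}}
span_answer = {"number": "", "spans": ["the Broncos"], "date": {"day": "", "month": "", "year": ""}}
print(DROP.parse_answer(number_answer))  # -> ('3',)
print(DROP.parse_answer(span_answer))    # -> ('the Broncos',)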
"""
Semantic Noise Matters for Neural Natural Language Generation
http://arxiv.org/abs/1911.03905
A cleaned version of the dataset from the E2E NLG Challenge.
The dataset contains MR with restaurant attributes and corresponding descriptions.
Homepage: https://github.com/tuetschek/e2e-cleaning
"""
from lm_eval.base import PromptSourceTask, rf
from lm_eval import metrics
_CITATION = """
@inproceedings{dusek-etal-2019-semantic,
title = "Semantic Noise Matters for Neural Natural Language Generation",
author = "Du{\v{s}}ek, Ond{\v{r}}ej and
Howcroft, David M. and
Rieser, Verena",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8652",
doi = "10.18653/v1/W19-8652",
pages = "421--426",
}
"""
# Work in progress
class E2E_NLG_Cleaned(PromptSourceTask):
VERSION = 0
DATASET_PATH = "e2e_nlg_cleaned"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
# We cache training documents in `self._training_docs` for faster
# few-shot processing. If the data is too large to fit in memory,
# return the training data as a generator instead of a list.
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
def max_generation_length(self):
return 64
def invalid_doc_for_prompt(self, doc) -> bool:
"""The QA prompts are not applicable to all the examples, we want to filter these out."""
return self.prompt.name.endswith("_qa") or self.prompt.name == "family_friendly_yes_no"
def doc_to_text(self, doc) -> str:
# if the response is not defined in PS, the text will be a single-element list containing an empty string
text = self.prompt.apply(doc)[0]
return text
def construct_requests(self, doc, ctx, args):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
_requests = []
# NOTE: In the future, target will be a list of strings.
request_args = {
"stopping_criteria": self.stopping_criteria(),
"max_generation_length": self.max_generation_length(),
"num_fewshot": args["num_fewshot"],
}
# Skip examples for which the templates are not applicable
if ctx != "":
cont_request = rf.greedy_until(ctx, request_args)
_requests.append(cont_request)
return _requests
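For orientation, a hedged sketch of the request arguments assembled by construct_requests above; the stop sequence is invented, since stopping_criteria() depends on the prompt template.
# Invented values mirroring the request_args dict built above.
request_args = {
    "stopping_criteria": "\n",    # assumed stop sequence
    "max_generation_length": 64,  # matches max_generation_length() above
    "num_fewshot": 0,
}
# rf.greedy_until(ctx, request_args) asks the LM to generate from `ctx` until
# the stop sequence or the generation budget is reached.
print(request_args)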
"""
ASSET: ASSET (Alva-Manchego et al., 2020) is a multi-reference dataset
for the evaluation of sentence simplification in English. The dataset
uses the same 2,359 sentences from TurkCorpus (Xu et al., 2016)
and each sentence is associated with 10 crowdsourced simplifications.
Unlike previous simplification datasets, which contain a single
transformation (e.g., lexical paraphrasing in TurkCorpus or sentence
splitting in HSplit), the simplifications in ASSET encompass a variety
of rewriting transformations.
https://aclanthology.org/2020.acl-main.424.pdf
TurkCorpus: TURKCorpus is a multi-reference dataset for the evaluation of
sentence simplification in English. The dataset consists of 2,359 sentences
from the Parallel Wikipedia Simplification (PWKP) corpus. Each sentence is
associated with 8 crowdsourced simplifications that focus on only lexical
paraphrasing (no sentence splitting or deletion).
https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@article{DBLP:journals/corr/abs-2005-00481,
author = {Fernando Alva{-}Manchego and
Louis Martin and
Antoine Bordes and
Carolina Scarton and
Beno{\^{\i}}t Sagot and
Lucia Specia},
title = {{ASSET:} {A} Dataset for Tuning and Evaluation of Sentence Simplification
Models with Multiple Rewriting Transformations},
journal = {CoRR},
volume = {abs/2005.00481},
year = {2020},
url = {https://arxiv.org/abs/2005.00481},
eprinttype = {arXiv},
eprint = {2005.00481},
timestamp = {Thu, 14 Oct 2021 16:38:25 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2005-00481.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}"""
""""@article{Xu-EtAl:2016:TACL,
author = {Wei Xu and Courtney Napoles and Ellie Pavlick and Quanze Chen and Chris Callison-Burch},
title = {Optimizing Statistical Machine Translation for Text Simplification},
journal = {Transactions of the Association for Computational Linguistics},
volume = {4},
year = {2016},
url = {https://cocoxu.github.io/publications/tacl2016-smt-simplification.pdf},
pages = {401--415}
}"""
class AssetTurk(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/wiki_auto_asset_turk"
DATASET_NAME = None
SPLIT = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
return self.dataset[str(self.SPLIT)]
def max_generation_length(self):
return 200
class AssetTest(AssetTurk):
SPLIT = "test_asset"
class TurkTest(AssetTurk):
SPLIT = "test_turk"
class AssetTest1(AssetTurk):
SPLIT = "challenge_test_asset_backtranslation"
class AssetTest2(AssetTurk):
SPLIT = "challenge_test_asset_bfp02"
class AssetTest3(AssetTurk):
SPLIT = "challenge_test_asset_bfp05"
class AssetTest4(AssetTurk):
SPLIT = "challenge_test_asset_nopunc"
class TurkTest1(AssetTurk):
SPLIT = "challenge_test_turk_backtranslation"
class TurkTest2(AssetTurk):
SPLIT = "challenge_test_turk_bfp02"
class TurkTest3(AssetTurk):
SPLIT = "challenge_test_turk_bfp05"
class TurkTest4(AssetTurk):
SPLIT = "challenge_test_turk_nopunc"
ASSET_TURK_CLASSES = [
AssetTest,
TurkTest,
TurkTest1,
TurkTest2,
TurkTest3,
TurkTest4,
AssetTest1,
AssetTest2,
AssetTest3,
AssetTest4,
]
def construct_tasks():
tasks = {}
for asset_turk_class in ASSET_TURK_CLASSES:
tasks[f"GEM/wiki_auto_asset_turk_{asset_turk_class.SPLIT}"] = asset_turk_class
return tasks
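A quick hedged illustration of the registry entries produced by construct_tasks() above; the keys follow the f-string in the loop, and only a few are listed.
# For example:
#   "GEM/wiki_auto_asset_turk_test_asset"                          -> AssetTest
#   "GEM/wiki_auto_asset_turk_test_turk"                           -> TurkTest
#   "GEM/wiki_auto_asset_turk_challenge_test_turk_backtranslation" -> TurkTest1
print(sorted(construct_tasks()))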
"""
MLSUM: The Multilingual Summarization Corpus
https://aclanthology.org/2020.emnlp-main.647/
This is the MLSUM subset of the GEM benchmark. MLSUM is the first large-scale MultiLingual SUMmarization dataset.
Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish.
Together with English newspapers from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community.
We report cross-lingual comparative analyses based on state-of-the-art systems.
These highlight existing biases which motivate the use of a multi-lingual dataset.
Homepage: https://gitlab.lip6.fr/scialom/mlsum_data/-/raw/master/MLSUM/
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@article{scialom2020mlsum,
title={MLSUM: The Multilingual Summarization Corpus},
author={Scialom, Thomas and Dray, Paul-Alexis and Lamprier, Sylvain and Piwowarski, Benjamin and Staiano, Jacopo},
journal={arXiv preprint arXiv:2004.14900},
year={2020}
}
"""
class GEMMLSUMEsBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/mlsum"
DATASET_NAME = "es"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class GEMMLSUMEs(GEMMLSUMEsBase):
'''Standard train/validation/test splits.'''
SPLIT = ''
class GEMMLSUMEsChallgeTestCovid(GEMMLSUMEsBase):
'''Challenge split: challenge_test_covid.'''
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
class GEMMLSUMDeBase(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/mlsum"
DATASET_NAME = "de"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
return self.dataset["test"]
class GEMMLSUMDe(GEMMLSUMDeBase):
'''Standard train/validation/test splits.'''
SPLIT = ''
class GEMMLSUMDeChallgeTestCovid(GEMMLSUMDeBase):
'''Challenge split: challenge_test_covid.'''
SPLIT = 'challenge_test_covid'
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def test_docs(self):
if self.has_test_docs():
return self.dataset[self.SPLIT]
"""
The 2020 Bilingual, Bi-Directional WebNLG+ Shared Task:
Overview and Evaluation Results (WebNLG+ 2020)
https://aclanthology.org/2020.webnlg-1.7/
WebNLG+ offers two challenges: (i) mapping sets of RDF triples
to English or Russian text (generation) and (ii) converting
English or Russian text to sets of RDF triples (semantic parsing).
Compared to the eponymous WebNLG challenge, WebNLG+ provides an
extended dataset that enables the training, evaluation, and
comparison of microplanners and semantic parsers. In this paper,
we present the results of the generation and semantic parsing
task for both English and Russian and provide a brief
description of the participating systems.
"""
from lm_eval.base import PromptSourceTask
_CITATION = """
@inproceedings{castro-ferreira-etal-2020-2020,
title = "The 2020 Bilingual, Bi-Directional {W}eb{NLG}+ Shared Task: Overview and Evaluation Results ({W}eb{NLG}+ 2020)",
author = "Castro Ferreira, Thiago and
Gardent, Claire and
Ilinykh, Nikolai and
van der Lee, Chris and
Mille, Simon and
Moussallem, Diego and
Shimorina, Anastasia",
booktitle = "Proceedings of the 3rd International Workshop on Natural Language Generation from the Semantic Web (WebNLG+)",
month = "12",
year = "2020",
address = "Dublin, Ireland (Virtual)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.webnlg-1.7",
pages = "55--76",
abstract = "WebNLG+ offers two challenges: (i) mapping sets of RDF triples to English or Russian text (generation) and (ii) converting English or Russian text to sets of RDF triples (semantic parsing). Compared to the eponymous WebNLG challenge, WebNLG+ provides an extended dataset that enable the training, evaluation, and comparison of microplanners and semantic parsers. In this paper, we present the results of the generation and semantic parsing task for both English and Russian and provide a brief description of the participating systems.",
}
"""
class WebNLG(PromptSourceTask):
VERSION = 0
DATASET_PATH = "GEM/web_nlg"
DATASET_NAME = "en"
SPLIT = None
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
if self.has_training_docs():
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]
def test_docs(self):
if self.has_test_docs():
if self.SPLIT is not None:
return self.dataset[str(self.SPLIT)]
else:
return self.dataset["test"]
def max_generation_length(self):
return 250
class WebNLGRu(WebNLG):
DATASET_NAME = "ru"
## En Challenge Sets
class WebNLGEn1(WebNLG):
SPLIT = "challenge_validation_sample"
class WebNLGEn2(WebNLG):
SPLIT = "challenge_test_scramble"
class WebNLGEn3(WebNLG):
SPLIT = "challenge_test_numbers"
## Ru Challenge sets
class WebNLGRu1(WebNLG):
DATASET_NAME = "ru"
SPLIT = "challenge_validation_sample"
class WebNLGRu2(WebNLG):
DATASET_NAME = "ru"
SPLIT = "challenge_test_scramble"
WEBNLG_CLASSES = [
WebNLG,
WebNLGRu,
WebNLGEn1,
WebNLGEn2,
WebNLGEn3,
WebNLGRu1,
WebNLGRu2,
]
def construct_tasks():
tasks = {}
for webnlg_class in WEBNLG_CLASSES:
if webnlg_class.SPLIT is None:
tasks[f"GEM/web_nlg_{webnlg_class.DATASET_NAME}"] = webnlg_class
else:
tasks[
f"GEM/web_nlg_{webnlg_class.DATASET_NAME}_{webnlg_class.SPLIT}"
] = webnlg_class
return tasks