"""
Semantic Parsing on Freebase from Question-Answer Pairs
https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf
WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs. The questions are supposed to be answerable by Freebase, a
large knowledge graph. The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a
"""
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
"""
class WebQs(Task):
VERSION = 0
DATASET_PATH = "web_questions"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
return "Question: " + doc["question"] + "\nAnswer:"
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["question"]
def doc_to_target(self, doc):
# this picks one answer to be the "correct" one, despite sometimes
# multiple correct answers being possible.
# TODO: make sure we're actually handling multi-answer correctly
return " " + doc["answers"][0]
def _remove_prefixes(self, aliases):
        # Optimization: remove any alias that has a strict prefix elsewhere in
        # the list. If the longer alias would be scored greedy (`is_greedy`),
        # its retained prefix would be too, so no acceptable answer is lost.
aliases.sort()
ret = [aliases[0]]
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
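    # Illustrative example (not from the original file): with
    # aliases = ["New York City", "NYC", "New York"], sorting yields
    # ["NYC", "New York", "New York City"]; "New York City" is then dropped
    # because "New York" is a strict prefix of it, leaving ["NYC", "New York"].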
def construct_requests(self, doc, ctx):
ret = []
for alias in self._remove_prefixes(doc["answers"]):
_, is_prediction = rf.loglikelihood(ctx, " " + alias)
ret.append(is_prediction)
return ret
def process_results(self, doc, results):
return {"acc": float(any(results))}
def aggregation(self):
return {
"acc": mean,
}
def higher_is_better(self):
return {"acc": True}
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.base import PerplexityTask
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
# contractions
string = string.replace("s '", "s'")
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
# number separators
string = string.replace(" @-@ ", "-")
string = string.replace(" @,@ ", ",")
string = string.replace(" @.@ ", ".")
# punctuation
string = string.replace(" : ", ": ")
string = string.replace(" ; ", "; ")
string = string.replace(" . ", ". ")
string = string.replace(" ! ", "! ")
string = string.replace(" ? ", "? ")
string = string.replace(" , ", ", ")
# double brackets
string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
# miscellaneous
string = string.replace("= = = =", "====")
string = string.replace("= = =", "===")
string = string.replace("= =", "==")
string = string.replace(" " + chr(176) + " ", chr(176))
string = string.replace(" \n", "\n")
string = string.replace("\n ", "\n")
string = string.replace(" N ", " 1 ")
string = string.replace(" 's", "'s")
return string
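# Illustrative round-trip (assumed input, not from the original file):
#   wikitext_detokenizer("the club 's record was 9 @.@ 5 , however")
#   -> "the club's record was 9.5, however"
# The " @.@ " / " @-@ " / " @,@ " sentinels mark number-internal punctuation
# in raw WikiText.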
class WikiText(PerplexityTask):
VERSION = 1
DATASET_PATH = "EleutherAI/wikitext_document_level"
DATASET_NAME = "wikitext-2-raw-v1"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return map(self._process_doc, self.dataset["train"])
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
return doc["page"]
def doc_to_target(self, doc):
return wikitext_detokenizer(doc)
def should_decontaminate(self):
return True
def count_words(self, doc):
# count number of words in *original doc before detokenization*
return len(re.split(r"\s+", doc))
"""
WinoGrande: An Adversarial Winograd Schema Challenge at Scale
https://arxiv.org/pdf/1907.10641.pdf
WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
task with binary options, the goal is to choose the right option for a given
sentence which requires commonsense reasoning.
NOTE: This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@article{sakaguchi2019winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
journal={arXiv preprint arXiv:1907.10641},
year={2019}
}
"""
class Winogrande(Task):
VERSION = 0
DATASET_PATH = "winogrande"
DATASET_NAME = "winogrande_xl"
answer_to_num = {"1": 0, "2": 1}
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
if self._training_docs is None:
self._training_docs = list(self.dataset["train"])
return self._training_docs
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
return self.partial_context(doc, doc["option" + doc["answer"]])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["sentence"]
@classmethod
def partial_context(cls, doc, option):
# Substitute the pronoun in the sentence with the specified option
# and ignore everything after.
pronoun_loc = doc["sentence"].index("_")
return doc["sentence"][:pronoun_loc] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
pronoun_loc = doc["sentence"].index("_") + 1
return " " + doc["sentence"][pronoun_loc:].strip()
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
target = self.partial_target(doc)
lls = []
for option in [doc["option1"], doc["option2"]]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
        ctx = ctx.split("\n\n")  # Fewshot examples are separated by blank lines.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
"""
The Winograd Schema Challenge
http://commonsensereasoning.org/2011/papers/Levesque.pdf
A Winograd schema is a pair of sentences that differ in only one or two words
and that contain an ambiguity that is resolved in opposite ways in the two
sentences and requires the use of world knowledge and reasoning for its resolution.
The Winograd Schema Challenge 273 is a collection of 273 such Winograd schemas.
NOTE: This evaluation of Winograd Schema Challenge is based on `partial evaluation`
as described by Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
Homepage: https://cs.nyu.edu/~davise/papers/WinogradSchemas/WS.html
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATION = """
@inproceedings{ea01b9c0db064caca6986b925d75f2bb,
title = "The winograd schema challenge",
abstract = "In this paper, we present an alternative to the Turing Test that has some conceptual and practical advantages. A Wino-grad schema is a pair of sentences that differ only in one or two words and that contain a referential ambiguity that is resolved in opposite directions in the two sentences. We have compiled a collection of Winograd schemas, designed so that the correct answer is obvious to the human reader, but cannot easily be found using selectional restrictions or statistical techniques over text corpora. A contestant in the Winograd Schema Challenge is presented with a collection of one sentence from each pair, and required to achieve human-level accuracy in choosing the correct disambiguation.",
author = "Levesque, {Hector J.} and Ernest Davis and Leora Morgenstern",
year = "2012",
language = "English (US)",
isbn = "9781577355601",
series = "Proceedings of the International Conference on Knowledge Representation and Reasoning",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "552--561",
booktitle = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012",
note = "13th International Conference on the Principles of Knowledge Representation and Reasoning, KR 2012 ; Conference date: 10-06-2012 Through 14-06-2012",
}
"""
class WinogradSchemaChallenge273(Task):
VERSION = 0
DATASET_PATH = "winograd_wsc"
DATASET_NAME = "wsc273"
upper_pronouns = [
"A",
"An",
"The",
"She",
"He",
"It",
"They",
"My",
"His",
"Her",
"Their",
]
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def test_docs(self):
return map(self._process_doc, self.dataset["test"])
def _process_doc(self, doc):
# The HF implementation of `wsc273` is not `partial evaluation` friendly.
doc["text"] = doc["text"].replace(" ", " ")
doc["options"][0] = self.__normalize_option(doc, doc["options"][0])
doc["options"][1] = self.__normalize_option(doc, doc["options"][1])
return doc
def __normalize_option(self, doc, option):
# Append `'s` to possessive determiner based options.
if doc["pronoun"].lower() in ["my", "his", "her", "our", "their"]:
option += "'s"
# Appropriately lowercase the pronoun in the option.
pronoun = option.split()[0]
start_of_sentence = doc["text"][doc["pronoun_loc"] - 2] == "."
if not start_of_sentence and pronoun in self.upper_pronouns:
return option.replace(pronoun, pronoun.lower())
return option
def fewshot_examples(self, k, rnd):
# NOTE: `super().fewshot_examples` samples from training docs which are
# not available for this test-set-only dataset.
if self._fewshot_docs is None:
self._fewshot_docs = list(self.test_docs())
return rnd.sample(list(self._fewshot_docs), k)
def doc_to_text(self, doc):
return self.partial_context(doc, doc["options"][doc["label"]])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["text"]
@classmethod
def partial_context(cls, doc, option):
# Substitute the pronoun in the original text with the specified
# option and ignore everything after.
return doc["text"][: doc["pronoun_loc"]] + option
def doc_to_target(self, doc):
return self.partial_target(doc)
@classmethod
def partial_target(cls, doc):
# The target is everything after the document specified pronoun.
start_index = doc["pronoun_loc"] + len(doc["pronoun"])
return " " + doc["text"][start_index:].strip()
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
target = self.partial_target(doc)
lls = []
for option in doc["options"]:
partial_ctx = self.partial_context(doc, option)
full_ctx = self.append_context(ctx, partial_ctx)
lls.append(rf.loglikelihood(full_ctx, target)[0])
return lls
@classmethod
def append_context(cls, ctx, partial_ctx):
        ctx = ctx.split("\n\n")  # Fewshot examples are separated by blank lines.
ctx.pop() # Remove the correct context put in by `doc_to_text`.
return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
return {"acc": np.argmax(results) == doc["label"]}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
"""
XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
Homepage: https://github.com/cambridgeltl/xcopa
"""
from .superglue import Copa
_CITATION = """
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
author={Edoardo M. Ponti, Goran Glava\v{s}, Olga Majewska, Qianchu Liu, Ivan Vuli\'{c} and Anna Korhonen},
booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2020},
url={https://ducdauge.github.io/files/xcopa.pdf}
}
"""
class XCopa(Copa):
VERSION = 0
DATASET_PATH = "xcopa"
DATASET_NAME = None
CAUSE = "because"
EFFECT = "therefore"
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# Drop the period
connector = {
"cause": self.CAUSE,
"effect": self.EFFECT,
}[doc["question"]]
return doc["premise"].strip()[:-1] + f" {connector}"
class XCopaEt(XCopa):
DATASET_NAME = "et"
CAUSE = "sest"
EFFECT = "seetõttu"
class XCopaHt(XCopa):
DATASET_NAME = "ht"
CAUSE = "poukisa"
EFFECT = "donk sa"
class XCopaIt(XCopa):
DATASET_NAME = "it"
CAUSE = "perché"
EFFECT = "quindi"
class XCopaId(XCopa):
DATASET_NAME = "id"
CAUSE = "karena"
EFFECT = "maka"
class XCopaQu(XCopa):
DATASET_NAME = "qu"
CAUSE = "imataq"
EFFECT = "chaymi"
class XCopaSw(XCopa):
DATASET_NAME = "sw"
CAUSE = "kwa sababu"
EFFECT = "kwa hiyo"
class XCopaZh(XCopa):
DATASET_NAME = "zh"
CAUSE = "因为"
EFFECT = "所以"
class XCopaTa(XCopa):
DATASET_NAME = "ta"
CAUSE = "காரணமாக"
EFFECT = "எனவே"
class XCopaTh(XCopa):
DATASET_NAME = "th"
CAUSE = "เพราะ"
EFFECT = "ดังนั้น"
class XCopaTr(XCopa):
DATASET_NAME = "tr"
CAUSE = "çünkü"
EFFECT = "bu yüzden"
class XCopaVi(XCopa):
DATASET_NAME = "vi"
CAUSE = "bởi vì"
EFFECT = "vì vậy"
LANGS = ["et", "ht", "it", "id", "qu", "sw", "zh", "ta", "th", "tr", "vi"]
LANG_CLASSES = [
XCopaEt,
XCopaHt,
XCopaIt,
XCopaId,
XCopaQu,
XCopaSw,
XCopaZh,
XCopaTa,
XCopaTh,
XCopaTr,
XCopaVi,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"xcopa_{lang}"] = lang_class
return tasks
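# Illustrative registry usage (not from the original file):
#   construct_tasks() -> {"xcopa_et": XCopaEt, ..., "xcopa_vi": XCopaVi}
# so callers can merge the mapping into the harness's task registry.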
"""
XNLI: Evaluating Cross-lingual Sentence Representations
https://arxiv.org/abs/1809.05053
Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
Prompt format (same as XGLM and mGPT):
sentence1 + ", right? " + mask = (Yes|Also|No) + ", " + sentence2
The prediction is the full sequence with the highest likelihood.
Language-specific prompts are translated word-by-word with Google Translate
and may differ from the ones used by mGPT and XGLM (they do not provide their prompts).
Homepage: https://github.com/facebookresearch/XNLI
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
from lm_eval import utils
_CITATION = """
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
class XNLIBase(Task):
VERSION = 0
DATASET_PATH = "xnli"
DATASET_NAME = None
QUESTION_WORD = None # 'right'
ENTAILMENT_LABEL = None # 'Yes'
NEUTRAL_LABEL = None # 'Also'
CONTRADICTION_LABEL = None # 'No'
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["validation"]
def test_docs(self):
return self.dataset["test"]
def doc_to_text(self, doc):
# Example:
# The girl that can help me is all the way across town, right? Yes, The girl I need help from lives a ways away.
# [MASK] is replaced with ENTAILMENT_LABEL, NEUTRAL_LABEL, or CONTRADICTION_LABEL
return (
doc["premise"]
+ ", "
+ self.QUESTION_WORD
+ "? [MASK], "
+ doc["hypothesis"]
)
def doc_to_target(self, doc):
# True = entailment
# False = contradiction
# Neither = neutral
return (
" "
+ [self.ENTAILMENT_LABEL, self.NEUTRAL_LABEL, self.CONTRADICTION_LABEL][
doc["label"]
]
)
def doc_to_fewshot_prompt(self, doc):
prompt = self.doc_to_text(doc)
return prompt.replace("[MASK]", self.doc_to_target(doc)[1:])
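    # Illustrative example (assumed doc, not from the original file): with
    #   premise = "The girl can help me", hypothesis = "The girl lives nearby",
    #   label = 1 (neutral), and the English labels defined below,
    # doc_to_text -> "The girl can help me, right? [MASK], The girl lives nearby"
    # doc_to_fewshot_prompt -> "The girl can help me, right? Also, The girl lives nearby"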
def construct_requests(self, doc, ctx):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_true = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.ENTAILMENT_LABEL))
ll_neither = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.NEUTRAL_LABEL))
ll_false = rf.loglikelihood_rolling(
ctx.replace("[MASK]", self.CONTRADICTION_LABEL)
)
return ll_true, ll_neither, ll_false
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = doc["label"]
pred = np.argmax(results)
return {"acc": pred == gold}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {"acc": mean}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {"acc": True}
@utils.positional_deprecated
def fewshot_context(
self, doc, num_fewshot, provide_description=None, rnd=None, description=None
):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
:param doc: str
The document as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int
The number of fewshot examples to provide in the returned context string.
:param provide_description: bool
Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method
:param rnd: random.Random
The pseudo-random number generator used to randomly sample examples.
WARNING: This is currently a required arg although it's optionalized with a default `None`.
:param description: str
The task's description that will be prepended to the fewshot examples.
:returns: str
The fewshot context.
"""
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`"
assert not provide_description, (
"The `provide_description` arg will be removed in future versions. To prepend "
"a custom description to the context, supply the corresponding string via the "
"`description` arg."
)
if provide_description is not None:
# nudge people to not specify it at all
print(
"WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
)
description = description + "\n\n" if description else ""
if num_fewshot == 0:
labeled_examples = ""
else:
# for sets with no training docs, draw from other set *but ensure no overlap with current doc*
if self.has_training_docs():
fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
else:
if self._fewshot_docs is None:
self._fewshot_docs = list(
self.validation_docs()
if self.has_validation_docs()
else self.test_docs()
)
fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
# get rid of the doc that's the one we're evaluating, if it's in the fewshot
fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
labeled_examples = (
"\n\n".join(
[
# self.doc_to_text(doc) + self.doc_to_target(doc)
self.doc_to_fewshot_prompt(doc)
for doc in fewshotex
]
)
+ "\n\n"
)
example = self.doc_to_text(doc)
return description + labeled_examples + example
class XNLI_en(XNLIBase): # English
DATASET_NAME = "en"
QUESTION_WORD = "right"
ENTAILMENT_LABEL = "Yes"
NEUTRAL_LABEL = "Also"
CONTRADICTION_LABEL = "No"
class XNLI_de(XNLIBase): # German
DATASET_NAME = "de"
QUESTION_WORD = "richtig"
ENTAILMENT_LABEL = "Ja"
NEUTRAL_LABEL = "Auch"
CONTRADICTION_LABEL = "Nein"
class XNLI_ar(XNLIBase): # Arabic
DATASET_NAME = "ar"
QUESTION_WORD = "صحيح"
ENTAILMENT_LABEL = "نعم"
NEUTRAL_LABEL = "لذا"
CONTRADICTION_LABEL = "رقم"
class XNLI_bg(XNLIBase): # Bulgarian
DATASET_NAME = "bg"
QUESTION_WORD = "правилно"
ENTAILMENT_LABEL = "да"
NEUTRAL_LABEL = "така"
CONTRADICTION_LABEL = "не"
class XNLI_el(XNLIBase): # Greek
DATASET_NAME = "el"
QUESTION_WORD = "σωστός"
ENTAILMENT_LABEL = "Ναί"
NEUTRAL_LABEL = "Έτσι"
CONTRADICTION_LABEL = "όχι"
class XNLI_es(XNLIBase): # Spanish
DATASET_NAME = "es"
QUESTION_WORD = "correcto"
ENTAILMENT_LABEL = "Sí"
NEUTRAL_LABEL = "Asi que"
CONTRADICTION_LABEL = "No"
class XNLI_fr(XNLIBase): # French
DATASET_NAME = "fr"
QUESTION_WORD = "correct"
ENTAILMENT_LABEL = "Oui"
NEUTRAL_LABEL = "Aussi"
CONTRADICTION_LABEL = "Non"
class XNLI_hi(XNLIBase): # Hindi
DATASET_NAME = "hi"
QUESTION_WORD = "सही"
ENTAILMENT_LABEL = "हाँ"
NEUTRAL_LABEL = "इसलिए"
CONTRADICTION_LABEL = "नहीं"
class XNLI_ru(XNLIBase): # Russian
DATASET_NAME = "ru"
QUESTION_WORD = "правильно"
ENTAILMENT_LABEL = "Да"
NEUTRAL_LABEL = "Так"
CONTRADICTION_LABEL = "Нет"
class XNLI_sw(XNLIBase): # Swahili
DATASET_NAME = "sw"
QUESTION_WORD = "sahihi"
ENTAILMENT_LABEL = "Ndiyo"
NEUTRAL_LABEL = "Hivyo"
CONTRADICTION_LABEL = "Hapana"
class XNLI_th(XNLIBase): # Thai
DATASET_NAME = "th"
QUESTION_WORD = "ถูกต้อง"
ENTAILMENT_LABEL = "ใช่"
NEUTRAL_LABEL = "ดังนั้น"
CONTRADICTION_LABEL = "ไม่"
class XNLI_tr(XNLIBase): # Turkish
DATASET_NAME = "tr"
QUESTION_WORD = "doğru"
ENTAILMENT_LABEL = "Evet"
NEUTRAL_LABEL = "Böylece"
CONTRADICTION_LABEL = "Hayır"
class XNLI_ur(XNLIBase): # Urdu
DATASET_NAME = "ur"
QUESTION_WORD = "صحیح"
ENTAILMENT_LABEL = "جی ہاں"
NEUTRAL_LABEL = "اس لئے"
CONTRADICTION_LABEL = "نہیں"
class XNLI_vi(XNLIBase): # Vietnamese
DATASET_NAME = "vi"
QUESTION_WORD = "đúng"
ENTAILMENT_LABEL = "Vâng"
NEUTRAL_LABEL = "Vì vậy"
CONTRADICTION_LABEL = "Không"
class XNLI_zh(XNLIBase): # Chinese
DATASET_NAME = "zh"
QUESTION_WORD = "正确"
ENTAILMENT_LABEL = "是的"
NEUTRAL_LABEL = "所以"
CONTRADICTION_LABEL = "不是的"
LANGS = [
"ar",
"bg",
"de",
"el",
"en",
"es",
"fr",
"hi",
"ru",
"sw",
"th",
"tr",
"ur",
"vi",
"zh",
]
LANG_CLASSES = [
XNLI_ar,
XNLI_bg,
XNLI_de,
XNLI_el,
XNLI_en,
XNLI_es,
XNLI_fr,
XNLI_hi,
XNLI_ru,
XNLI_sw,
XNLI_th,
XNLI_tr,
XNLI_ur,
XNLI_vi,
XNLI_zh,
]
def construct_tasks():
tasks = {}
for lang, lang_class in zip(LANGS, LANG_CLASSES):
tasks[f"xnli_{lang}"] = lang_class
return tasks
"""
Few-shot Learning with Multilingual Language Models
https://arxiv.org/abs/2112.10668
XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
"""
from .storycloze import StoryCloze
_CITATION = """
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
_LANG = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of subjects
:return: {task_name: task}
"""
return {f"xstory_cloze_{lang}": create_task(lang) for lang in _LANG}
def create_task(lang):
class XStoryCloze(StoryCloze):
DATASET_PATH = "juletxara/xstory_cloze"
DATASET_NAME = lang
def __init__(self):
super().__init__(data_dir="")
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return self.dataset["train"]
def validation_docs(self):
return self.dataset["eval"]
def test_docs(self):
pass
return XStoryCloze
"""
It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning
https://arxiv.org/abs/2106.12066
Multilingual winograd schema challenge that includes English, French, Japanese, Portuguese, Russian and Chinese. Winograd schema challenges come from the XWinograd dataset introduced in Tikhonov et al. As it only contains 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
Homepage: https://huggingface.co/datasets/Muennighoff/xwinograd
"""
from .winogrande import Winogrande
_CITATION = """
@misc{muennighoff2022crosslingual,
title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
year={2022},
eprint={2211.01786},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{tikhonov2021heads,
title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
author={Alexey Tikhonov and Max Ryabinin},
year={2021},
eprint={2106.12066},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_LANG = ["en", "fr", "jp", "pt", "ru", "zh"]
def create_all_tasks():
"""Creates a dictionary of tasks from a list of subjects
:return: {task_name: task}
"""
return {f"xwinograd_{lang}": create_task(lang) for lang in _LANG}
def create_task(lang):
class XWinograd(Winogrande):
DATASET_PATH = "Muennighoff/xwinograd"
DATASET_NAME = lang
def __init__(self):
super().__init__()
def has_training_docs(self):
return False
def has_validation_docs(self):
return False
def has_test_docs(self):
return True
def training_docs(self):
pass
def validation_docs(self):
pass
def test_docs(self):
return self.dataset["test"]
return XWinograd
import os
import pathlib
import re
import collections
import functools
import inspect
import sys
import fnmatch
from typing import List, Union
import gc
import torch
from omegaconf import OmegaConf
class ExitCodeError(Exception):
pass
def sh(x):
if os.system(x):
raise ExitCodeError()
def escaped_split(text, sep_char, maxsplit=-1):
"""Split text into a list on occurrences of the given separation
character `sep_char`. The separation character may be escaped by a
backslash to avoid splitting at that location.
The separation character must be a string of size 1.
If `maxsplit` is given, at most `maxsplit` splits are done (thus,
the list will have at most `maxsplit + 1` elements). If `maxsplit`
is not specified or less than 0, then there is no limit on the
number of splits (all possible splits are made).
"""
assert (
len(sep_char) == 1
), "separation string must be a single character for escaped splitting"
    if maxsplit == 0:
        # Return a list for consistency with the splitting branch below.
        return [text]
maxsplit = max(0, maxsplit)
return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
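# Illustrative example (not from the original file):
#   escaped_split(r"a,b\,c,d", ",") -> ['a', 'b\\,c', 'd']
# The backslash before the second comma suppresses the split at that point.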
def simple_parse_args_string(args_string):
"""
Parses something like
args1=val1,arg2=val2
Into a dictionary
"""
args_string = args_string.strip()
if not args_string:
return {}
arg_list = args_string.split(",")
args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
return args_dict
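# Illustrative example (not from the original file):
#   simple_parse_args_string("pretrained=gpt2,dtype=float16")
#   -> {"pretrained": "gpt2", "dtype": "float16"}
# OmegaConf's dotlist parsing also coerces scalars, e.g. "batch_size=8"
# yields the int 8 rather than the string "8".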
def join_iters(iters):
for iter in iters:
yield from iter
def chunks(iter, n=0, fn=None):
arr = []
for i, x in enumerate(iter):
arr.append(x)
if len(arr) == (fn(i) if fn else n):
yield arr
arr = []
if arr:
yield arr
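# Illustrative example (not from the original file):
#   list(chunks(range(5), n=2)) -> [[0, 1], [2, 3], [4]]
# Passing `fn` instead of `n` allows chunk sizes that vary with the index.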
def group(arr, fn):
res = collections.defaultdict(list)
for ob in arr:
res[fn(ob)].append(ob)
return list(res.values())
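# Illustrative example (not from the original file):
#   group([1, 2, 3, 4], lambda x: x % 2) -> [[1, 3], [2, 4]]
# Groups preserve first-seen key order (dicts are insertion-ordered).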
def _is_json_task(task_name):
return task_name == "json" or task_name.startswith("json=")
class MultiChoice:
def __init__(self, choices):
self.choices = choices
# Simple wildcard support (linux filename patterns)
def __contains__(self, values):
for value in values.split(","):
if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(
value
):
return False
return True
def __iter__(self):
for choice in self.choices:
yield choice
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
task_names = set()
for pattern in patterns:
if _is_json_task(pattern):
task_names.add(pattern)
for matching in fnmatch.filter(source_list, pattern):
task_names.add(matching)
return sorted(list(task_names))
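# Illustrative example (not from the original file):
#   pattern_match(["xcopa_*"], ["xcopa_et", "xcopa_it", "xnli_en"])
#   -> ["xcopa_et", "xcopa_it"]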
def general_detokenize(string):
string = string.replace(" n't", "n't")
string = string.replace(" )", ")")
string = string.replace("( ", "(")
string = string.replace('" ', '"')
string = string.replace(' "', '"')
string = re.sub(r" (['.,])", r"\1", string)
return string
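# Illustrative example (not from the original file):
#   general_detokenize("She did n't go ( today ) , okay ?")
#   -> "She didn't go (today), okay ?"
# Only the listed punctuation patterns are collapsed; the trailing "?" is
# left untouched.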
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
condition on some context
:param token_list: list
List of tokens to be PREDICTED
:param max_seq_len: int
max_seq_len of model (or max_seq_len we want to use)
:param context_len: int
Amount of desired token context for prediction. Needs to be at least 1.
:param prefix_token: token
Dummy token like <eos> so the first token has something to condition on
:return: generator
Generator of tuples
(input_tokens, pred_tokens)
Note: Score only the last len(pred_tokens) logits of the LM
"""
assert 1 <= context_len <= max_seq_len
if not token_list:
return
# +1 offset, going from input->preds
pred_len = max_seq_len - context_len + 1
predicted = 0
# Special handling for first window: predict all tokens
first_seq_len = min(max_seq_len, len(token_list))
yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
predicted += first_seq_len
while predicted < len(token_list):
window_pred_len = min(len(token_list) - predicted, pred_len)
window_end = predicted + window_pred_len
yield (
token_list[window_end - max_seq_len - 1 : window_end - 1],
token_list[window_end - window_pred_len : window_end],
)
predicted += window_pred_len
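# Illustrative walk-through (assumed values, not from the original file):
#   list(get_rolling_token_windows(list(range(1, 11)), prefix_token=0,
#                                  max_seq_len=4, context_len=1))
#   -> [([0, 1, 2, 3], [1, 2, 3, 4]),
#       ([4, 5, 6, 7], [5, 6, 7, 8]),
#       ([6, 7, 8, 9], [9, 10])]
# The first window predicts every token; later windows condition each
# prediction on at least `context_len` preceding tokens.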
def make_disjoint_window(pair):
"""Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
a, b = pair
return a[: len(a) - (len(b) - 1)], b
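# Illustrative example (not from the original file):
#   make_disjoint_window(([6, 7, 8, 9], [9, 10])) -> ([6, 7, 8], [9, 10])
# The context is truncated so it no longer overlaps the continuation.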
def select_continuation_from_batch_left_padding(
generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
"""Select the continuation from the batch, removing prompts of different lengths.
Args:
generations (Union[List[List[int]], torch.Tensor]):
A tensor or list-of-lists of shape [batch_size, sequence length].
max_context_size (int):
The size of the biggest context; generations will proceed from that
index.
Example:
PAD PAD Continue : The dog chased the cat [every day of the week]
Riddle me this : The dog chased the cat [yesterday] PAD PAD PAD PAD
Output:
[every day of the week]
[yesterday] PAD PAD PAD PAD
"""
return generations[:, max_context_size:]
class Reorderer:
def __init__(self, arr, fn):
self.size = len(arr)
arr = list(enumerate(arr))
arr = group(arr, lambda x: fn(x[1]))
arr = [([y[0] for y in x], x[0][1]) for x in arr]
arr.sort(key=lambda x: fn(x[1]))
self.arr = arr
def get_reordered(self):
return [x[1] for x in self.arr]
def get_original(self, newarr):
res = [None] * self.size
cov = [False] * self.size
for (inds, _), v in zip(self.arr, newarr):
for ind in inds:
res[ind] = v
cov[ind] = True
assert all(cov)
return res
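# Illustrative example (not from the original file):
#   ro = Reorderer(["bb", "a", "ccc"], len)
#   ro.get_reordered()                   -> ["a", "bb", "ccc"]
#   ro.get_original(["A", "BB", "CCC"])  -> ["BB", "A", "CCC"]
# Requests are sorted (e.g. by length) for efficient batching, then results
# are scattered back to their original positions.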
def positional_deprecated(fn):
"""
A decorator to nudge users into passing only keyword args (`kwargs`) to the
wrapped function, `fn`.
"""
@functools.wraps(fn)
def _wrapper(*args, **kwargs):
        # Bound methods always receive `self` as one positional argument.
        # (Parentheses are required: without them the conditional expression
        # binds looser than `!=` and the check never fires for functions.)
        if len(args) != (1 if inspect.ismethod(fn) else 0):
print(
f"WARNING: using {fn.__name__} with positional arguments is "
"deprecated and will be disallowed in a future version of "
"lm-evaluation-harness!"
)
return fn(*args, **kwargs)
return _wrapper
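# Illustrative usage (not from the original file):
#   @positional_deprecated
#   def evaluate(model=None, tasks=None): ...
#   evaluate(model="gpt2")   # silent
#   evaluate("gpt2")         # prints the deprecation warning, still works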
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
"""
Search upward in the directory tree to a maximum of three layers
to find and return the package root (containing the 'tests' folder)
"""
cur_path = start_path.resolve()
max_layers = 3
for _ in range(max_layers):
if (cur_path / "tests" / "test_version_stable.py").exists():
return cur_path
else:
cur_path = cur_path.parent.resolve()
raise FileNotFoundError(
f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
)
@positional_deprecated
def run_task_tests(task_list: List[str]):
"""
Find the package root and run the tests for the given tasks
"""
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list)
args = [
f"{package_root}/tests/test_version_stable.py",
f"--rootdir={package_root}",
"-k",
f"{task_string}",
]
sys.path.append(str(package_root))
pytest_return_val = pytest.main(args)
if pytest_return_val:
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
def clear_torch_cache():
gc.collect()
torch.cuda.empty_cache()
import argparse
import json
import logging
import os
from lm_eval import tasks, evaluator, utils
logging.getLogger("openai").setLevel(logging.WARNING)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=str, default=None)
parser.add_argument("--max_batch_size", type=int, default=None,
help="Maximal batch size to try with --batch_size auto")
parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=float, default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.")
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true")
parser.add_argument("--write_out", action="store_true", default=False)
parser.add_argument("--output_base_path", type=str, default=None)
return parser.parse_args()
def main():
args = parse_args()
assert not args.provide_description # not implemented
if args.limit:
print(
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
if args.tasks is None:
task_names = tasks.ALL_TASKS
else:
task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
print(f"Selected Tasks: {task_names}")
description_dict = {}
if args.description_dict_path:
with open(args.description_dict_path, "r") as f:
description_dict = json.load(f)
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
tasks=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
device=args.device,
no_cache=args.no_cache,
limit=args.limit,
description_dict=description_dict,
decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity,
write_out=args.write_out,
output_base_path=args.output_base_path,
)
dumped = json.dumps(results, indent=2)
print(dumped)
if args.output_path:
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
with open(args.output_path, "w") as f:
f.write(dumped)
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
)
print(evaluator.make_table(results))
if __name__ == "__main__":
main()
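# Example invocation (illustrative; the model and task names are placeholders):
#   python main.py --model hf-causal --model_args pretrained=gpt2 \
#       --tasks xcopa_it,xnli_en --num_fewshot 0 --device cuda:0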