Unverified Commit d1451679 authored by Julen Etxaniz's avatar Julen Etxaniz Committed by GitHub
Browse files

Add multilingual datasets (XCOPA, XStoryCloze, XWinograd, PAWS-X, XNLI, MGSM) (#426)

* add xcopa dataset

* add xstory_cloze dataset and run pre-commit

* fix xcopa validation and test sets

* add xwinograd dataset

* add pawsx task

* add xnli task

* update task table with recently added tasks

* remove unused metrics from paws-x

* add mgsm task and fix gsm8k

* fix gsm8k until

* update task table
parent 05550ef3
......@@ -247,7 +247,7 @@ class TruthfulQAGeneration(Task):
part of the document for `doc`.
"""
# TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation.
completion = rf.greedy_until(ctx, {'until': ["."]})
completion = rf.greedy_until(ctx, {"until": ["."]})
return completion
def process_results(self, doc, results):
......
......@@ -59,7 +59,7 @@ class WordUnscrambleTask(Task):
return doc["completion"]
def construct_requests(self, doc, ctx):
    """Build the generation request for one document.

    :param doc:
        The document being evaluated (not read here).
    :param ctx: str
        The context string handed to the request factory.
    :returns:
        A single greedy-generation request that stops at the first newline.
    """
    # Construct the request exactly once; a second identical construction
    # whose result is thrown away serves no purpose.
    return rf.greedy_until(ctx, {"until": ["\n"]})
def process_results(self, doc, results):
......
"""
XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
Homepage: https://github.com/cambridgeltl/xcopa
"""
from .superglue import Copa
_CITATION = """
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
author={Edoardo M. Ponti, Goran Glava\v{s}, Olga Majewska, Qianchu Liu, Ivan Vuli\'{c} and Anna Korhonen},
booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2020},
url={https://ducdauge.github.io/files/xcopa.pdf}
}
"""
class XCopa(Copa):
    """Base XCOPA task; language subclasses set ``DATASET_NAME`` and connectors.

    Only the validation and test splits are exposed. The prompt is the
    premise with its final character removed, followed by the connector
    word that matches the document's question type.
    """

    VERSION = 0
    DATASET_PATH = "xcopa"
    DATASET_NAME = None

    # Connector words appended to the premise; overridden per language.
    CAUSE = "because"
    EFFECT = "therefore"

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        return self.dataset["test"]

    def doc_to_text(self, doc):
        """Return the stripped premise minus its last character, then the connector.

        :param doc:
            A document with ``"premise"`` and ``"question"`` entries, where
            ``"question"`` is either ``"cause"`` or ``"effect"``.
        :raises KeyError:
            If ``doc["question"]`` is neither ``"cause"`` nor ``"effect"``.
        """
        question_type = doc["question"]
        if question_type == "cause":
            connector = self.CAUSE
        elif question_type == "effect":
            connector = self.EFFECT
        else:
            raise KeyError(question_type)

        # The last character is dropped unconditionally (intended to remove
        # the final period).
        # NOTE(review): assumes every premise ends in exactly one punctuation
        # character - TODO confirm for all languages.
        stem = doc["premise"].strip()[:-1]
        return f"{stem} {connector}"
class XCopaEt(XCopa):
    """XCOPA configuration ``et`` (Estonian) with its cause/effect connectors."""

    DATASET_NAME = "et"

    # Connector words for this language.
    CAUSE = "sest"
    EFFECT = "seetõttu"
class XCopaHt(XCopa):
    """XCOPA configuration ``ht`` (Haitian Creole) with its cause/effect connectors."""

    DATASET_NAME = "ht"

    # Connector words for this language.
    # NOTE(review): "poukisa" reads like the interrogative "why"; confirm with
    # a speaker that a causal connector such as "paske" was not intended.
    CAUSE = "poukisa"
    EFFECT = "donk sa"
class XCopaIt(XCopa):
    """XCOPA configuration ``it`` (Italian) with its cause/effect connectors."""

    DATASET_NAME = "it"

    # Connector words for this language.
    CAUSE = "perché"
    EFFECT = "quindi"
class XCopaId(XCopa):
    """XCOPA configuration ``id`` (Indonesian) with its cause/effect connectors."""

    DATASET_NAME = "id"

    # Connector words for this language.
    CAUSE = "karena"
    EFFECT = "maka"
class XCopaQu(XCopa):
    """XCOPA configuration ``qu`` (Quechua) with its cause/effect connectors."""

    DATASET_NAME = "qu"

    # Connector words for this language.
    # NOTE(review): "imataq" looks like an interrogative form; confirm with a
    # speaker that it works as a "because" connector.
    CAUSE = "imataq"
    EFFECT = "chaymi"
class XCopaSw(XCopa):
    """XCOPA configuration ``sw`` (Swahili) with its cause/effect connectors."""

    DATASET_NAME = "sw"

    # Connector words for this language.
    CAUSE = "kwa sababu"
    EFFECT = "kwa hiyo"
class XCopaZh(XCopa):
    """XCOPA configuration ``zh`` (Chinese) with its cause/effect connectors."""

    DATASET_NAME = "zh"

    # Connector words for this language.
    CAUSE = "因为"
    EFFECT = "所以"
class XCopaTa(XCopa):
    """XCOPA configuration ``ta`` (Tamil) with its cause/effect connectors."""

    DATASET_NAME = "ta"

    # Connector words for this language.
    CAUSE = "காரணமாக"
    EFFECT = "எனவே"
class XCopaTh(XCopa):
    """XCOPA configuration ``th`` (Thai) with its cause/effect connectors."""

    DATASET_NAME = "th"

    # Connector words for this language.
    CAUSE = "เพราะ"
    EFFECT = "ดังนั้น"
class XCopaTr(XCopa):
    """XCOPA configuration ``tr`` (Turkish) with its cause/effect connectors."""

    DATASET_NAME = "tr"

    # Connector words for this language.
    CAUSE = "çünkü"
    EFFECT = "bu yüzden"
class XCopaVi(XCopa):
    """XCOPA configuration ``vi`` (Vietnamese) with its cause/effect connectors."""

    DATASET_NAME = "vi"

    # Connector words for this language.
    CAUSE = "bởi vì"
    EFFECT = "vì vậy"
# Language codes and their task classes, kept in matching order: entry i of
# LANGS is paired with entry i of LANG_CLASSES.
LANGS = ["et", "ht", "it", "id", "qu", "sw", "zh", "ta", "th", "tr", "vi"]

LANG_CLASSES = [
    XCopaEt,
    XCopaHt,
    XCopaIt,
    XCopaId,
    XCopaQu,
    XCopaSw,
    XCopaZh,
    XCopaTa,
    XCopaTh,
    XCopaTr,
    XCopaVi,
]


def construct_tasks():
    """Map each ``xcopa_<lang>`` task name to its language-specific class.

    :returns: {str: type}
        One entry per (code, class) pair obtained by zipping ``LANGS`` with
        ``LANG_CLASSES``.
    """
    return {
        f"xcopa_{code}": task_cls for code, task_cls in zip(LANGS, LANG_CLASSES)
    }
"""
XNLI: Evaluating Cross-lingual Sentence Representations
https://arxiv.org/abs/1809.05053
Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
Prompt format (same as XGLM and mGPT):
sentence1 + ", right? " + mask = (Yes|Also|No) + ", " + sentence2
Prediction is the full sequence with the highest likelihood.
Language specific prompts are translated word-by-word with Google Translate
and may differ from the ones used by mGPT and XGLM (they do not provide their prompts).
Homepage: https://github.com/facebookresearch/XNLI
"""
import numpy as np
from lm_eval.base import rf, Task
from lm_eval.metrics import mean
_CITATIONS = """
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
class XNLIBase(Task):
    """Shared XNLI logic; each language subclass supplies the prompt words.

    The prompt has the form ``premise, QUESTION_WORD? [MASK], hypothesis``.
    One request is built per candidate label by substituting that label for
    ``[MASK]``, and the candidate with the highest score is the prediction.
    """

    VERSION = 0
    DATASET_PATH = "xnli"
    DATASET_NAME = None

    # Prompt vocabulary: unset here, filled in by the language subclasses.
    # The trailing comments show the English values.
    QUESTION_WORD = None  # 'right'
    ENTAILMENT_LABEL = None  # 'Yes'
    NEUTRAL_LABEL = None  # 'Also'
    CONTRADICTION_LABEL = None  # 'No'

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        return self.dataset["test"]

    def doc_to_text(self, doc):
        """Render the prompt with a literal ``[MASK]`` placeholder.

        English example once the placeholder has been filled in:
        "The girl that can help me is all the way across town, right? Yes,
        The girl I need help from lives a ways away."
        """
        pieces = (
            doc["premise"],
            ", ",
            self.QUESTION_WORD,
            "? [MASK], ",
            doc["hypothesis"],
        )
        return "".join(pieces)

    def doc_to_target(self, doc):
        """Return the gold label word, prefixed with a single space.

        ``doc["label"]`` indexes the labels in the order
        entailment, neutral, contradiction.
        """
        label_words = (
            self.ENTAILMENT_LABEL,
            self.NEUTRAL_LABEL,
            self.CONTRADICTION_LABEL,
        )
        return " " + label_words[doc["label"]]

    def construct_requests(self, doc, ctx):
        """Build one rolling log-likelihood request per candidate label.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :returns:
            A 3-tuple of requests in the order entailment, neutral, contradiction.
        """
        candidates = (
            self.ENTAILMENT_LABEL,
            self.NEUTRAL_LABEL,
            self.CONTRADICTION_LABEL,
        )
        # NOTE(review): str.replace substitutes every "[MASK]" in ctx, so any
        # few-shot examples contained in ctx receive the candidate label as
        # well - TODO confirm this is intended.
        return tuple(
            rf.loglikelihood_rolling(ctx.replace("[MASK]", label))
            for label in candidates
        )

    def process_results(self, doc, results):
        """Score a single document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        :returns:
            ``{"acc": ...}`` where the value is whether the index of the
            largest result equals ``doc["label"]``.
        """
        predicted = np.argmax(results)
        return {"acc": predicted == doc["label"]}

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            Submetric name mapped to the function that aggregates its
            per-document values.
        """
        return {"acc": mean}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            Submetric name mapped to whether a higher value is better.
        """
        return {"acc": True}
class XNLI_en(XNLIBase):
    """XNLI prompt vocabulary for English (``en``)."""

    DATASET_NAME = "en"

    # Words substituted into the prompt template.
    QUESTION_WORD = "right"
    ENTAILMENT_LABEL = "Yes"
    NEUTRAL_LABEL = "Also"
    CONTRADICTION_LABEL = "No"
class XNLI_de(XNLIBase):
    """XNLI prompt vocabulary for German (``de``)."""

    DATASET_NAME = "de"

    # Words substituted into the prompt template.
    QUESTION_WORD = "richtig"
    ENTAILMENT_LABEL = "Ja"
    NEUTRAL_LABEL = "Auch"
    CONTRADICTION_LABEL = "Nein"
class XNLI_ar(XNLIBase):
    """XNLI prompt vocabulary for Arabic (``ar``)."""

    DATASET_NAME = "ar"

    # Words substituted into the prompt template.
    QUESTION_WORD = "صحيح"
    ENTAILMENT_LABEL = "نعم"
    NEUTRAL_LABEL = "لذا"
    # The negative answer word "no". The previous value, "رقم", is the noun
    # "number" (a machine-translation artefact of reading "No" as the
    # abbreviation "No."), which is not a contradiction answer.
    CONTRADICTION_LABEL = "لا"
class XNLI_bg(XNLIBase):
    """XNLI prompt vocabulary for Bulgarian (``bg``)."""

    DATASET_NAME = "bg"

    # Words substituted into the prompt template.
    QUESTION_WORD = "правилно"
    ENTAILMENT_LABEL = "да"
    NEUTRAL_LABEL = "така"
    CONTRADICTION_LABEL = "не"
class XNLI_el(XNLIBase):
    """XNLI prompt vocabulary for Greek (``el``)."""

    DATASET_NAME = "el"

    # Words substituted into the prompt template.
    QUESTION_WORD = "σωστός"
    ENTAILMENT_LABEL = "Ναί"
    NEUTRAL_LABEL = "Έτσι"
    CONTRADICTION_LABEL = "όχι"
class XNLI_es(XNLIBase):
    """XNLI prompt vocabulary for Spanish (``es``)."""

    DATASET_NAME = "es"

    # Words substituted into the prompt template.
    QUESTION_WORD = "correcto"
    ENTAILMENT_LABEL = "Sí"
    NEUTRAL_LABEL = "Asi que"
    CONTRADICTION_LABEL = "No"
class XNLI_fr(XNLIBase):
    """XNLI prompt vocabulary for French (``fr``)."""

    DATASET_NAME = "fr"

    # Words substituted into the prompt template.
    QUESTION_WORD = "correct"
    ENTAILMENT_LABEL = "Oui"
    NEUTRAL_LABEL = "Aussi"
    CONTRADICTION_LABEL = "Non"
class XNLI_hi(XNLIBase):
    """XNLI prompt vocabulary for Hindi (``hi``)."""

    DATASET_NAME = "hi"

    # Words substituted into the prompt template.
    QUESTION_WORD = "सही"
    ENTAILMENT_LABEL = "हाँ"
    NEUTRAL_LABEL = "इसलिए"
    CONTRADICTION_LABEL = "नहीं"
class XNLI_ru(XNLIBase):
    """XNLI prompt vocabulary for Russian (``ru``)."""

    DATASET_NAME = "ru"

    # Words substituted into the prompt template.
    QUESTION_WORD = "правильно"
    ENTAILMENT_LABEL = "Да"
    NEUTRAL_LABEL = "Так"
    CONTRADICTION_LABEL = "Нет"
class XNLI_sw(XNLIBase):
    """XNLI prompt vocabulary for Swahili (``sw``)."""

    DATASET_NAME = "sw"

    # Words substituted into the prompt template.
    QUESTION_WORD = "sahihi"
    ENTAILMENT_LABEL = "Ndiyo"
    NEUTRAL_LABEL = "Hivyo"
    CONTRADICTION_LABEL = "Hapana"
class XNLI_th(XNLIBase):
    """XNLI prompt vocabulary for Thai (``th``)."""

    DATASET_NAME = "th"

    # Words substituted into the prompt template.
    QUESTION_WORD = "ถูกต้อง"
    ENTAILMENT_LABEL = "ใช่"
    NEUTRAL_LABEL = "ดังนั้น"
    CONTRADICTION_LABEL = "ไม่"
class XNLI_tr(XNLIBase):
    """XNLI prompt vocabulary for Turkish (``tr``)."""

    DATASET_NAME = "tr"

    # Words substituted into the prompt template.
    QUESTION_WORD = "doğru"
    ENTAILMENT_LABEL = "Evet"
    NEUTRAL_LABEL = "Böylece"
    CONTRADICTION_LABEL = "Hayır"
class XNLI_ur(XNLIBase):
    """XNLI prompt vocabulary for Urdu (``ur``)."""

    DATASET_NAME = "ur"

    # Words substituted into the prompt template.
    QUESTION_WORD = "صحیح"
    ENTAILMENT_LABEL = "جی ہاں"
    NEUTRAL_LABEL = "اس لئے"
    CONTRADICTION_LABEL = "نہیں"
class XNLI_vi(XNLIBase):
    """XNLI prompt vocabulary for Vietnamese (``vi``)."""

    DATASET_NAME = "vi"

    # Words substituted into the prompt template.
    QUESTION_WORD = "đúng"
    ENTAILMENT_LABEL = "Vâng"
    NEUTRAL_LABEL = "Vì vậy"
    CONTRADICTION_LABEL = "Không"
class XNLI_zh(XNLIBase):
    """XNLI prompt vocabulary for Chinese (``zh``)."""

    DATASET_NAME = "zh"

    # Words substituted into the prompt template.
    QUESTION_WORD = "正确"
    ENTAILMENT_LABEL = "是的"
    NEUTRAL_LABEL = "所以"
    CONTRADICTION_LABEL = "不是的"
# Language codes and their task classes, kept in matching order: entry i of
# LANGS is paired with entry i of LANG_CLASSES.
LANGS = [
    "ar",
    "bg",
    "de",
    "el",
    "en",
    "es",
    "fr",
    "hi",
    "ru",
    "sw",
    "th",
    "tr",
    "ur",
    "vi",
    "zh",
]

LANG_CLASSES = [
    XNLI_ar,
    XNLI_bg,
    XNLI_de,
    XNLI_el,
    XNLI_en,
    XNLI_es,
    XNLI_fr,
    XNLI_hi,
    XNLI_ru,
    XNLI_sw,
    XNLI_th,
    XNLI_tr,
    XNLI_ur,
    XNLI_vi,
    XNLI_zh,
]


def construct_tasks():
    """Map each ``xnli_<lang>`` task name to its language-specific class.

    :returns: {str: type}
        One entry per (code, class) pair obtained by zipping ``LANGS`` with
        ``LANG_CLASSES``.
    """
    return {
        f"xnli_{code}": task_cls for code, task_cls in zip(LANGS, LANG_CLASSES)
    }
"""
Few-shot Learning with Multilingual Language Models
https://arxiv.org/abs/2112.10668
XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
"""
from .storycloze import StoryCloze
_CITATION = """
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
# Dataset configurations (language codes) for which a task is registered.
_LANG = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]


def create_all_tasks():
    """Create one XStoryCloze task class per language code in ``_LANG``.

    :return: {task_name: task}
        Keys have the form ``xstory_cloze_<lang>``.
    """
    registry = {}
    for lang in _LANG:
        registry[f"xstory_cloze_{lang}"] = create_task(lang)
    return registry
def create_task(lang):
    """Return a ``StoryCloze`` subclass bound to the ``lang`` configuration.

    :param lang:
        Value stored as the class's ``DATASET_NAME``.
    :return:
        A new ``XStoryCloze`` class exposing the ``train`` split as training
        docs and the ``eval`` split as validation docs; it has no test docs.
    """

    class XStoryCloze(StoryCloze):
        DATASET_PATH = "juletxara/xstory_cloze"
        DATASET_NAME = lang

        def __init__(self):
            # The parent constructor is given an empty data_dir.
            super().__init__(data_dir="")

        def has_training_docs(self):
            return True

        def has_validation_docs(self):
            return True

        def has_test_docs(self):
            return False

        def training_docs(self):
            return self.dataset["train"]

        def validation_docs(self):
            return self.dataset["eval"]

        def test_docs(self):
            # No test split is exposed.
            return None

    return XStoryCloze
"""
It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning
https://arxiv.org/abs/2106.12066
Multilingual winograd schema challenge that includes English, French, Japanese, Portuguese, Russian and Chinese. Winograd schema challenges come from the XWinograd dataset introduced in Tikhonov et al. As it only contains 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
Homepage: https://huggingface.co/datasets/Muennighoff/xwinograd
"""
from .winogrande import Winogrande
_CITATION = """
@misc{muennighoff2022crosslingual,
title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
year={2022},
eprint={2211.01786},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{tikhonov2021heads,
title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
author={Alexey Tikhonov and Max Ryabinin},
year={2021},
eprint={2106.12066},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
# Dataset configurations (language codes) for which a task is registered.
_LANG = ["en", "fr", "jp", "pt", "ru", "zh"]


def create_all_tasks():
    """Create one XWinograd task class per language code in ``_LANG``.

    :return: {task_name: task}
        Keys have the form ``xwinograd_<lang>``.
    """
    registry = {}
    for lang in _LANG:
        registry[f"xwinograd_{lang}"] = create_task(lang)
    return registry
def create_task(lang):
    """Return a ``Winogrande`` subclass bound to the ``lang`` configuration.

    :param lang:
        Value stored as the class's ``DATASET_NAME``.
    :return:
        A new ``XWinograd`` class that exposes only the ``test`` split.
    """

    class XWinograd(Winogrande):
        DATASET_PATH = "Muennighoff/xwinograd"
        DATASET_NAME = lang

        def __init__(self):
            super().__init__()

        def has_training_docs(self):
            return False

        def has_validation_docs(self):
            return False

        def has_test_docs(self):
            return True

        def training_docs(self):
            # No training split is exposed.
            return None

        def validation_docs(self):
            # No validation split is exposed.
            return None

        def test_docs(self):
            return self.dataset["test"]

    return XWinograd
......@@ -223,4 +223,3 @@ def run_task_tests(task_list: List[str]):
raise ValueError(
f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
)
......@@ -38,7 +38,7 @@ setuptools.setup(
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
"accelerate>=0.17.1"
"accelerate>=0.17.1",
],
extras_require={
"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment