Add multilingual datasets (XCOPA, XStoryCloze, XWinograd, PAWS-X, XNLI, MGSM) (#426)

* add xcopa dataset * add xstory_cloze dataset and run pre-commit * fix xcopa validation and test sets * add xwinograd dataset * add pawsx task * add xnli task * update task table with recently added tasks * remove unused metrics from paws-x * add mgsm task and fix gsm8k * fix gsm8k until * update task table

Add multilingual datasets (XCOPA, XStoryCloze, XWinograd, PAWS-X, XNLI, MGSM) (#426)
* add xcopa dataset * add xstory_cloze dataset and run pre-commit * fix xcopa validation and test sets * add xwinograd dataset * add pawsx task * add xnli task * update task table with recently added tasks * remove unused metrics from paws-x * add mgsm task and fix gsm8k * fix gsm8k until * update task table
d1451679 · Julen Etxaniz · GitHub · 05550ef3 · d1451679 · d1451679
Unverified Commit d1451679 authored May 11, 2023 by Julen Etxaniz Committed by GitHub May 11, 2023
8 changed files
--- a/lm_eval/tasks/truthfulqa.py
+++ b/lm_eval/tasks/truthfulqa.py
@@ -247,7 +247,7 @@ class TruthfulQAGeneration(Task):
            part of the document for `doc`.
        """
        # TODO: Find a way to cap the number of generated tokens to `50` as in the official implementation.
-        completion = rf.greedy_until(ctx, {'until': ["."]})
+        completion = rf.greedy_until(ctx, {"until": ["."]})
        return completion

    def process_results(self, doc, results):

--- a/lm_eval/tasks/unscramble.py
+++ b/lm_eval/tasks/unscramble.py
@@ -59,7 +59,7 @@ class WordUnscrambleTask(Task):
        return doc["completion"]

    def construct_requests(self, doc, ctx):
-        completion = rf.greedy_until(ctx, {'until': ["\n"]})
+        completion = rf.greedy_until(ctx, {"until": ["\n"]})
        return completion

    def process_results(self, doc, results):

--- a/lm_eval/tasks/xcopa.py
+++ b/lm_eval/tasks/xcopa.py
+"""
+XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
+https://ducdauge.github.io/files/xcopa.pdf
+
+The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
+The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
+The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
+All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
+
+Homepage: https://github.com/cambridgeltl/xcopa
+"""
+from .superglue import Copa
+
+
+# Raw string on purpose: the BibTeX accent macros \v{s} and \'{c} must be kept
+# verbatim. In a normal literal "\v" is the vertical-tab escape and "\'"
+# silently drops its backslash, which corrupts the author names.
+_CITATION = r"""
+@inproceedings{ponti2020xcopa,
+  title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
+  author={Edoardo M. Ponti, Goran Glava\v{s}, Olga Majewska, Qianchu Liu, Ivan Vuli\'{c} and Anna Korhonen},
+  booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
+  year={2020},
+  url={https://ducdauge.github.io/files/xcopa.pdf}
+}
+"""
+
+
+class XCopa(Copa):
+    """XCOPA task built on `Copa`, using only the validation and test splits.
+
+    Subclasses set `DATASET_NAME` and override the `CAUSE` / `EFFECT`
+    connector words that `doc_to_text` appends to the premise.
+    """
+
+    VERSION = 0
+    DATASET_PATH = "xcopa"
+    DATASET_NAME = None
+    # Connector words selected by doc["question"] ("cause" / "effect").
+    CAUSE = "because"
+    EFFECT = "therefore"
+
+    def has_training_docs(self):
+        return False
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]
+
+    def doc_to_text(self, doc):
+        """Return the stripped premise minus its last character, then a space and the connector."""
+        # Drop the period
+        # NOTE(review): [:-1] removes the final character unconditionally; this
+        # assumes every premise ends in exactly one sentence-final mark — confirm.
+        connector = {
+            "cause": self.CAUSE,
+            "effect": self.EFFECT,
+        }[doc["question"]]
+        return doc["premise"].strip()[:-1] + f" {connector}"
+
+
+class XCopaEt(XCopa):  # Estonian
+    DATASET_NAME = "et"
+    CAUSE = "sest"
+    EFFECT = "seetõttu"
+
+
+class XCopaHt(XCopa):  # Haitian Creole
+    DATASET_NAME = "ht"
+    CAUSE = "poukisa"
+    EFFECT = "donk sa"
+
+
+class XCopaIt(XCopa):  # Italian
+    DATASET_NAME = "it"
+    CAUSE = "perché"
+    EFFECT = "quindi"
+
+
+class XCopaId(XCopa):  # Indonesian
+    DATASET_NAME = "id"
+    CAUSE = "karena"
+    EFFECT = "maka"
+
+
+class XCopaQu(XCopa):  # Quechua
+    DATASET_NAME = "qu"
+    CAUSE = "imataq"
+    EFFECT = "chaymi"
+
+
+class XCopaSw(XCopa):  # Swahili
+    DATASET_NAME = "sw"
+    CAUSE = "kwa sababu"
+    EFFECT = "kwa hiyo"
+
+
+class XCopaZh(XCopa):  # Chinese
+    DATASET_NAME = "zh"
+    CAUSE = "因为"
+    EFFECT = "所以"
+
+
+class XCopaTa(XCopa):  # Tamil
+    DATASET_NAME = "ta"
+    CAUSE = "காரணமாக"
+    EFFECT = "எனவே"
+
+
+class XCopaTh(XCopa):  # Thai
+    DATASET_NAME = "th"
+    CAUSE = "เพราะ"
+    EFFECT = "ดังนั้น"
+
+
+class XCopaTr(XCopa):  # Turkish
+    DATASET_NAME = "tr"
+    CAUSE = "çünkü"
+    EFFECT = "bu yüzden"
+
+
+class XCopaVi(XCopa):  # Vietnamese
+    DATASET_NAME = "vi"
+    CAUSE = "bởi vì"
+    EFFECT = "vì vậy"
+
+
+# LANGS and LANG_CLASSES are parallel lists: construct_tasks() zips them, so
+# entry i of one must correspond to entry i of the other.
+LANGS = ["et", "ht", "it", "id", "qu", "sw", "zh", "ta", "th", "tr", "vi"]
+
+LANG_CLASSES = [
+    XCopaEt,
+    XCopaHt,
+    XCopaIt,
+    XCopaId,
+    XCopaQu,
+    XCopaSw,
+    XCopaZh,
+    XCopaTa,
+    XCopaTh,
+    XCopaTr,
+    XCopaVi,
+]
+
+
+def construct_tasks():
+    """Return a dict mapping each ``xcopa_<lang>`` task name to its task class."""
+    return {
+        f"xcopa_{code}": task_cls for code, task_cls in zip(LANGS, LANG_CLASSES)
+    }
--- a/lm_eval/tasks/xnli.py
+++ b/lm_eval/tasks/xnli.py
+"""
+XNLI: Evaluating Cross-lingual Sentence Representations
+https://arxiv.org/abs/1809.05053
+
+Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
+
+Prompt format (same as XGLM and mGPT):
+
+sentence1 + ", right? " + mask = (Yes|Also|No) + ", " + sentence2
+
+Prediction is the full sequence with the highest likelihood.
+
+Language specific prompts are translated word-by-word with Google Translate
+and may differ from the ones used by mGPT and XGLM (they do not provide their prompts).
+
+Homepage: https://github.com/facebookresearch/XNLI
+"""
+import numpy as np
+from lm_eval.base import rf, Task
+from lm_eval.metrics import mean
+
+_CITATIONS = """
+@InProceedings{conneau2018xnli,
+  author = "Conneau, Alexis
+        and Rinott, Ruty
+        and Lample, Guillaume
+        and Williams, Adina
+        and Bowman, Samuel R.
+        and Schwenk, Holger
+        and Stoyanov, Veselin",
+  title = "XNLI: Evaluating Cross-lingual Sentence Representations",
+  booktitle = "Proceedings of the 2018 Conference on Empirical Methods
+               in Natural Language Processing",
+  year = "2018",
+  publisher = "Association for Computational Linguistics",
+  location = "Brussels, Belgium",
+}
+"""
+
+
+class XNLIBase(Task):
+    """Base XNLI task; language subclasses supply the dataset config and prompt words.
+
+    Each candidate label word is substituted for the "[MASK]" placeholder in the
+    prompt, and the candidate whose request scores highest is the prediction.
+    """
+
+    VERSION = 0
+    DATASET_PATH = "xnli"
+    DATASET_NAME = None
+
+    # Prompt vocabulary, overridden per language (English values shown).
+    QUESTION_WORD = None  # 'right'
+    ENTAILMENT_LABEL = None  # 'Yes'
+    NEUTRAL_LABEL = None  # 'Also'
+    CONTRADICTION_LABEL = None  # 'No'
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        return self.dataset["train"]
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]
+
+    def doc_to_text(self, doc):
+        """Build the prompt "<premise>, <QUESTION_WORD>? [MASK], <hypothesis>"."""
+        # Example:
+        # The girl that can help me is all the way across town, right? Yes, The girl I need help from lives a ways away.
+        # [MASK] is replaced with ENTAILMENT_LABEL, NEUTRAL_LABEL, or CONTRADICTION_LABEL
+        return (
+            doc["premise"]
+            + ", "
+            + self.QUESTION_WORD
+            + "? [MASK], "
+            + doc["hypothesis"]
+        )
+
+    def doc_to_target(self, doc):
+        """Return a space followed by the label word selected by doc["label"]."""
+        # True = entailment
+        # False = contradiction
+        # Neither = neutral
+        # doc["label"] indexes the list below: 0 -> entailment, 1 -> neutral,
+        # 2 -> contradiction.
+        return (
+            " "
+            + [self.ENTAILMENT_LABEL, self.NEUTRAL_LABEL, self.CONTRADICTION_LABEL][
+                doc["label"]
+            ]
+        )
+
+    def construct_requests(self, doc, ctx):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        # One request per candidate label, in the order entailment, neutral,
+        # contradiction (the same order process_results relies on).
+        # NOTE(review): str.replace substitutes every "[MASK]" in ctx. If ctx
+        # contains few-shot examples built with doc_to_text, their placeholders
+        # are overwritten with the candidate label too — confirm this is intended.
+        ll_true = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.ENTAILMENT_LABEL))
+        ll_neither = rf.loglikelihood_rolling(ctx.replace("[MASK]", self.NEUTRAL_LABEL))
+        ll_false = rf.loglikelihood_rolling(
+            ctx.replace("[MASK]", self.CONTRADICTION_LABEL)
+        )
+
+        return ll_true, ll_neither, ll_false
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+        gold = doc["label"]
+        # Index of the highest score; assumes results arrive in the order
+        # returned by construct_requests — TODO confirm.
+        pred = np.argmax(results)
+        return {"acc": pred == gold}
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        return {"acc": mean}
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {"acc": True}
+
+
+# Per-language prompt vocabulary: QUESTION_WORD is inserted into the prompt by
+# XNLIBase.doc_to_text, and the three *_LABEL words are the candidates
+# substituted for "[MASK]" in XNLIBase.construct_requests.
+class XNLI_en(XNLIBase):  # English
+    DATASET_NAME = "en"
+
+    QUESTION_WORD = "right"
+    ENTAILMENT_LABEL = "Yes"
+    NEUTRAL_LABEL = "Also"
+    CONTRADICTION_LABEL = "No"
+
+
+class XNLI_de(XNLIBase):  # German
+    DATASET_NAME = "de"
+
+    QUESTION_WORD = "richtig"
+    ENTAILMENT_LABEL = "Ja"
+    NEUTRAL_LABEL = "Auch"
+    CONTRADICTION_LABEL = "Nein"
+
+
+class XNLI_ar(XNLIBase):  # Arabic
+    DATASET_NAME = "ar"
+
+    QUESTION_WORD = "صحيح"
+    ENTAILMENT_LABEL = "نعم"
+    NEUTRAL_LABEL = "لذا"
+    # "لا" is the Arabic negation "no". The previous value "رقم" means
+    # "number" (a machine-translation artifact of the abbreviation "No.").
+    # NOTE(review): xnli_ar scores are not comparable with runs that used the
+    # old label; consider bumping VERSION.
+    CONTRADICTION_LABEL = "لا"
+
+
+class XNLI_bg(XNLIBase):  # Bulgarian
+    DATASET_NAME = "bg"
+
+    QUESTION_WORD = "правилно"
+    ENTAILMENT_LABEL = "да"
+    NEUTRAL_LABEL = "така"
+    CONTRADICTION_LABEL = "не"
+
+
+class XNLI_el(XNLIBase):  # Greek
+    DATASET_NAME = "el"
+
+    QUESTION_WORD = "σωστός"
+    ENTAILMENT_LABEL = "Ναί"
+    NEUTRAL_LABEL = "Έτσι"
+    CONTRADICTION_LABEL = "όχι"
+
+
+class XNLI_es(XNLIBase):  # Spanish
+    DATASET_NAME = "es"
+
+    QUESTION_WORD = "correcto"
+    ENTAILMENT_LABEL = "Sí"
+    # NOTE(review): written without the accent ("Así que") — confirm whether
+    # that is intended before changing, since it alters the prompt.
+    NEUTRAL_LABEL = "Asi que"
+    CONTRADICTION_LABEL = "No"
+
+
+class XNLI_fr(XNLIBase):  # French
+    DATASET_NAME = "fr"
+
+    QUESTION_WORD = "correct"
+    ENTAILMENT_LABEL = "Oui"
+    NEUTRAL_LABEL = "Aussi"
+    CONTRADICTION_LABEL = "Non"
+
+
+class XNLI_hi(XNLIBase):  # Hindi
+    DATASET_NAME = "hi"
+
+    QUESTION_WORD = "सही"
+    ENTAILMENT_LABEL = "हाँ"
+    NEUTRAL_LABEL = "इसलिए"
+    CONTRADICTION_LABEL = "नहीं"
+
+
+class XNLI_ru(XNLIBase):  # Russian
+    DATASET_NAME = "ru"
+
+    QUESTION_WORD = "правильно"
+    ENTAILMENT_LABEL = "Да"
+    NEUTRAL_LABEL = "Так"
+    CONTRADICTION_LABEL = "Нет"
+
+
+class XNLI_sw(XNLIBase):  # Swahili
+    DATASET_NAME = "sw"
+
+    QUESTION_WORD = "sahihi"
+    ENTAILMENT_LABEL = "Ndiyo"
+    NEUTRAL_LABEL = "Hivyo"
+    CONTRADICTION_LABEL = "Hapana"
+
+
+class XNLI_th(XNLIBase):  # Thai
+    DATASET_NAME = "th"
+
+    QUESTION_WORD = "ถูกต้อง"
+    ENTAILMENT_LABEL = "ใช่"
+    NEUTRAL_LABEL = "ดังนั้น"
+    CONTRADICTION_LABEL = "ไม่"
+
+
+class XNLI_tr(XNLIBase):  # Turkish
+    DATASET_NAME = "tr"
+
+    QUESTION_WORD = "doğru"
+    ENTAILMENT_LABEL = "Evet"
+    NEUTRAL_LABEL = "Böylece"
+    CONTRADICTION_LABEL = "Hayır"
+
+
+class XNLI_ur(XNLIBase):  # Urdu
+    DATASET_NAME = "ur"
+
+    QUESTION_WORD = "صحیح"
+    ENTAILMENT_LABEL = "جی ہاں"
+    NEUTRAL_LABEL = "اس لئے"
+    CONTRADICTION_LABEL = "نہیں"
+
+
+class XNLI_vi(XNLIBase):  # Vietnamese
+    DATASET_NAME = "vi"
+
+    QUESTION_WORD = "đúng"
+    ENTAILMENT_LABEL = "Vâng"
+    NEUTRAL_LABEL = "Vì vậy"
+    CONTRADICTION_LABEL = "Không"
+
+
+class XNLI_zh(XNLIBase):  # Chinese
+    DATASET_NAME = "zh"
+
+    QUESTION_WORD = "正确"
+    ENTAILMENT_LABEL = "是的"
+    NEUTRAL_LABEL = "所以"
+    CONTRADICTION_LABEL = "不是的"
+
+
+# LANGS and LANG_CLASSES are parallel lists: construct_tasks() zips them, so
+# entry i of one must correspond to entry i of the other.
+LANGS = [
+    "ar",
+    "bg",
+    "de",
+    "el",
+    "en",
+    "es",
+    "fr",
+    "hi",
+    "ru",
+    "sw",
+    "th",
+    "tr",
+    "ur",
+    "vi",
+    "zh",
+]
+
+LANG_CLASSES = [
+    XNLI_ar,
+    XNLI_bg,
+    XNLI_de,
+    XNLI_el,
+    XNLI_en,
+    XNLI_es,
+    XNLI_fr,
+    XNLI_hi,
+    XNLI_ru,
+    XNLI_sw,
+    XNLI_th,
+    XNLI_tr,
+    XNLI_ur,
+    XNLI_vi,
+    XNLI_zh,
+]
+
+
+def construct_tasks():
+    """Return a dict mapping each ``xnli_<lang>`` task name to its task class."""
+    return {
+        f"xnli_{code}": task_cls for code, task_cls in zip(LANGS, LANG_CLASSES)
+    }
--- a/lm_eval/tasks/xstorycloze.py
+++ b/lm_eval/tasks/xstorycloze.py
+"""
+Few-shot Learning with Multilingual Language Models
+https://arxiv.org/abs/2112.10668
+
+XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.
+Homepage: https://github.com/facebookresearch/fairseq/pull/4820
+"""
+from .storycloze import StoryCloze
+
+
+_CITATION = """
+@article{DBLP:journals/corr/abs-2112-10668,
+  author    = {Xi Victoria Lin and
+               Todor Mihaylov and
+               Mikel Artetxe and
+               Tianlu Wang and
+               Shuohui Chen and
+               Daniel Simig and
+               Myle Ott and
+               Naman Goyal and
+               Shruti Bhosale and
+               Jingfei Du and
+               Ramakanth Pasunuru and
+               Sam Shleifer and
+               Punit Singh Koura and
+               Vishrav Chaudhary and
+               Brian O'Horo and
+               Jeff Wang and
+               Luke Zettlemoyer and
+               Zornitsa Kozareva and
+               Mona T. Diab and
+               Veselin Stoyanov and
+               Xian Li},
+  title     = {Few-shot Learning with Multilingual Language Models},
+  journal   = {CoRR},
+  volume    = {abs/2112.10668},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2112.10668},
+  eprinttype = {arXiv},
+  eprint    = {2112.10668},
+  timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+"""
+
+_LANG = ["en", "ru", "zh", "es", "ar", "hi", "id", "te", "sw", "eu", "my"]
+
+
+def create_all_tasks():
+    """Creates a dictionary of tasks, one per language code in `_LANG`
+    :return: {task_name: task}, with task names of the form `xstory_cloze_<lang>`
+    """
+    return {f"xstory_cloze_{lang}": create_task(lang) for lang in _LANG}
+
+
+def create_task(lang):
+    """Build and return a `StoryCloze` subclass whose DATASET_NAME is `lang`.
+
+    :param lang: language code used as the dataset configuration name.
+    :return: the newly defined task class (not an instance).
+    """
+
+    class XStoryCloze(StoryCloze):
+        DATASET_PATH = "juletxara/xstory_cloze"
+        DATASET_NAME = lang
+
+        def __init__(self):
+            # NOTE(review): presumably the parent expects a data_dir for a
+            # manually downloaded dataset and "" sidesteps that — confirm
+            # against StoryCloze.__init__.
+            super().__init__(data_dir="")
+
+        def has_training_docs(self):
+            return True
+
+        def has_validation_docs(self):
+            return True
+
+        def has_test_docs(self):
+            return False
+
+        def training_docs(self):
+            return self.dataset["train"]
+
+        def validation_docs(self):
+            # The validation documents are read from the split keyed "eval".
+            return self.dataset["eval"]
+
+        def test_docs(self):
+            # No test split is exposed (has_test_docs is False); returns None.
+            pass
+
+    return XStoryCloze
--- a/lm_eval/tasks/xwinograd.py
+++ b/lm_eval/tasks/xwinograd.py
+"""
+It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning
+https://arxiv.org/abs/2106.12066
+
+Multilingual winograd schema challenge that includes English, French, Japanese, Portuguese, Russian and Chinese. Winograd schema challenges come from the XWinograd dataset introduced in Tikhonov et al. As it only contains 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
+
+Homepage: https://huggingface.co/datasets/Muennighoff/xwinograd
+"""
+from .winogrande import Winogrande
+
+
+_CITATION = """
+@misc{muennighoff2022crosslingual,
+      title={Crosslingual Generalization through Multitask Finetuning},
+      author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
+      year={2022},
+      eprint={2211.01786},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{tikhonov2021heads,
+    title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
+    author={Alexey Tikhonov and Max Ryabinin},
+    year={2021},
+    eprint={2106.12066},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+"""
+
+_LANG = ["en", "fr", "jp", "pt", "ru", "zh"]
+
+
+def create_all_tasks():
+    """Creates a dictionary of tasks, one per language code in `_LANG`
+    :return: {task_name: task}, with task names of the form `xwinograd_<lang>`
+    """
+    return {f"xwinograd_{lang}": create_task(lang) for lang in _LANG}
+
+
+def create_task(lang):
+    """Build and return a `Winogrande` subclass whose DATASET_NAME is `lang`.
+
+    :param lang: language code used as the dataset configuration name.
+    :return: the newly defined task class (not an instance).
+    """
+
+    class XWinograd(Winogrande):
+        DATASET_PATH = "Muennighoff/xwinograd"
+        DATASET_NAME = lang
+
+        def __init__(self):
+            super().__init__()
+
+        def has_training_docs(self):
+            return False
+
+        def has_validation_docs(self):
+            return False
+
+        def has_test_docs(self):
+            return True
+
+        def training_docs(self):
+            # No training split is exposed (has_training_docs is False); returns None.
+            pass
+
+        def validation_docs(self):
+            # No validation split is exposed (has_validation_docs is False); returns None.
+            pass
+
+        def test_docs(self):
+            return self.dataset["test"]
+
+    return XWinograd
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -223,4 +223,3 @@ def run_task_tests(task_list: List[str]):
        raise ValueError(
            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
        )
-
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ setuptools.setup(
        "tqdm-multiprocess",
        "transformers>=4.1",
        "zstandard",
-        "accelerate>=0.17.1"
+        "accelerate>=0.17.1",
    ],
    extras_require={
        "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],