Merge pull request #776 from EleutherAI/xnli

[Refactor] XNLI

Merge pull request #776 from EleutherAI/xnli
[Refactor] XNLI
f918c8fd · Lintang Sutawika · GitHub · 054ed37f · 1768f118 · f918c8fd
Unverified Commit f918c8fd authored Aug 14, 2023 by Lintang Sutawika Committed by GitHub Aug 14, 2023
19 changed files
--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -55,7 +55,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] XStoryCloze (Lintang)
 - [x] XWinograd
 - [ ] PAWS-X (Lintang)
- [ ] XNLI (Lintang)
+- [x] XNLI
 - [ ] MGSM (Lintang)
 - [ ] SCROLLS
 - [x] Babi

--- a/lm_eval/tasks/xnli/README.md
+++ b/lm_eval/tasks/xnli/README.md
+# XNLI
+
+### Paper
+
+Title: `XNLI: Evaluating Cross-lingual Sentence Representations`
+
+Abstract: https://arxiv.org/abs/1809.05053
+
+Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
+
+Prompt format (same as XGLM and mGPT):
+
+sentence1 + ", right? " + mask = (Yes|Also|No) + ", " + sentence2
+
+Prediction is the full sequence with the highest likelihood.
+
+Language-specific prompts are translated word-by-word with Google Translate
+and may differ from the ones used by mGPT and XGLM (which do not provide their prompts).
+
+Homepage: https://github.com/facebookresearch/XNLI
+
+
+### Citation
+
+```bibtex
+@InProceedings{conneau2018xnli,
+  author = "Conneau, Alexis
+        and Rinott, Ruty
+        and Lample, Guillaume
+        and Williams, Adina
+        and Bowman, Samuel R.
+        and Schwenk, Holger
+        and Stoyanov, Veselin",
+  title = "XNLI: Evaluating Cross-lingual Sentence Representations",
+  booktitle = "Proceedings of the 2018 Conference on Empirical Methods
+               in Natural Language Processing",
+  year = "2018",
+  publisher = "Association for Computational Linguistics",
+  location = "Brussels, Belgium",
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `xnli`
+
+#### Tasks
+
+* `xnli_ar`: Arabic
+* `xnli_bg`: Bulgarian
+* `xnli_de`: German
+* `xnli_el`: Greek
+* `xnli_en`: English
+* `xnli_es`: Spanish
+* `xnli_fr`: French
+* `xnli_hi`: Hindi
+* `xnli_ru`: Russian
+* `xnli_sw`: Swahili
+* `xnli_th`: Thai
+* `xnli_tr`: Turkish
+* `xnli_ur`: Urdu
+* `xnli_vi`: Vietnamese
+* `xnli_zh`: Chinese
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/xnli/utils.py
+++ b/lm_eval/tasks/xnli/utils.py
+import argparse
+from typing import Dict, List
+
+import yaml
+
+
+# Different languages that are part of xnli.
+# These correspond to dataset names (Subsets) on HuggingFace.
+# A yaml file is generated by this script for each language.
+#
+# Each entry maps a language code to the four words that gen_lang_yamls()
+# interpolates into the prompt "<premise>, <QUESTION_WORD>? <LABEL>, <hypothesis>":
+#   QUESTION_WORD        - the tag-question word (English: "right")
+#   ENTAILMENT_LABEL     - connective used for the first choice (English: "Yes")
+#   NEUTRAL_LABEL        - connective used for the second choice (English: "Also")
+#   CONTRADICTION_LABEL  - connective used for the third choice (English: "No")
+#
+# NOTE(review): the Arabic CONTRADICTION_LABEL appears to mean "number" (as in the
+# abbreviation "No.") rather than the negation "no" - possibly a machine-translation
+# artifact; confirm with a native speaker before changing, since it alters the prompt.
+
+LANGUAGES = {
+    "ar": {  # Arabic
+        "QUESTION_WORD": "صحيح",
+        "ENTAILMENT_LABEL": "نعم",
+        "NEUTRAL_LABEL": "لذا",
+        "CONTRADICTION_LABEL": "رقم",
+    },
+    "bg": {  # Bulgarian
+        "QUESTION_WORD": "правилно",
+        "ENTAILMENT_LABEL": "да",
+        "NEUTRAL_LABEL": "така",
+        "CONTRADICTION_LABEL": "не",
+    },
+    "de": {  # German
+        "QUESTION_WORD": "richtig",
+        "ENTAILMENT_LABEL": "Ja",
+        "NEUTRAL_LABEL": "Auch",
+        "CONTRADICTION_LABEL": "Nein",
+    },
+    "el": {  # Greek
+        "QUESTION_WORD": "σωστός",
+        "ENTAILMENT_LABEL": "Ναί",
+        "NEUTRAL_LABEL": "Έτσι",
+        "CONTRADICTION_LABEL": "όχι",
+    },
+    "en": {  # English
+        "QUESTION_WORD": "right",
+        "ENTAILMENT_LABEL": "Yes",
+        "NEUTRAL_LABEL": "Also",
+        "CONTRADICTION_LABEL": "No",
+    },
+    "es": {  # Spanish
+        "QUESTION_WORD": "correcto",
+        "ENTAILMENT_LABEL": "Sí",
+        "NEUTRAL_LABEL": "Asi que",
+        "CONTRADICTION_LABEL": "No",
+    },
+    "fr": {  # French
+        "QUESTION_WORD": "correct",
+        "ENTAILMENT_LABEL": "Oui",
+        "NEUTRAL_LABEL": "Aussi",
+        "CONTRADICTION_LABEL": "Non",
+    },
+    "hi": {  # Hindi
+        "QUESTION_WORD": "सही",
+        "ENTAILMENT_LABEL": "हाँ",
+        "NEUTRAL_LABEL": "इसलिए",
+        "CONTRADICTION_LABEL": "नहीं",
+    },
+    "ru": {  # Russian
+        "QUESTION_WORD": "правильно",
+        "ENTAILMENT_LABEL": "Да",
+        "NEUTRAL_LABEL": "Так",
+        "CONTRADICTION_LABEL": "Нет",
+    },
+    "sw": {  # Swahili
+        "QUESTION_WORD": "sahihi",
+        "ENTAILMENT_LABEL": "Ndiyo",
+        "NEUTRAL_LABEL": "Hivyo",
+        "CONTRADICTION_LABEL": "Hapana",
+    },
+    "th": {  # Thai
+        "QUESTION_WORD": "ถูกต้อง",
+        "ENTAILMENT_LABEL": "ใช่",
+        "NEUTRAL_LABEL": "ดังนั้น",
+        "CONTRADICTION_LABEL": "ไม่",
+    },
+    "tr": {  # Turkish
+        "QUESTION_WORD": "doğru",
+        "ENTAILMENT_LABEL": "Evet",
+        "NEUTRAL_LABEL": "Böylece",
+        "CONTRADICTION_LABEL": "Hayır",
+    },
+    "ur": {  # Urdu
+        "QUESTION_WORD": "صحیح",
+        "ENTAILMENT_LABEL": "جی ہاں",
+        "NEUTRAL_LABEL": "اس لئے",
+        "CONTRADICTION_LABEL": "نہیں",
+    },
+    "vi": {  # Vietnamese
+        "QUESTION_WORD": "đúng",
+        "ENTAILMENT_LABEL": "Vâng",
+        "NEUTRAL_LABEL": "Vì vậy",
+        "CONTRADICTION_LABEL": "Không",
+    },
+    "zh": {  # Chinese
+        "QUESTION_WORD": "正确",
+        "ENTAILMENT_LABEL": "是的",
+        "NEUTRAL_LABEL": "所以",
+        "CONTRADICTION_LABEL": "不是的",
+    },
+}
+
+
+def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
+    """
+    Generate a yaml file for each language.
+
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    """
+    err = []
+    for lang in LANGUAGES.keys():
+        file_name = f"xnli_{lang}.yaml"
+        try:
+            QUESTION_WORD = LANGUAGES[lang]["QUESTION_WORD"]
+            ENTAILMENT_LABEL = LANGUAGES[lang]["ENTAILMENT_LABEL"]
+            NEUTRAL_LABEL = LANGUAGES[lang]["NEUTRAL_LABEL"]
+            CONTRADICTION_LABEL = LANGUAGES[lang]["CONTRADICTION_LABEL"]
+            with open(
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
+            ) as f:
+                f.write("# Generated by utils.py\n")
+                yaml.dump(
+                    {
+                        "include": "xnli_common_yaml",
+                        "dataset_name": lang,
+                        "task": f"xnli_{lang}",
+                        "doc_to_text": "",
+                        "doc_to_choice": f"{{{{["
+                        f"""premise+\", {QUESTION_WORD}? {ENTAILMENT_LABEL}, \"+hypothesis,"""
+                        f"""premise+\", {QUESTION_WORD}? {NEUTRAL_LABEL}, \"+hypothesis,"""
+                        f"""premise+\", {QUESTION_WORD}? {CONTRADICTION_LABEL}, \"+hypothesis"""
+                        f"]}}}}",
+                    },
+                    f,
+                    allow_unicode=True,
+                )
+        except FileExistsError:
+            err.append(file_name)
+
+    if len(err) > 0:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(err)}"
+        )
+
+
+def main() -> None:
+    """Parse CLI args and generate language-specific yaml files."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Overwrite files if they already exist",
+    )
+    parser.add_argument(
+        "--output-dir", default=".", help="Directory to write yaml files to"
+    )
+    args = parser.parse_args()
+
+    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
+
+
+# Run the generator only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/xnli/xnli_ar.yaml
+++ b/lm_eval/tasks/xnli/xnli_ar.yaml
+# Generated by utils.py
+dataset_name: ar
+doc_to_choice: '{{[premise+", صحيح? نعم, "+hypothesis,premise+", صحيح? لذا, "+hypothesis,premise+",
+  صحيح? رقم, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_ar
--- a/lm_eval/tasks/xnli/xnli_bg.yaml
+++ b/lm_eval/tasks/xnli/xnli_bg.yaml
+# Generated by utils.py
+dataset_name: bg
+doc_to_choice: '{{[premise+", правилно? да, "+hypothesis,premise+", правилно? така,
+  "+hypothesis,premise+", правилно? не, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_bg
--- a/lm_eval/tasks/xnli/xnli_common_yaml
+++ b/lm_eval/tasks/xnli/xnli_common_yaml
+# This file will be included in the generated language-specific task configs.
+# It doesn't have a yaml file extension as it is not meant to be imported directly
+# by the harness.
+group: xnli
+task: null
+dataset_path: xnli
+dataset_name: null
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+doc_to_text: null
+doc_to_target: label
+doc_to_choice: null
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/xnli/xnli_de.yaml
+++ b/lm_eval/tasks/xnli/xnli_de.yaml
+# Generated by utils.py
+dataset_name: de
+doc_to_choice: '{{[premise+", richtig? Ja, "+hypothesis,premise+", richtig? Auch,
+  "+hypothesis,premise+", richtig? Nein, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_de
--- a/lm_eval/tasks/xnli/xnli_el.yaml
+++ b/lm_eval/tasks/xnli/xnli_el.yaml
+# Generated by utils.py
+dataset_name: el
+doc_to_choice: '{{[premise+", σωστός? Ναί, "+hypothesis,premise+", σωστός? Έτσι, "+hypothesis,premise+",
+  σωστός? όχι, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_el
--- a/lm_eval/tasks/xnli/xnli_en.yaml
+++ b/lm_eval/tasks/xnli/xnli_en.yaml
+# Generated by utils.py
+dataset_name: en
+doc_to_choice: '{{[premise+", right? Yes, "+hypothesis,premise+", right? Also, "+hypothesis,premise+",
+  right? No, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_en
--- a/lm_eval/tasks/xnli/xnli_es.yaml
+++ b/lm_eval/tasks/xnli/xnli_es.yaml
+# Generated by utils.py
+dataset_name: es
+doc_to_choice: '{{[premise+", correcto? Sí, "+hypothesis,premise+", correcto? Asi
+  que, "+hypothesis,premise+", correcto? No, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_es
--- a/lm_eval/tasks/xnli/xnli_fr.yaml
+++ b/lm_eval/tasks/xnli/xnli_fr.yaml
+# Generated by utils.py
+dataset_name: fr
+doc_to_choice: '{{[premise+", correct? Oui, "+hypothesis,premise+", correct? Aussi,
+  "+hypothesis,premise+", correct? Non, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_fr
--- a/lm_eval/tasks/xnli/xnli_hi.yaml
+++ b/lm_eval/tasks/xnli/xnli_hi.yaml
+# Generated by utils.py
+dataset_name: hi
+doc_to_choice: '{{[premise+", सही? हाँ, "+hypothesis,premise+", सही? इसलिए, "+hypothesis,premise+",
+  सही? नहीं, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_hi
--- a/lm_eval/tasks/xnli/xnli_ru.yaml
+++ b/lm_eval/tasks/xnli/xnli_ru.yaml
+# Generated by utils.py
+dataset_name: ru
+doc_to_choice: '{{[premise+", правильно? Да, "+hypothesis,premise+", правильно? Так,
+  "+hypothesis,premise+", правильно? Нет, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_ru
--- a/lm_eval/tasks/xnli/xnli_sw.yaml
+++ b/lm_eval/tasks/xnli/xnli_sw.yaml
+# Generated by utils.py
+dataset_name: sw
+doc_to_choice: '{{[premise+", sahihi? Ndiyo, "+hypothesis,premise+", sahihi? Hivyo,
+  "+hypothesis,premise+", sahihi? Hapana, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_sw
--- a/lm_eval/tasks/xnli/xnli_th.yaml
+++ b/lm_eval/tasks/xnli/xnli_th.yaml
+# Generated by utils.py
+dataset_name: th
+doc_to_choice: '{{[premise+", ถูกต้อง? ใช่, "+hypothesis,premise+", ถูกต้อง? ดังนั้น,
+  "+hypothesis,premise+", ถูกต้อง? ไม่, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_th
--- a/lm_eval/tasks/xnli/xnli_tr.yaml
+++ b/lm_eval/tasks/xnli/xnli_tr.yaml
+# Generated by utils.py
+dataset_name: tr
+doc_to_choice: '{{[premise+", doğru? Evet, "+hypothesis,premise+", doğru? Böylece,
+  "+hypothesis,premise+", doğru? Hayır, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_tr
--- a/lm_eval/tasks/xnli/xnli_ur.yaml
+++ b/lm_eval/tasks/xnli/xnli_ur.yaml
+# Generated by utils.py
+dataset_name: ur
+doc_to_choice: '{{[premise+", صحیح? جی ہاں, "+hypothesis,premise+", صحیح? اس لئے,
+  "+hypothesis,premise+", صحیح? نہیں, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_ur
--- a/lm_eval/tasks/xnli/xnli_vi.yaml
+++ b/lm_eval/tasks/xnli/xnli_vi.yaml
+# Generated by utils.py
+dataset_name: vi
+doc_to_choice: '{{[premise+", đúng? Vâng, "+hypothesis,premise+", đúng? Vì vậy, "+hypothesis,premise+",
+  đúng? Không, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_vi
--- a/lm_eval/tasks/xnli/xnli_zh.yaml
+++ b/lm_eval/tasks/xnli/xnli_zh.yaml
+# Generated by utils.py
+dataset_name: zh
+doc_to_choice: '{{[premise+", 正确? 是的, "+hypothesis,premise+", 正确? 所以, "+hypothesis,premise+",
+  正确? 不是的, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_zh