[Refactor] Migrate xwinograd tasks to yaml

94563a36 · Eddy Yeo · 2820042d · 94563a36 · 94563a36 · 94563a36
Commit 94563a36 authored Jul 21, 2023 by Eddy Yeo
10 changed files
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -471,6 +471,9 @@ class Task(abc.ABC):
            return labeled_examples + example
        elif type(example) == list:
            return [labeled_examples + ex for ex in example]
+        elif type(example) == int:
+            choices = self.doc_to_choice(doc)
+            return labeled_examples + choices[example]

    def apply_filters(self):


--- a/lm_eval/tasks/xwinograd/README.md
+++ b/lm_eval/tasks/xwinograd/README.md
+# XWinograd
+
+### Paper
+
+Title: `It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning`
+Abstract: `https://arxiv.org/abs/2106.12066`
+
+A multilingual Winograd schema challenge covering English, French, Japanese, Portuguese, Russian and Chinese. The schemas come from the XWinograd dataset introduced by Tikhonov et al. Because that dataset contains only 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
+
+Homepage: `https://huggingface.co/datasets/Muennighoff/xwinograd`
+
+
+### Citation
+
+```
+@misc{muennighoff2022crosslingual,
+      title={Crosslingual Generalization through Multitask Finetuning},
+      author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
+      year={2022},
+      eprint={2211.01786},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{tikhonov2021heads,
+    title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
+    author={Alexey Tikhonov and Max Ryabinin},
+    year={2021},
+    eprint={2106.12066},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+### Subtasks
+
+The following tasks are defined in this folder:
+* `xwinograd_en`: Winograd schema challenges in English.
+* `xwinograd_fr`: Winograd schema challenges in French.
+* `xwinograd_jp`: Winograd schema challenges in Japanese.
+* `xwinograd_pt`: Winograd schema challenges in Portuguese.
+* `xwinograd_ru`: Winograd schema challenges in Russian.
+* `xwinograd_zh`: Winograd schema challenges in Chinese.
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+  * [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/xwinograd/utils.py
+++ b/lm_eval/tasks/xwinograd/utils.py
+import argparse
+from typing import Dict, List
+
+import yaml
+
+
+# Language codes that are part of xwinograd.
+# Each code is used as the dataset name (Subset) on HuggingFace and as the
+# suffix of the task name; gen_lang_yamls() writes one
+# xwinograd_<code>.yaml file per entry.
+LANGUAGES = ["en", "fr", "jp", "pt", "ru", "zh"]
+
+
+def doc_to_text(doc: Dict) -> int:
+    """
+    Return index of the correct choice.
+
+    Note: We are using the "multiple input" mode of the multiple-choice
+        output-type, which means we use different contexts with the same target
+        for the different choices, rather than the same context and different targets.
+    """
+    answer_to_num = {"1": 0, "2": 1}
+    return answer_to_num[doc["answer"]]
+
+
+def doc_to_target(doc: Dict) -> str:
+    """
+    Return the target completion.
+
+    Note that this does not depend on the correct choice as we are using
+    "multiple input" mode.
+    """
+    idx = doc["sentence"].index("_") + 1
+    return doc["sentence"][idx:].strip()
+
+
+def doc_to_choice(doc: Dict) -> List[str]:
+    """Return the choices that will be used as contexts in "multiple input" mode."""
+    idx = doc["sentence"].index("_")
+    options = [doc["option1"], doc["option2"]]
+    return [doc["sentence"][:idx] + opt for opt in options]
+
+
+def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
+    """
+    Generate a yaml file for each language.
+
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    """
+    err = []
+    for lang in LANGUAGES:
+        file_name = f"xwinograd_{lang}.yaml"
+        try:
+            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
+                f.write("# Generated by utils.py\n")
+                yaml.dump(
+                    {
+                        "include": "xwinograd_common_yaml",
+                        "dataset_name": lang,
+                        "task": f"xwinograd_{lang}",
+                    },
+                    f,
+                )
+        except FileExistsError:
+            err.append(file_name)
+
+    if len(err) > 0:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(err)}"
+        )
+
+
+def main() -> None:
+    """Parse CLI args and generate language-specific yaml files."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Overwrite files if they already exist",
+    )
+    parser.add_argument(
+        "--output-dir", default=".", help="Directory to write yaml files to"
+    )
+    args = parser.parse_args()
+
+    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
+
+
+# Generate the yaml files only when this module is run as a script.
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/xwinograd/xwinograd_common_yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_common_yaml
+# This file will be included in the generated language-specific task configs.
+# It doesn't have a yaml file extension as it is not meant to be imported directly
+# by the harness.
+group:
+  - winograd
+  - commonsense
+  - multilingual
+dataset_path: Muennighoff/xwinograd
+dataset_name: null  # Overridden by language-specific config.
+output_type: multiple_choice
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: !function utils.doc_to_choice
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/xwinograd/xwinograd_en.yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_en.yaml
+# Generated by utils.py
+dataset_name: en
+include: xwinograd_common_yaml
+task: xwinograd_en
--- a/lm_eval/tasks/xwinograd/xwinograd_fr.yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_fr.yaml
+# Generated by utils.py
+dataset_name: fr
+include: xwinograd_common_yaml
+task: xwinograd_fr
--- a/lm_eval/tasks/xwinograd/xwinograd_jp.yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_jp.yaml
+# Generated by utils.py
+dataset_name: jp
+include: xwinograd_common_yaml
+task: xwinograd_jp
--- a/lm_eval/tasks/xwinograd/xwinograd_pt.yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_pt.yaml
+# Generated by utils.py
+dataset_name: pt
+include: xwinograd_common_yaml
+task: xwinograd_pt
--- a/lm_eval/tasks/xwinograd/xwinograd_ru.yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_ru.yaml
+# Generated by utils.py
+dataset_name: ru
+include: xwinograd_common_yaml
+task: xwinograd_ru
--- a/lm_eval/tasks/xwinograd/xwinograd_zh.yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_zh.yaml
+# Generated by utils.py
+dataset_name: zh
+include: xwinograd_common_yaml
+task: xwinograd_zh