".github/workflows/pull_request.yml" did not exist on "fc4428dc47a099340ea19bf752d0a8b0f3e298f7"
Commit 8eab2a58 authored by Lintang Sutawika, committed by GitHub

Merge pull request #779 from EleutherAI/pawsx

[Refactor] Paws-X
parents 30aa9c33 55749b9b
@@ -54,7 +54,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] BIG-Bench (Hailey)
 - [x] XStoryCloze
 - [x] XWinograd
-- [ ] PAWS-X (Lintang)
+- [x] PAWS-X
 - [x] XNLI
 - [ ] MGSM (Lintang)
 - [ ] SCROLLS
# PAWS-X
### Paper
Title: `PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification`
Abstract: https://arxiv.org/abs/1908.11828
The dataset consists of 23,659 human-translated PAWS evaluation pairs and
296,406 machine-translated training pairs in six typologically distinct languages.
Examples are adapted from PAWS-Wiki.
Prompt format (same as in mGPT):
"<s>" + sentence1 + ", right? " + mask + ", " + sentence2 + "</s>",
where `mask` is the string that matches the label ("Yes" or "No").
Example:
<s> The Tabaci River is a tributary of the River Leurda in Romania, right? No, The Leurda River is a tributary of the River Tabaci in Romania.</s>
Language-specific prompts are translated word-by-word with Google Translate
and may differ from those used by mGPT and XGLM (neither releases its exact prompts).
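For illustration only, a minimal Python sketch of this prompt construction (not part of the PR; the harness itself builds the candidate strings from the Jinja templates in the generated YAML configs below, which omit the `<s>`/`</s>` markers):
```python
def mgpt_style_prompt(sentence1: str, sentence2: str, mask: str) -> str:
    """Assemble the mGPT-style prompt described above.

    `mask` is "Yes" (or its language-specific equivalent) when the pair is a
    paraphrase and "No" otherwise; the task scores both variants against each
    other as a multiple-choice problem.
    """
    return "<s>" + sentence1 + ", right? " + mask + ", " + sentence2 + "</s>"


# Reproduces the example above (a non-paraphrase pair, hence mask="No"):
print(mgpt_style_prompt(
    "The Tabaci River is a tributary of the River Leurda in Romania",
    "The Leurda River is a tributary of the River Tabaci in Romania",
    "No",
))
```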
Homepage: https://github.com/google-research-datasets/paws/tree/master/pawsx
### Citation
```
@inproceedings{yang-etal-2019-paws,
    title = "{PAWS}-{X}: A Cross-lingual Adversarial Dataset for Paraphrase Identification",
    author = "Yang, Yinfei and
      Zhang, Yuan and
      Tar, Chris and
      Baldridge, Jason",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D19-1382",
    doi = "10.18653/v1/D19-1382",
    pages = "3687--3692",
}
```
### Groups and Tasks
#### Groups
* `pawsx`
#### Tasks
* `paws_de`: German
* `paws_en`: English
* `paws_es`: Spanish
* `paws_fr`: French
* `paws_ja`: Japanese
* `paws_ko`: Korean
* `paws_zh`: Chinese
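As a hedged usage sketch (not from this PR), the group or any single language task can be selected by name through the harness's Python entry point; the `hf` model type, the `pretrained=gpt2` argument, and `simple_evaluate` being exposed this way are assumptions about the refactored harness rather than something this README specifies:
```python
from lm_eval import evaluator

# Run the whole PAWS-X group; pass e.g. ["paws_de", "paws_fr"] instead to
# evaluate individual languages. Model and weights here are placeholders.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",
    tasks=["pawsx"],
)
print(results["results"])
```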
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# Generated by utils.py
dataset_name: de
doc_to_choice: '{{[sentence1+", richtig? Ja, "+sentence2, sentence1+", richtig? Nein,
  "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_de
# Generated by utils.py
dataset_name: en
doc_to_choice: '{{[sentence1+", right? Yes, "+sentence2, sentence1+", right? No, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_en
# Generated by utils.py
dataset_name: es
doc_to_choice: '{{[sentence1+", verdad? Sí, "+sentence2, sentence1+", verdad? No,
  "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_es
# Generated by utils.py
dataset_name: fr
doc_to_choice: '{{[sentence1+", n''est-ce pas? Oui, "+sentence2, sentence1+", n''est-ce
  pas? No, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_fr
# Generated by utils.py
dataset_name: ja
doc_to_choice: '{{[sentence1+", ですね? はい, "+sentence2, sentence1+", ですね? いいえ, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_ja
# Generated by utils.py
dataset_name: ko
doc_to_choice: '{{[sentence1+", 맞죠? 예, "+sentence2, sentence1+", 맞죠? 아니요, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_ko
# Generated by utils.py
dataset_name: zh
doc_to_choice: '{{[sentence1+", 对吧? 是, "+sentence2, sentence1+", 对吧? 不是, "+sentence2]}}'
doc_to_text: ''
include: pawsx_template_yaml
task: paws_zh
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: pawsx
task: null
dataset_path: paws-x
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: null
doc_to_target: label
doc_to_choice: null
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
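Each generated config above pulls in this template via `include: pawsx_template_yaml`, with the language file's own keys taking precedence over the template's defaults. Below is a rough, self-contained sketch of that merge behaviour; the harness resolves `include:` internally and may differ in detail, and the sketch assumes the template sits in the same directory as the language configs:
```python
import os

import yaml


def load_task_config(path: str) -> dict:
    """Load a task YAML, shallow-merging an included template underneath it.

    Approximation only: keys from the included template act as defaults,
    and keys in the including file win.
    """
    with open(path, encoding="utf8") as f:
        cfg = yaml.safe_load(f)
    template = cfg.pop("include", None)
    if template is not None:
        # The template file is assumed to live alongside the language config.
        with open(os.path.join(os.path.dirname(path), template), encoding="utf8") as f:
            base = yaml.safe_load(f)
        base.update(cfg)  # language-specific values override template defaults
        cfg = base
    return cfg


# e.g. load_task_config("paws_en.yaml")["task"]  ->  "paws_en"
```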
import argparse
from typing import Dict, List
import yaml
# Different languages that are part of PAWS-X.
# These correspond to dataset names (subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
LANGUAGES = {
    "de": {  # German
        "QUESTION_WORD": "richtig",
        "YES": "Ja",
        "NO": "Nein",
    },
    "en": {  # English
        "QUESTION_WORD": "right",
        "YES": "Yes",
        "NO": "No",
    },
    "es": {  # Spanish
        "QUESTION_WORD": "verdad",
        "YES": "Sí",
        "NO": "No",
    },
    "fr": {  # French
        "QUESTION_WORD": "n'est-ce pas",
        "YES": "Oui",
        "NO": "No",
    },
    "ja": {  # Japanese
        "QUESTION_WORD": "ですね",
        "YES": "はい",
        "NO": "いいえ",
    },
    "ko": {  # Korean
        "QUESTION_WORD": "맞죠",
        "YES": "예",
        "NO": "아니요",
    },
    "zh": {  # Chinese
        "QUESTION_WORD": "对吧",
        "YES": "是",
        "NO": "不是",
    },
}
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml file for each language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    """
    err = []
    for lang in LANGUAGES.keys():
        file_name = f"paws_{lang}.yaml"
        try:
            QUESTION_WORD = LANGUAGES[lang]["QUESTION_WORD"]
            YES = LANGUAGES[lang]["YES"]
            NO = LANGUAGES[lang]["NO"]
            with open(
                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        "include": "pawsx_template_yaml",
                        "dataset_name": lang,
                        "task": f"paws_{lang}",
                        "doc_to_text": "",
                        "doc_to_choice": f"{{{{["
                        f"""sentence1+\", {QUESTION_WORD}? {YES}, \"+sentence2,"""
                        f""" sentence1+\", {QUESTION_WORD}? {NO}, \"+sentence2"""
                        f"]}}}}",
                    },
                    f,
                    allow_unicode=True,
                )
        except FileExistsError:
            err.append(file_name)

    if len(err) > 0:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )
def main() -> None:
    """Parse CLI args and generate language-specific yaml files."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    args = parser.parse_args()

    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)


if __name__ == "__main__":
    main()
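A quick way to sanity-check the generator (hypothetical snippet, not part of the PR): with the script above saved as `utils.py`, write the configs into a scratch directory and read one back.
```python
import tempfile

import yaml

from utils import gen_lang_yamls  # assumes the generator script is saved as utils.py

with tempfile.TemporaryDirectory() as tmp:
    # Write all seven paws_*.yaml files, then round-trip the English one.
    gen_lang_yamls(output_dir=tmp, overwrite=True)
    with open(f"{tmp}/paws_en.yaml", encoding="utf8") as f:
        cfg = yaml.safe_load(f)
    assert cfg["task"] == "paws_en" and cfg["dataset_name"] == "en"
    print(cfg["doc_to_choice"])
```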