add translation tasks

40f81c42 · haileyschoelkopf · 39bd5caf · 40f81c42 · 40f81c42 · 40f81c42
Commit 40f81c42 authored Aug 15, 2023 by haileyschoelkopf
8 changed files
--- a/lm_eval/tasks/translation/README.md
+++ b/lm_eval/tasks/translation/README.md
+# Translation Tasks
+### Paper
+### Citation
+```
+```
+### Groups and Tasks
+#### Groups
+* `gpt3_translation_benchmarks`
+* `wmt14`
+* `wmt16`
+* `wmt20` (not yet enabled: its entry in `utils.py` is commented out, so no task files are generated for it)
+* `iwslt2017`
+#### Tasks
+* `wmt14-fr-en`
+* `wmt16-ro-en`
+* `wmt16-de-en`
+* `iwslt2017-en-ar`
+* `iwslt2017-ar-en`
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+  * [ ] Checked for equivalence with v0.3.0 LM Evaluation Harness
--- a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
+# Generated by utils.py
+dataset_name: iwslt2017-ar-en
+dataset_path: iwslt2017
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'Arabic phrase: {{translation["ar"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- iwslt2017
+include: wmt_common_yaml
+task: iwslt2017-ar-en
--- a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
+# Generated by utils.py
+dataset_name: iwslt2017-en-ar
+dataset_path: iwslt2017
+doc_to_target: ' {{translation["ar"]}}'
+doc_to_text: 'English phrase: {{translation["en"]}}
+  Arabic phrase:'
+group:
+- greedy_until
+- translation
+- iwslt2017
+include: wmt_common_yaml
+task: iwslt2017-en-ar
--- a/lm_eval/tasks/translation/utils.py
+++ b/lm_eval/tasks/translation/utils.py
+import argparse
+from typing import Dict, List
+import yaml
+import sacrebleu
+try:
+    import pycountry
+except ModuleNotFoundError:
+    # pycountry maps language codes to the language names used in the prompt
+    # templates, so fail immediately with install instructions if it is missing.
+    raise Exception(
+        "`pycountry` is required for generating translation task prompt templates. "
+        "please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
+    )
+# Different translation benchmarks included in the library. Mostly WMT.
+# Keys are HuggingFace dataset names (written to `dataset_path` below); values
+# list that dataset's language-pair subsets as "<source>-<target>" codes.
+# A yaml file is generated by this script for each language pair.
+gpt3_translation_benchmarks = {
+    "wmt14": ["fr-en"],  # French; the reverse pair "en-fr" is currently disabled
+    "wmt16": [
+        "ro-en",
+        "de-en",
+    ],  # Romanian, German; the reverse pairs "en-ro" and "en-de" are currently disabled
+}
+# NOTE(review): this comment used to read "28 total", but only 5 language pairs
+# are enabled here; presumably the count included the disabled wmt20 entry — confirm.
+LANGUAGES = {
+    **gpt3_translation_benchmarks,
+    # "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
+    "iwslt2017": ["en-ar", "ar-en"],  # Arabic
+}
+def code_to_language(code):
+    """
+    Return the name of the language identified by a language code.
+    :param code: A 2- or 3-letter language code, e.g. "fr" (which yields "French").
+    :return: The `name` attribute of the matching `pycountry` language record.
+    :raises ValueError: If `pycountry` has no language for `code`.
+    """
+    # key is alpha_2 or alpha_3 depending on the code length
+    language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code})
+    if language_tuple is None:
+        # An unknown code yields None; report the offending code instead of
+        # failing later with an AttributeError on `.name`.
+        raise ValueError(f"Unknown language code: {code!r}")
+    return language_tuple.name
+def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
+    """
+    Write one task yaml file for every language pair listed in LANGUAGES.
+    Each file is named `<dataset>_<pair>.yaml`, includes `wmt_common_yaml`, and
+    adds the dataset, group, task name and prompt templates for that pair.
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    :raises FileExistsError: Once every pair has been attempted, if `overwrite`
+        is false and at least one target file already existed.
+    """
+    # "x" makes open() raise FileExistsError rather than replace an existing file.
+    open_mode = "w" if overwrite else "x"
+    skipped = []
+    for dataset, lang_pairs in LANGUAGES.items():
+        for lang_pair in lang_pairs:
+            file_name = f"{dataset}_{lang_pair}.yaml"
+            try:
+                src_lang, _, tgt_lang = lang_pair.partition("-")
+                source = code_to_language(src_lang)
+                target = code_to_language(tgt_lang)
+
+                groups = ["greedy_until", "translation", dataset]
+                if dataset in gpt3_translation_benchmarks:
+                    groups.append("gpt3_translation_benchmarks")
+
+                # iwslt2017 subsets carry the dataset name as a prefix.
+                if dataset == "iwslt2017":
+                    dataset_name = "iwslt2017-" + lang_pair
+                else:
+                    dataset_name = lang_pair
+
+                # Quadrupled braces emit the literal "{{" / "}}" of the template.
+                task_config = {
+                    "include": "wmt_common_yaml",
+                    "group": groups,
+                    "dataset_path": dataset,
+                    "dataset_name": dataset_name,
+                    "task": f"{dataset}-{lang_pair}",
+                    "doc_to_text": (
+                        f'{source} phrase: {{{{translation["{src_lang}"]}}}}\n'
+                        f"{target} phrase:"
+                    ),
+                    "doc_to_target": f' {{{{translation["{tgt_lang}"]}}}}',
+                }
+
+                with open(
+                    f"{output_dir}/{file_name}", open_mode, encoding="utf8"
+                ) as f:
+                    f.write("# Generated by utils.py\n")
+                    yaml.dump(task_config, f)
+            except FileExistsError:
+                # Keep going so every clashing file is reported in one error.
+                skipped.append(file_name)
+    if skipped:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(skipped)}"
+        )
+def main() -> None:
+    """Read --overwrite and --output-dir from the command line and generate the yaml files."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--overwrite",
+        action="store_true",
+        default=False,
+        help="Overwrite files if they already exist",
+    )
+    cli.add_argument(
+        "--output-dir",
+        default=".",
+        help="Directory to write yaml files to",
+    )
+    options = cli.parse_args()
+    gen_lang_yamls(output_dir=options.output_dir, overwrite=options.overwrite)
+# Only generate files when this module is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/translation/wmt14_fr-en.yaml
+++ b/lm_eval/tasks/translation/wmt14_fr-en.yaml
+# Generated by utils.py
+dataset_name: fr-en
+dataset_path: wmt14
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'French phrase: {{translation["fr"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- wmt14
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt14-fr-en
--- a/lm_eval/tasks/translation/wmt16_de-en.yaml
+++ b/lm_eval/tasks/translation/wmt16_de-en.yaml
+# Generated by utils.py
+dataset_name: de-en
+dataset_path: wmt16
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'German phrase: {{translation["de"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- wmt16
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt16-de-en
--- a/lm_eval/tasks/translation/wmt16_ro-en.yaml
+++ b/lm_eval/tasks/translation/wmt16_ro-en.yaml
+# Generated by utils.py
+dataset_name: ro-en
+dataset_path: wmt16
+doc_to_target: ' {{translation["en"]}}'
+doc_to_text: 'Romanian phrase: {{translation["ro"]}}
+  English phrase:'
+group:
+- greedy_until
+- translation
+- wmt16
+- gpt3_translation_benchmarks
+include: wmt_common_yaml
+task: wmt16-ro-en
--- a/lm_eval/tasks/translation/wmt_common_yaml
+++ b/lm_eval/tasks/translation/wmt_common_yaml
+output_type: greedy_until
+training_split: train
+validation_split: validation
+fewshot_split: validation
+test_split: test
+metric_list:
+  - metric: bleu
+  - metric: ter
+  - metric: chrf
+generation_kwargs:
+  until:
+    - "\n"
+  do_sample: false
+  temperature: 0.0
+repeats: 1