add mgsm

416a3ca1 · lintangsutawika · 8eab2a58 · 416a3ca1 · 416a3ca1 · 416a3ca1
Commit 416a3ca1 authored Aug 15, 2023 by lintangsutawika
Showing with 219 additions and 0 deletions

lm_eval/tasks/mgsm/README.md lm_eval/tasks/mgsm/README.md +74 -0

lm_eval/tasks/mgsm/common_template_yaml lm_eval/tasks/mgsm/common_template_yaml +22 -0

lm_eval/tasks/mgsm/utils.py lm_eval/tasks/mgsm/utils.py +123 -0

No files found.
--- a/lm_eval/tasks/mgsm/README.md
+++ b/lm_eval/tasks/mgsm/README.md
+# MGSM
+### Paper
+Title: `Language Models are Multilingual Chain-of-Thought Reasoners`
+Abstract: https://arxiv.org/abs/2210.03057
+Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).
+The same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) were each translated by human annotators into 10 languages. The 10 languages are:
+- Spanish
+- French
+- German
+- Russian
+- Chinese
+- Japanese
+- Thai
+- Swahili
+- Bengali
+- Telugu
+GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality linguistically diverse grade school math word problems. The dataset was created to support the task of question answering on basic mathematical problems that require multi-step reasoning.
+You can find the input and targets for each of the ten languages (and English) as `.tsv` files.
+We also include few-shot exemplars, likewise manually translated into each language, in `exemplars.py`.
+Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm
+### Citation
+```
+@misc{cobbe2021training,
+    title={Training Verifiers to Solve Math Word Problems},
+    author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+    year={2021},
+    eprint={2110.14168},
+    archivePrefix={arXiv},
+    primaryClass={cs.LG}
+}
+@misc{shi2022language,
+    title={Language Models are Multilingual Chain-of-Thought Reasoners},
+    author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
+    year={2022},
+    eprint={2210.03057},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+### Groups and Tasks
+#### Groups
+* `mgsm`
+#### Tasks
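+* `mgsm_<lang>`: the MGSM problems in a single language, where `<lang>` is one of `bn`, `de`, `en`, `es`, `fr`, `ja`, `ru`, `sw`, `te`, `th`, `zh` (for example `mgsm_en`).
+* The per-language task configs (`mgsm_<lang>.yaml`) are generated by running `utils.py`.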
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
\ No newline at end of file
--- a/lm_eval/tasks/mgsm/common_template_yaml
+++ b/lm_eval/tasks/mgsm/common_template_yaml
+# This file will be included in the generated language-specific task configs.
+# It doesn't have a yaml file extension as it is not meant to be imported directly
+# by the harness.
+# The per-language files written by utils.py pull it in via
+# `include: common_template_yaml` and add `dataset_name`, `task`,
+# `doc_to_text` and `doc_to_target` on top of the settings below.
+group: mgsm
+dataset_path: juletxara/mgsm
+dataset_name: null  # Overridden by language-specific config.
+output_type: greedy_until
+training_split: train
+test_split: test
+# NOTE(review): presumably left empty because the targets built in utils.py
+# (doc_to_target) already begin with a single space -- confirm.
+target_delimiter: ""
+generation_kwargs:
+  # NOTE(review): these look like stop sequences; "\n" is a prefix of "\n\n",
+  # so the first entry may be redundant -- confirm against the harness.
+  until:
+    - "\n\n"
+    - "\n"
+  do_sample: false
+  temperature: 0.0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
--- a/lm_eval/tasks/mgsm/utils.py
+++ b/lm_eval/tasks/mgsm/utils.py
+import argparse
+from typing import Dict, List
+import yaml
+# Per-language prompt markers, keyed by the language code that is also used as
+# the dataset config name and as the suffix of the generated task name.
+#   "QUESTION": marker placed in front of a question that has no worked answer.
+#   "ANSWER":   marker appended after the question; doc_to_target also uses its
+#               length to strip the same marker from the front of doc["answer"].
+LANGUAGES = {
+    "bn": {  # Bengali
+        "QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
+        "ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
+    },
+    "de": {  # German
+        "QUESTION": "Frage:",
+        "ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
+    },
+    "en": {  # English
+        "QUESTION": "Question:",
+        "ANSWER": "Step-by-Step Answer:",
+    },
+    "es": {  # Spanish
+        "QUESTION": "Pregunta:",
+        "ANSWER": "Respuesta paso a paso:",
+    },
+    "fr": {  # French
+        "QUESTION": "Question :",
+        "ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
+    },
+    "ru": {  # Russian
+        "QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
+        # NOTE(review): there is no space between the two words of this marker.
+        # Presumably it mirrors the prefix used in the dataset's worked answers
+        # (doc_to_target strips it by length) -- confirm before "fixing" it.
+        "ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
+    },
+    "sw": {  # Swahili
+        "QUESTION": "Swali:",
+        "ANSWER": "Jibu la Hatua kwa Hatua:",
+    },
+    "te": {  # Telugu
+        "QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
+        "ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
+    },
+    "th": {  # Thai
+        "QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
+        "ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
+    },
+    "ja": {  # Japanese
+        "QUESTION": "\u554f\u984c:",
+        "ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
+    },
+    "zh": {  # Chinese
+        "QUESTION": "\u95ee\u9898:",
+        "ANSWER": "\u9010\u6b65\u89e3\u7b54:",
+    },
+}
+def doc_to_text(doc, QUESTION, ANSWER):
+    """Build the prompt text for one document.
+
+    The prompt is the question followed by a newline and the ``ANSWER``
+    marker. The ``QUESTION`` marker and a space are put in front only when
+    ``doc["answer"]`` is ``None``; otherwise the question is used as stored.
+
+    :param doc: Mapping with at least the keys ``"question"`` and ``"answer"``.
+    :param QUESTION: Language-specific question marker.
+    :param ANSWER: Language-specific answer marker.
+    :return: The prompt string.
+    """
+    # NOTE(review): presumably documents with a worked answer already carry
+    # the question marker in their text -- confirm against the dataset.
+    lead = "" if doc["answer"] is not None else QUESTION + " "
+    return lead + doc["question"] + "\n" + ANSWER
+def doc_to_target(doc, QUESTION, ANSWER):
+    """Build the target text for one document.
+
+    When ``doc["answer"]`` is present, its first ``len(ANSWER) + 1``
+    characters are dropped (the answer marker plus one more character) and
+    the rest is returned. Otherwise ``doc["answer_number"]`` is returned as
+    a string. Either way the result starts with a single space.
+
+    :param doc: Mapping with the keys ``"answer"`` and ``"answer_number"``.
+    :param QUESTION: Unused; kept so the signature matches ``doc_to_text``.
+    :param ANSWER: Language-specific answer marker.
+    :return: The target string.
+    """
+    worked_answer = doc["answer"]
+    if worked_answer is None:
+        return " " + str(doc["answer_number"])
+    skip = len(ANSWER) + 1
+    return " " + worked_answer[skip:]
+def _doc_to_text_template(question_prefix: str, answer_prefix: str) -> str:
+    """Return a Jinja2 prompt template with the same logic as ``doc_to_text``.
+
+    The markers are spliced in as literal template text, so they must not
+    contain Jinja2 delimiters (none of the entries in ``LANGUAGES`` do).
+    """
+    return "".join(
+        [
+            "{% if answer is not none %}",
+            "{{question}}\n",
+            answer_prefix,
+            "{% else %}",
+            question_prefix,
+            " {{question}}\n",
+            answer_prefix,
+            "{% endif %}",
+        ]
+    )
+
+
+def _doc_to_target_template(answer_prefix: str) -> str:
+    """Return a Jinja2 target template with the same logic as ``doc_to_target``."""
+    # Same offset as doc_to_target: the answer marker plus one more character.
+    skip = len(answer_prefix) + 1
+    return "".join(
+        [
+            "{% if answer is not none %}",
+            " {{answer[" + str(skip) + ":]}}",
+            "{% else %}",
+            " {{answer_number}}",
+            "{% endif %}",
+        ]
+    )
+
+
+def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
+    """
+    Generate a yaml file for each language.
+
+    Each file includes ``common_template_yaml`` and adds the language's
+    dataset name, task name and prompt/target templates. The templates are
+    written as strings because no document exists at generation time; the
+    previous code called ``doc_to_text(doc, ...)`` with an undefined ``doc``
+    and crashed with ``NameError`` on the first language.
+
+    NOTE(review): this assumes the harness renders string-valued
+    ``doc_to_text`` / ``doc_to_target`` entries as Jinja2 templates over the
+    document's fields -- confirm against the harness task documentation.
+
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    :raises FileExistsError: If ``overwrite`` is false and one or more target
+        files already exist. Files that did not exist are still written; the
+        error lists the ones that were skipped.
+    """
+    err = []
+    for lang, prompts in LANGUAGES.items():
+        file_name = f"mgsm_{lang}.yaml"
+        # Build the whole config before touching the filesystem, so a failure
+        # here cannot leave a header-only file behind.
+        config = {
+            "include": "common_template_yaml",
+            "dataset_name": lang,
+            "task": f"mgsm_{lang}",
+            "doc_to_text": _doc_to_text_template(
+                prompts["QUESTION"], prompts["ANSWER"]
+            ),
+            "doc_to_target": _doc_to_target_template(prompts["ANSWER"]),
+        }
+        try:
+            # Mode "x" makes open() raise FileExistsError instead of clobbering.
+            with open(
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
+            ) as f:
+                f.write("# Generated by utils.py\n")
+                yaml.dump(config, f, allow_unicode=True)
+        except FileExistsError:
+            # Keep going so every pre-existing file is reported at once.
+            err.append(file_name)
+    if err:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(err)}"
+        )
+def main() -> None:
+    """Read the command-line options and write the per-language yaml files.
+
+    Options:
+      --overwrite   replace files that already exist (off by default)
+      --output-dir  directory to write into (defaults to the current one)
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Overwrite files if they already exist",
+    )
+    cli.add_argument(
+        "--output-dir",
+        default=".",
+        help="Directory to write yaml files to",
+    )
+    options = cli.parse_args()
+    gen_lang_yamls(output_dir=options.output_dir, overwrite=options.overwrite)
+# Run the generator only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()