Merge branch 'main' into humaneval

# Conflicts: # lm_eval/api/task.py

Merge branch 'main' into humaneval
# Conflicts: # lm_eval/api/task.py
173b2bc3 · Baber · 74344829 · bb098f13 · 173b2bc3 · 173b2bc3
Commit 173b2bc3 authored Jan 10, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/afrimgsm/utils.py
+++ b/lm_eval/tasks/afrimgsm/utils.py
+import argparse
+import yaml
+# Language codes for which task yaml files are generated.
+languages = [
+    "eng",
+    "amh",
+    "ibo",
+    "fra",
+    "sna",
+    "lin",
+    "wol",
+    "ewe",
+    "lug",
+    "xho",
+    "kin",
+    "twi",
+    "zul",
+    "orm",
+    "yor",
+    "hau",
+    "sot",
+    "swa",
+]
+# Per-language strict-match pattern: the answer phrase, a single space, then
+# one capture group holding the number (optional minus sign, digits, "." and
+# ","). Raw strings keep the regex escapes readable.
+languages_REGEX = {
+    "eng": r"The answer is (\-?[0-9\.\,]+)",
+    "amh": r"መልሱ (\-?[0-9\.\,]+)",
+    "ibo": r"Azịza ya bụ (\-?[0-9\.\,]+)",
+    # The space before the capture group is required, as in every other entry.
+    "fra": r"La réponse est (\-?[0-9\.\,]+)",
+    "sna": r"Mhinduro kumubvunzo ndi (\-?[0-9\.\,]+)",
+    "lin": r"Eyano ezali (\-?[0-9\.\,]+)",
+    "wol": r"Tontu li (\-?[0-9\.\,]+)",
+    "ewe": r"ŋuɖoɖoae nye (\-?[0-9\.\,]+)",
+    "lug": r"Ansa eri (\-?[0-9\.\,]+)",
+    "xho": r"Impendulo ngu (\-?[0-9\.\,]+)",
+    "kin": r"Igisubizo ni (\-?[0-9\.\,]+)",
+    "twi": r"Ne nnyiano yɛ (\-?[0-9\.\,]+)",
+    "zul": r"Impendulo ithi (\-?[0-9\.\,]+)",
+    "orm": r"Deebiin isaa (\-?[0-9\.\,]+)",
+    "yor": r"Ìdáhùn náà ni (\-?[0-9\.\,]+)",
+    "hau": r"Amsar ita ce (\-?[0-9\.\,]+)",
+    "sot": r"Karabo ke (\-?[0-9\.\,]+)",
+    "swa": r"Jibu ni (\-?[0-9\.\,]+)",
+}
+# Prompt markers per language code. Only "amh" and "yor" carry their own
+# question / step-by-step answer markers; every other code uses the English
+# ones. The direct-answer marker is "Answer:" for all codes.
+_DEFAULT_MARKERS = ("Question:", "Step-by-Step Answer:")
+_MARKER_OVERRIDES = {
+    "amh": ("ጥያቄ:", "በቅደም ተከተል መልስ:"),
+    "yor": ("Ìbéèrè:", "Ìdáhùn lẹ́sẹsẹ:"),
+}
+LANGUAGES = {}
+for lang in languages:
+    question_marker, answer_marker = _MARKER_OVERRIDES.get(lang, _DEFAULT_MARKERS)
+    LANGUAGES[lang] = {
+        "QUESTION": question_marker,
+        "ANSWER": answer_marker,
+        "DIRECT": "Answer:",
+        "REGEX": languages_REGEX[lang],
+    }
+def add_regex_pattern(regex_pattern):
+    """Build the ``filter_list`` config fragment for an answer regex.
+    Returns an empty dict when ``regex_pattern`` is None. Otherwise returns a
+    dict whose ``filter_list`` holds two pipelines, each ending in a
+    ``take_first`` step: "strict-match" applies ``regex_pattern``, and
+    "flexible-extract" applies a generic number pattern with
+    ``group_select`` set to -1.
+    """
+    if regex_pattern is None:
+        return {}
+    strict_step = {
+        "function": "regex",
+        "regex_pattern": f"{regex_pattern}",
+    }
+    flexible_step = {
+        "function": "regex",
+        "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)",
+        "group_select": -1,
+    }
+    # Each pipeline gets its own take_first dict: sharing one object would
+    # make a YAML dumper emit an anchor/alias pair instead of two plain maps.
+    pipelines = [
+        {"name": "strict-match", "filter": [strict_step, {"function": "take_first"}]},
+        {"name": "flexible-extract", "filter": [flexible_step, {"function": "take_first"}]},
+    ]
+    return {"filter_list": pipelines}
+def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
+    """
+    Generate a yaml file for each language.
+    :param output_dir: The directory to output the files to.
+    :param overwrite: Whether to overwrite files if they already exist.
+    :param mode: Prompt style to generate: "direct", "direct-native",
+        "native-cot", "en-cot" or "translate-direct".
+    :raises ValueError: If ``mode`` is not one of the values listed above.
+    :raises FileExistsError: If ``overwrite`` is false and at least one target
+        file already exists; raised after every language has been attempted.
+    """
+    # Per mode: (use the language's own prompt markers instead of the "eng"
+    # ones, key of the answer marker, task-name infix, included template,
+    # attach the regex filter_list).
+    mode_settings = {
+        "direct": (False, "DIRECT", "direct", "direct_yaml", False),
+        "direct-native": (True, "DIRECT", "direct_native", "direct_native_yaml", False),
+        "native-cot": (True, "ANSWER", "native_cot", "cot_yaml", True),
+        "en-cot": (False, "ANSWER", "en_cot", "cot_yaml", False),
+        "translate-direct": (False, "DIRECT", "translate_direct", "translate_direct_yaml", False),
+    }
+    if mode not in mode_settings:
+        raise ValueError(
+            f"Unsupported mode {mode!r}; expected one of: {', '.join(mode_settings)}"
+        )
+    native_markers, answer_key, infix, yaml_template, with_filters = mode_settings[mode]
+    err = []
+    for lang in LANGUAGES:
+        markers = LANGUAGES[lang] if native_markers else LANGUAGES["eng"]
+        answer_marker = markers[answer_key]
+        question_marker = markers["QUESTION"]
+        filter_list = add_regex_pattern(markers["REGEX"]) if with_filters else {}
+        task_name = f"afrimgsm_{infix}_{lang}"
+        file_name = f"{task_name}.yaml"
+        # doc_to_target slices this many characters off the front of `answer`:
+        # the language's own step-by-step marker plus one more character.
+        answer_to_skip = len(LANGUAGES[lang]["ANSWER"]) + 1
+        try:
+            # Mode "x" makes open() raise FileExistsError rather than clobber.
+            with open(
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
+            ) as f:
+                f.write("# Generated by utils.py\n")
+                yaml.dump(
+                    {
+                        "include": yaml_template,
+                        "dataset_name": lang,
+                        "task": task_name,
+                        "doc_to_text": f"""{{% if answer is not none %}}"""
+                        f"""{{{{question+"\\n{answer_marker}"}}}}"""
+                        f"""{{% else %}}"""
+                        f"""{{{{"{question_marker} "+question+"\\n{answer_marker}"}}}}"""
+                        f"""{{% endif %}}""",
+                        "doc_to_target": f"""{{% if answer is not none %}}"""
+                        f"""{{{{answer[{answer_to_skip}:]}}}}"""
+                        f"""{{% else %}}"""
+                        f"""{{{{answer_number|string}}}}"""
+                        f"""{{% endif %}}""",
+                        **filter_list,
+                        "generation_kwargs": {
+                            "until": [question_marker, "</s>", "<|im_end|>"],
+                            "do_sample": False,
+                        },
+                    },
+                    f,
+                    allow_unicode=True,
+                    width=float("inf"),
+                )
+        except FileExistsError:
+            # Keep going so all conflicting files are reported together.
+            err.append(file_name)
+    if err:
+        raise FileExistsError(
+            "Files were not created because they already exist (use --overwrite flag):"
+            f" {', '.join(err)}"
+        )
+def main() -> None:
+    """Read the command-line options and write the per-language yaml files."""
+    supported_modes = ["direct", "direct-native", "native-cot", "en-cot", "translate-direct"]
+    cli = argparse.ArgumentParser()
+    cli.add_argument(
+        "--overwrite",
+        action="store_true",
+        default=False,
+        help="Overwrite files if they already exist",
+    )
+    cli.add_argument(
+        "--output-dir",
+        default=".",
+        help="Directory to write yaml files to",
+    )
+    cli.add_argument(
+        "--mode",
+        choices=supported_modes,
+        default="native-cot",
+        help="Mode of chain-of-thought",
+    )
+    options = cli.parse_args()
+    gen_lang_yamls(
+        output_dir=options.output_dir,
+        overwrite=options.overwrite,
+        mode=options.mode,
+    )
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/afrimmlu/README.md
+++ b/lm_eval/tasks/afrimmlu/README.md
+# AfriMMLU
+### Paper
+IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
+https://arxiv.org/pdf/2406.03368
+IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
+low-resource African languages covering three tasks: natural language inference (AfriXNLI),
+mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
+### Citation
+```
+@misc{adelani2024irokobenchnewbenchmarkafrican,
+      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
+      author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
+      year={2024},
+      eprint={2406.03368},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2406.03368},
+}
+```
+### Groups and Tasks
+#### Groups
+* `afrimmlu`: All afrimmlu tasks
+* `afrimmlu_direct`: afrimmlu_direct evaluates model performance on the curated dataset
+* `afrimmlu_translate`: afrimmlu_translate evaluates models in the translate-test setting
+#### Tasks
+* `afrimmlu_direct_{language_code}`: each task evaluates for one language
+* `afrimmlu_translate_{language_code}`: each task evaluates for one language
+### Checklist
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+  * [x] Checked for equivalence with v0.3.0 LM Evaluation Harness
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml
+# Shared settings for the afrimmlu_direct_* tasks. Each per-language yaml
+# includes this file and sets its own `task` and `dataset_name`, which are
+# left null here.
+group:
+  - afrimmlu
+  - afrimmlu_direct
+task: null
+dataset_path: masakhane/afrimmlu
+dataset_name: null
+output_type: multiple_choice
+validation_split: validation
+test_split: test
+fewshot_split: validation
+# Prompt text and answer choices are built by functions in utils.py.
+doc_to_text: !function utils.doc_to_text
+# The gold label is the position of the answer letter within A-D.
+doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
+doc_to_choice: !function utils.doc_to_choice
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
+metric_list:
+  # F1 is aggregated by a custom function from utils.py rather than `mean`.
+  - metric: f1
+    aggregation: !function utils.weighted_f1_score
+    # aggregation: mean
+    average: weighted
+    hf_evaluate: true
+    higher_is_better: True
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml
+dataset_name: amh
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_amh
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml
+dataset_name: eng
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_eng
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml
+dataset_name: ewe
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_ewe
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml
+dataset_name: fra
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_fra
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml
+dataset_name: hau
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_hau
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml
+dataset_name: ibo
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_ibo
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml
+dataset_name: kin
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_kin
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml
+dataset_name: lin
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_lin
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml
+dataset_name: lug
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_lug
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml
+dataset_name: orm
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_orm
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml
+dataset_name: sna
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_sna
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml
+dataset_name: sot
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_sot
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml
+dataset_name: swa
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_swa
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml
+dataset_name: twi
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_twi
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml
+dataset_name: wol
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_wol
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml
+dataset_name: xho
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_xho
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml
+dataset_name: yor
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_yor