formatting (#2104)

56a4e794 · Lintang Sutawika · GitHub · 9884ad6e · 56a4e794 · 56a4e794
Unverified commit 56a4e794, authored Jul 15, 2024 by Lintang Sutawika and committed by GitHub on Jul 15, 2024.
20 changed files
--- a/lm_eval/api/metrics.py
+++ b/lm_eval/api/metrics.py
@@ -565,4 +565,4 @@ def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
    assert len(metrics) == len(sizes)
    return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
\ No newline at end of file
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -1665,4 +1665,4 @@ class PerplexityTask(Task):
    @classmethod
    def count_words(cls, doc) -> int:
        """Downstream tasks with custom word boundaries should override this!"""
        return len(re.split(r"\s+", doc))
\ No newline at end of file
--- a/lm_eval/filters/extraction.py
+++ b/lm_eval/filters/extraction.py
@@ -181,4 +181,4 @@ class MultiChoiceRegexFilter(RegexFilter):
                filtered.append(match)
            filtered_resps.append(filtered)
        return filtered_resps
\ No newline at end of file
--- a/lm_eval/tasks/afrimgsm/README.md
+++ b/lm_eval/tasks/afrimgsm/README.md
@@ -5,8 +5,8 @@
 IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
 https://arxiv.org/pdf/2406.03368
-IrokoBench is a human-translated benchmark dataset for 16 typologically diverse 
+IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
-low-resource African languages covering three tasks: natural language inference (AfriXNLI), 
+low-resource African languages covering three tasks: natural language inference (AfriXNLI),
 mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
@@ -14,13 +14,13 @@ mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU
 ```
 @misc{adelani2024irokobenchnewbenchmarkafrican,
-      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, 
+      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
      author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
      year={2024},
      eprint={2406.03368},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2406.03368}, 
+      url={https://arxiv.org/abs/2406.03368},
 }
 ```

--- a/lm_eval/tasks/afrimgsm/run.sh
+++ b/lm_eval/tasks/afrimgsm/run.sh
@@ -3,4 +3,4 @@ lm_eval --model hf   \
        --device cuda:0     \
        --batch_size 1  \
        --verbosity DEBUG \
        --limit 5
\ No newline at end of file
--- a/lm_eval/tasks/afrimgsm/utils.py
+++ b/lm_eval/tasks/afrimgsm/utils.py
@@ -2,51 +2,74 @@ import argparse
 import yaml
-languages = ['eng', 'amh', 'ibo', 'fra', 'sna', 'lin', 'wol', 'ewe', 'lug', 'xho', 'kin', 'twi', 'zul', 'orm', 'yor',
-             'hau', 'sot', 'swa']
+languages = [
+    "eng",
-languages_REGEX = {"eng": "The answer is (\\-?[0-9\\.\\,]+)",
+    "amh",
-                   "amh": "መልሱ (\\-?[0-9\\.\\,]+)",
+    "ibo",
-                   "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)",
+    "fra",
-                   'fra': "La réponse est(\\-?[0-9\\.\\,]+)",
+    "sna",
-                   'sna': "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)",
+    "lin",
-                   'lin': "Eyano ezali (\\-?[0-9\\.\\,]+)",
+    "wol",
-                   'wol': "Tontu li (\\-?[0-9\\.\\,]+)",
+    "ewe",
-                   'ewe': "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)",
+    "lug",
-                   'lug': "Ansa eri (\\-?[0-9\\.\\,]+)",
+    "xho",
-                   'xho': "Impendulo ngu (\\-?[0-9\\.\\,]+)",
+    "kin",
-                   'kin': "Igisubizo ni (\\-?[0-9\\.\\,]+)",
+    "twi",
-                   'twi': "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)",
+    "zul",
-                   'zul': "Impendulo ithi (\\-?[0-9\\.\\,]+)",
+    "orm",
-                   'orm': "Deebiin isaa (\\-?[0-9\\.\\,]+)",
+    "yor",
-                   'yor': "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)",
+    "hau",
-                   'hau': "Amsar ita ce (\\-?[0-9\\.\\,]+)",
+    "sot",
-                   'sot': "Karabo ke (\\-?[0-9\\.\\,]+)",
+    "swa",
-                   'swa': "Jibu ni (\\-?[0-9\\.\\,]+)",
+]
-                   }
+languages_REGEX = {
+    "eng": "The answer is (\\-?[0-9\\.\\,]+)",
+    "amh": "መልሱ (\\-?[0-9\\.\\,]+)",
+    "ibo": "Azịza ya bụ (\\-?[0-9\\.\\,]+)",
+    "fra": "La réponse est(\\-?[0-9\\.\\,]+)",
+    "sna": "Mhinduro kumubvunzo ndi (\\-?[0-9\\.\\,]+)",
+    "lin": "Eyano ezali (\\-?[0-9\\.\\,]+)",
+    "wol": "Tontu li (\\-?[0-9\\.\\,]+)",
+    "ewe": "ŋuɖoɖoae nye (\\-?[0-9\\.\\,]+)",
+    "lug": "Ansa eri (\\-?[0-9\\.\\,]+)",
+    "xho": "Impendulo ngu (\\-?[0-9\\.\\,]+)",
+    "kin": "Igisubizo ni (\\-?[0-9\\.\\,]+)",
+    "twi": "Ne nnyiano yɛ (\\-?[0-9\\.\\,]+)",
+    "zul": "Impendulo ithi (\\-?[0-9\\.\\,]+)",
+    "orm": "Deebiin isaa (\\-?[0-9\\.\\,]+)",
+    "yor": "Ìdáhùn náà ni (\\-?[0-9\\.\\,]+)",
+    "hau": "Amsar ita ce (\\-?[0-9\\.\\,]+)",
+    "sot": "Karabo ke (\\-?[0-9\\.\\,]+)",
+    "swa": "Jibu ni (\\-?[0-9\\.\\,]+)",
+}
 LANGUAGES = {}
 for lang in languages:
-    if lang == 'amh':
+    if lang == "amh":
        LANGUAGES[lang] = {  # Amharic QUESTION/ANSWER prompts; DIRECT stays English
            "QUESTION": "ጥያቄ:",
            "ANSWER": "በቅደም ተከተል መልስ:",
            "DIRECT": "Answer:",
-            "REGEX": languages_REGEX[lang]}
+            "REGEX": languages_REGEX[lang],
-    elif lang == 'yor':
+        }
+    elif lang == "yor":
        LANGUAGES[lang] = {  # Yoruba QUESTION/ANSWER prompts; DIRECT stays English
            "QUESTION": "Ìbéèrè:",
            "ANSWER": "Ìdáhùn lẹ́sẹsẹ:",
            "DIRECT": "Answer:",
-            "REGEX": languages_REGEX[lang]}
+            "REGEX": languages_REGEX[lang],
+        }
    else:
        LANGUAGES[lang] = {  # English
            "QUESTION": "Question:",
            "ANSWER": "Step-by-Step Answer:",
            "DIRECT": "Answer:",
-            "REGEX": languages_REGEX[lang]}
+            "REGEX": languages_REGEX[lang],
+        }
 def add_regex_pattern(regex_pattern):
@@ -93,13 +116,12 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
    err = []
    for lang in LANGUAGES.keys():
        try:
            yaml_template = "cot_yaml"
            filter_list = {}
            DELIMITER = None
            if mode == "direct":
-                ANSWER = LANGUAGES['eng']["DIRECT"]
+                ANSWER = LANGUAGES["eng"]["DIRECT"]
-                QUESTION = LANGUAGES['eng']["QUESTION"]
+                QUESTION = LANGUAGES["eng"]["QUESTION"]
                REGEX = None
                task_name = f"afrimgsm_direct_{lang}"
                yaml_template = "direct_yaml"
@@ -122,8 +144,8 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                QUESTION = LANGUAGES["eng"]["QUESTION"]
                task_name = f"afrimgsm_en_cot_{lang}"
            elif mode == "translate-direct":
-                ANSWER = LANGUAGES['eng']["DIRECT"]
+                ANSWER = LANGUAGES["eng"]["DIRECT"]
-                QUESTION = LANGUAGES['eng']["QUESTION"]
+                QUESTION = LANGUAGES["eng"]["QUESTION"]
                REGEX = None
                task_name = f"afrimgsm_translate_direct_{lang}"
                yaml_template = "translate_direct_yaml"
@@ -131,7 +153,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
            file_name = f"{task_name}.yaml"
            ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"]) + 1
            with open(
-                    f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
@@ -140,15 +162,15 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                        "dataset_name": lang,
                        "task": f"{task_name}",
                        "doc_to_text": f"""{{% if answer is not none %}}"""
-                                       f"""{{{{question+"\\n{ANSWER}"}}}}"""
+                        f"""{{{{question+"\\n{ANSWER}"}}}}"""
-                                       f"""{{% else %}}"""
+                        f"""{{% else %}}"""
-                                       f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
+                        f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
-                                       f"""{{% endif %}}""",
+                        f"""{{% endif %}}""",
                        "doc_to_target": f"""{{% if answer is not none %}}"""
-                                         f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
+                        f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
-                                         f"""{{% else %}}"""
+                        f"""{{% else %}}"""
-                                         f"""{{{{answer_number|string}}}}"""
+                        f"""{{{{answer_number|string}}}}"""
-                                         f"""{{% endif %}}""",
+                        f"""{{% endif %}}""",
                        **filter_list,
                        "generation_kwargs": {
                            "until": [QUESTION, "</s>", "<|im_end|>"],
@@ -194,4 +216,4 @@ def main() -> None:
 if __name__ == "__main__":
    main()
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/README.md
+++ b/lm_eval/tasks/afrimmlu/README.md
@@ -5,8 +5,8 @@
 IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
 https://arxiv.org/pdf/2406.03368
-IrokoBench is a human-translated benchmark dataset for 16 typologically diverse 
+IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
-low-resource African languages covering three tasks: natural language inference (AfriXNLI), 
+low-resource African languages covering three tasks: natural language inference (AfriXNLI),
 mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).
@@ -14,13 +14,13 @@ mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU
 ```
 @misc{adelani2024irokobenchnewbenchmarkafrican,
-      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models}, 
+      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
      author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
      year={2024},
      eprint={2406.03368},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2406.03368}, 
+      url={https://arxiv.org/abs/2406.03368},
 }
 ```

--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml
@@ -9,18 +9,18 @@ output_type: multiple_choice
 validation_split: validation
 test_split: test
 fewshot_split: validation
-doc_to_text: !function utils.doc_to_text 
+doc_to_text: !function utils.doc_to_text
 doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
 doc_to_choice: !function utils.doc_to_choice
 should_decontaminate: true
 doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
 metric_list:
-  - metric: f1 
+  - metric: f1
-    aggregation: !function utils.weighted_f1_score 
+    aggregation: !function utils.weighted_f1_score
    # aggregation: mean
-    average: weighted 
+    average: weighted
-    hf_evaluate: true 
+    hf_evaluate: true
-    higher_is_better: True 
+    higher_is_better: True
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:

--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml
 dataset_name: eng
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_eng
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml
 dataset_name: ewe
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_ewe
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml
 dataset_name: fra
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_fra
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml
 dataset_name: hau
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_hau
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml
 dataset_name: ibo
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_ibo
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml
 dataset_name: kin
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_kin
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml
 dataset_name: lin
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_lin
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml
 dataset_name: lug
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_lug
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml
 dataset_name: orm
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_orm
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml
 dataset_name: sna
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_sna
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml
 dataset_name: sot
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_sot
\ No newline at end of file
--- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml
+++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml
 dataset_name: swa
 include: afrimmlu_common_yaml
 task: afrimmlu_direct_swa
\ No newline at end of file