Commit bd028848 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into metrics

# Conflicts:
#	tests/test_tasks.py
parents 6e48110e 56def33d
......@@ -12,9 +12,7 @@ fewshot_config:
sampler: first_n
samples:
- input: 'Sort the following words alphabetically: List: oven costume counterpart'
target: 'Let''s think step by step.
The first letter: "oven": "o" (15). "costume": "c" (3). "counterpart": "c" (3).
target: 'The first letter: "oven": "o" (15). "costume": "c" (3). "counterpart": "c" (3).
We now have: (3) ["costume" ? "counterpart"] < (15) "oven". Now let''s sort
this subpart ["costume" ? "counterpart"] by looking at their second letters.
......@@ -27,9 +25,7 @@ fewshot_config:
< "oven". So the answer is costume counterpart oven.'
- input: 'Sort the following words alphabetically: List: hypochlorite ponderosa
phone credulity'
target: 'Let''s think step by step.
The first letter: "hypochlorite": "h" (8). "ponderosa": "p" (16). "phone": "p"
target: 'The first letter: "hypochlorite": "h" (8). "ponderosa": "p" (16). "phone": "p"
(16). "credulity": "c" (3). We now have: (3) "credulity" < (8) "hypochlorite"
< (16) ["ponderosa" ? "phone"]. Now let''s sort this subpart ["ponderosa" ?
"phone"] by looking at their second letters.
......@@ -39,9 +35,7 @@ fewshot_config:
<"ponderosa"]. So the answer is credulity hypochlorite phone ponderosa.'
- input: 'Sort the following words alphabetically: List: newt arson parthia seismography
mugho aspect census'
target: 'Let''s think step by step.
The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography":
target: 'The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography":
"s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have:
(1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16)
"parthia" < (19) "seismography". Now let''s sort this subpart ["arson" ? "aspect"]
......
# EgyHellaSwag
### Paper
Title: NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities
Abstract: [https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383)
**EgyHellaSwag** is a benchmark for evaluating commonsense reasoning in Egyptian Arabic. It is a translated version of the HellaSwag benchmark, consisting of multiple-choice sentence completion tasks. The dataset was translated using the `google/gemma-3-27b-it` model.
Homepage: [https://huggingface.co/datasets/UBC-NLP/EgyHellaSwag](https://huggingface.co/datasets/UBC-NLP/EgyHellaSwag)
### Citation
```
@article{mekki2025nilechatlinguisticallydiverseculturally,
title={NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities},
author={Abdellah El Mekki and Houdaifa Atou and Omer Nacar and Shady Shehata and Muhammad Abdul-Mageed},
year={2025},
eprint={2505.18383},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2505.18383},
}
```
### Groups and Tasks
#### Groups
* Not part of a group.
#### Tags
* `egyhellaswag`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag:
- multiple_choice
task: egyhellaswag
dataset_path: UBC-NLP/EgyHellaSwag
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "choices"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
import datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Attach HellaSwag-style evaluation fields to every example.

    Each document gains:
      * ``query``   — the activity label prepended to the context,
      * ``choices`` — the candidate endings, and
      * ``gold``    — the correct ending index as an ``int``.
    """

    def _process_doc(doc):
        # Prefix the context with its activity label so the prompt reads
        # "<label>: <context>".
        return {
            "query": doc["activity_label"] + ": " + doc["ctx"],
            "choices": doc["endings"],
            "gold": int(doc["label"]),
        }

    return dataset.map(_process_doc)
# EgyMMLU
### Paper
Title: NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities
Abstract: [https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383)
EgyMMLU is a benchmark designed to evaluate the performance of large language models in Egyptian Arabic. It contains 22,027 multiple-choice questions covering 44 subjects, translated from parts of the Massive Multitask Language Understanding (MMLU) and ArabicMMLU benchmarks. The dataset was translated using `google/gemma-3-27b-it`.
Homepage: [https://huggingface.co/datasets/UBC-NLP/EgyMMLU](https://huggingface.co/datasets/UBC-NLP/EgyMMLU)
### Citation
```
@article{mekki2025nilechatlinguisticallydiverseculturally,
title={NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities},
author={Abdellah El Mekki and Houdaifa Atou and Omer Nacar and Shady Shehata and Muhammad Abdul-Mageed},
year={2025},
eprint={2505.18383},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2505.18383},
}
```
### Groups and Tasks
#### Groups
* `egymmlu`: evaluates all EgyMMLU tasks.
#### Tags
Source-based tags:
* `egymmlu_mmlu`: evaluates EgyMMLU tasks that were translated from MMLU.
* `egymmlu_ar_mmlu`: evaluates EgyMMLU tasks that were translated from ArabicMMLU.
Category-based tags:
* `egymmlu_stem`: evaluates EgyMMLU STEM tasks.
* `egymmlu_social_sciences`: evaluates EgyMMLU social sciences tasks.
* `egymmlu_humanities`: evaluates EgyMMLU humanities tasks.
* `egymmlu_language`: evaluates EgyMMLU language tasks.
* `egymmlu_other`: evaluates other EgyMMLU tasks.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: UBC-NLP/EgyMMLU
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
group: egymmlu
group_alias: EgyMMLU
task:
- egymmlu_mmlu
- egymmlu_ar_mmlu
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: egymmlu_ar_mmlu
group_alias: ArabicMMLU
task:
- egymmlu_ar_mmlu_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: egymmlu_mmlu
group_alias: MMLU
task:
- egymmlu_mmlu_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
# Module-level logger shared with the rest of the lm-eval harness.
eval_logger = logging.getLogger("lm-eval")

# EgyMMLU subjects translated from the original (English) MMLU benchmark,
# mapped to their broad category.  The category drives the
# `egymmlu_<category>_tasks` tag attached to each generated task YAML.
MMLU_SUBJECTS = {
    "global_facts": "other",
    "high_school_european_history": "humanities",
    "high_school_geography": "social_sciences",
    "high_school_government_and_politics": "social_sciences",
    "high_school_psychology": "social_sciences",
    "high_school_statistics": "stem",
    "high_school_world_history": "humanities",
    "human_aging": "other",
    "international_law": "humanities",
    "jurisprudence": "humanities",
    "logical_fallacies": "humanities",
    "management": "other",
    "marketing": "other",
    "moral_disputes": "humanities",
    "moral_scenarios": "humanities",
    "nutrition": "other",
    "philosophy": "humanities",
    "professional_law": "humanities",
    "professional_psychology": "social_sciences",
    "public_relations": "social_sciences",
    "security_studies": "social_sciences",
    "sociology": "social_sciences",
    "world_religions": "humanities",
}

# EgyMMLU subjects translated from ArabicMMLU, mapped to their category.
# Some keys carry an `_ar` suffix (e.g. "management_ar") to avoid clashing
# with same-named MMLU subjects above.
ARABIC_MMLU_SUBJECTS = {
    "islamic_studies": "humanities",
    "driving_test": "other",
    "natural_science": "stem",
    "history": "humanities",
    "general_knowledge": "other",
    "law": "humanities",
    "physics": "stem",
    "social_science": "social_sciences",
    "management_ar": "other",
    "arabic_language": "language",
    "political_science": "social_sciences",
    "philosophy_ar": "humanities",
    "accounting": "social_sciences",
    "computer_science": "stem",
    "geography": "social_sciences",
    "math": "stem",
    "biology": "stem",
    "economics": "social_sciences",
    "arabic_language_(general)": "language",
    "arabic_language_(grammar)": "language",
    "civics": "social_sciences",
}

# Source-benchmark name -> subject table.  The key becomes part of the
# `egymmlu_<source>_tasks` tag on each generated YAML.
DATASETS = {
    "mmlu": MMLU_SUBJECTS,
    "ar_mmlu": ARABIC_MMLU_SUBJECTS,
}
def parse_args():
    """Return the parsed command-line options for the YAML generator.

    Options:
        --base_yaml_path: template YAML each generated task file includes.
        --save_prefix_path: prefix for the per-subject output file names.
    """
    ap = argparse.ArgumentParser()
    for flag, default in (
        ("--base_yaml_path", "_default_egymmlu_template_yaml"),
        ("--save_prefix_path", "egymmlu"),
    ):
        ap.add_argument(flag, default=default)
    return ap.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Only the template's file name is embedded in the generated YAMLs
    # (they are written into the same directory as the template).
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    ALL_CATEGORIES = []
    for dataset, SUBJECTS in DATASETS.items():
        for subject, category in tqdm(SUBJECTS.items()):
            # Record each distinct category in first-seen order; used below
            # to build the per-category group names.
            if category not in ALL_CATEGORIES:
                ALL_CATEGORIES.append(category)
            # One task config per subject: includes the shared template and
            # is tagged by both its category and its source benchmark.
            yaml_dict = {
                "include": base_yaml_name,
                "tag": [
                    f"egymmlu_{category}_tasks",
                    "egymmlu_" + dataset + "_tasks",
                ],
                "task": f"egymmlu_{subject}",
                "task_alias": subject.replace("_", " "),
                "dataset_name": subject,
            }
            file_save_path = args.save_prefix_path + f"_{subject}.yaml"
            eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
            with open(file_save_path, "w", encoding="utf-8") as yaml_file:
                # default_style='"' double-quotes every scalar, so subject
                # names containing parentheses stay valid YAML.
                yaml.dump(
                    yaml_dict,
                    yaml_file,
                    allow_unicode=True,
                    default_style='"',
                )
    egymmlu_subcategories = [f"egymmlu_{category}" for category in ALL_CATEGORIES]
    file_save_path = args.save_prefix_path + ".yaml"
    eval_logger.info(f"Saving benchmark config to {file_save_path}")
    # NOTE(review): the script appears truncated here — the benchmark config
    # path is computed and logged, but no code in view writes that file or
    # uses `egymmlu_subcategories`. Confirm against the full source file.
"dataset_name": "accounting"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_social_sciences_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_accounting"
"task_alias": "accounting"
"dataset_name": "arabic_language"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_language_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_arabic_language"
"task_alias": "arabic language"
"dataset_name": "arabic_language_(general)"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_language_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_arabic_language_(general)"
"task_alias": "arabic language (general)"
"dataset_name": "arabic_language_(grammar)"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_language_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_arabic_language_(grammar)"
"task_alias": "arabic language (grammar)"
"dataset_name": "biology"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_stem_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_biology"
"task_alias": "biology"
"dataset_name": "civics"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_social_sciences_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_civics"
"task_alias": "civics"
"dataset_name": "computer_science"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_stem_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_computer_science"
"task_alias": "computer science"
"dataset_name": "driving_test"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_other_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_driving_test"
"task_alias": "driving test"
"dataset_name": "economics"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_social_sciences_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_economics"
"task_alias": "economics"
"dataset_name": "general_knowledge"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_other_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_general_knowledge"
"task_alias": "general knowledge"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment