Commit bd028848 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into metrics

# Conflicts:
#	tests/test_tasks.py
parents 6e48110e 56def33d
......@@ -12,9 +12,7 @@ fewshot_config:
sampler: first_n
samples:
- input: 'Sort the following words alphabetically: List: oven costume counterpart'
target: 'Let''s think step by step.
The first letter: "oven": "o" (15). "costume": "c" (3). "counterpart": "c" (3).
target: 'The first letter: "oven": "o" (15). "costume": "c" (3). "counterpart": "c" (3).
We now have: (3) ["costume" ? "counterpart"] < (15) "oven". Now let''s sort
this subpart ["costume" ? "counterpart"] by looking at their second letters.
......@@ -27,9 +25,7 @@ fewshot_config:
< "oven". So the answer is costume counterpart oven.'
- input: 'Sort the following words alphabetically: List: hypochlorite ponderosa
phone credulity'
target: 'Let''s think step by step.
The first letter: "hypochlorite": "h" (8). "ponderosa": "p" (16). "phone": "p"
target: 'The first letter: "hypochlorite": "h" (8). "ponderosa": "p" (16). "phone": "p"
(16). "credulity": "c" (3). We now have: (3) "credulity" < (8) "hypochlorite"
< (16) ["ponderosa" ? "phone"]. Now let''s sort this subpart ["ponderosa" ?
"phone"] by looking at their second letters.
......@@ -39,9 +35,7 @@ fewshot_config:
<"ponderosa"]. So the answer is credulity hypochlorite phone ponderosa.'
- input: 'Sort the following words alphabetically: List: newt arson parthia seismography
mugho aspect census'
target: 'Let''s think step by step.
The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography":
target: 'The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography":
"s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have:
(1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16)
"parthia" < (19) "seismography". Now let''s sort this subpart ["arson" ? "aspect"]
......
# EgyHellaSwag
### Paper
Title: NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities
Abstract: [https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383)
**EgyHellaSwag** is a benchmark for evaluating commonsense reasoning in Egyptian Arabic. It is a translated version of the HellaSwag benchmark, consisting of multiple-choice sentence completion tasks. The dataset was translated using the `google/gemma-3-27b-it` model.
Homepage: [https://huggingface.co/datasets/UBC-NLP/EgyHellaSwag](https://huggingface.co/datasets/UBC-NLP/EgyHellaSwag)
### Citation
```
@article{mekki2025nilechatlinguisticallydiverseculturally,
title={NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities},
author={Abdellah El Mekki and Houdaifa Atou and Omer Nacar and Shady Shehata and Muhammad Abdul-Mageed},
year={2025},
eprint={2505.18383},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2505.18383},
}
```
### Groups and Tasks
#### Groups
* Not part of a group.
#### Tags
* `egyhellaswag`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag:
- multiple_choice
task: egyhellaswag
dataset_path: UBC-NLP/EgyHellaSwag
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "choices"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
import datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Attach HellaSwag-style evaluation fields to every example.

    Each document gains:
      * ``query``   — the activity label prepended to the context,
      * ``choices`` — the candidate endings, and
      * ``gold``    — the correct ending index as an ``int``.
    """

    def _process_doc(doc):
        # Prefix the context with its activity label so the prompt reads
        # "<label>: <context>".
        return {
            "query": doc["activity_label"] + ": " + doc["ctx"],
            "choices": doc["endings"],
            "gold": int(doc["label"]),
        }

    return dataset.map(_process_doc)
# EgyMMLU
### Paper
Title: NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities
Abstract: [https://arxiv.org/abs/2505.18383](https://arxiv.org/abs/2505.18383)
EgyMMLU is a benchmark designed to evaluate the performance of large language models in Egyptian Arabic. It contains 22,027 multiple-choice questions covering 44 subjects, translated from parts of the Massive Multitask Language Understanding (MMLU) and ArabicMMLU benchmarks. The dataset was translated using `google/gemma-3-27b-it`.
Homepage: [https://huggingface.co/datasets/UBC-NLP/EgyMMLU](https://huggingface.co/datasets/UBC-NLP/EgyMMLU)
### Citation
```
@article{mekki2025nilechatlinguisticallydiverseculturally,
title={NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities},
author={Abdellah El Mekki and Houdaifa Atou and Omer Nacar and Shady Shehata and Muhammad Abdul-Mageed},
year={2025},
eprint={2505.18383},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2505.18383},
}
```
### Groups and Tasks
#### Groups
* `egymmlu`: evaluates all EgyMMLU tasks.
#### Tags
Source-based tags:
* `egymmlu_mmlu`: evaluates EgyMMLU tasks that were translated from MMLU.
* `egymmlu_ar_mmlu`: evaluates EgyMMLU tasks that were translated from ArabicMMLU.
Category-based tags:
* `egymmlu_stem`: evaluates EgyMMLU STEM tasks.
* `egymmlu_social_sciences`: evaluates EgyMMLU social sciences tasks.
* `egymmlu_humanities`: evaluates EgyMMLU humanities tasks.
* `egymmlu_language`: evaluates EgyMMLU language tasks.
* `egymmlu_other`: evaluates other EgyMMLU tasks.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: UBC-NLP/EgyMMLU
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
group: egymmlu
group_alias: EgyMMLU
task:
- egymmlu_mmlu
- egymmlu_ar_mmlu
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: egymmlu_ar_mmlu
group_alias: ArabicMMLU
task:
- egymmlu_ar_mmlu_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: egymmlu_mmlu
group_alias: MMLU
task:
- egymmlu_mmlu_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
# Module-level logger shared with the rest of the lm-eval harness.
eval_logger = logging.getLogger("lm-eval")

# EgyMMLU subjects translated from the original (English) MMLU benchmark,
# mapped to their broad category.  The category drives the
# `egymmlu_<category>_tasks` tag attached to each generated task YAML.
MMLU_SUBJECTS = {
    "global_facts": "other",
    "high_school_european_history": "humanities",
    "high_school_geography": "social_sciences",
    "high_school_government_and_politics": "social_sciences",
    "high_school_psychology": "social_sciences",
    "high_school_statistics": "stem",
    "high_school_world_history": "humanities",
    "human_aging": "other",
    "international_law": "humanities",
    "jurisprudence": "humanities",
    "logical_fallacies": "humanities",
    "management": "other",
    "marketing": "other",
    "moral_disputes": "humanities",
    "moral_scenarios": "humanities",
    "nutrition": "other",
    "philosophy": "humanities",
    "professional_law": "humanities",
    "professional_psychology": "social_sciences",
    "public_relations": "social_sciences",
    "security_studies": "social_sciences",
    "sociology": "social_sciences",
    "world_religions": "humanities",
}

# EgyMMLU subjects translated from ArabicMMLU, mapped to their category.
# Some keys carry an `_ar` suffix (e.g. "management_ar") to avoid clashing
# with same-named MMLU subjects above.
ARABIC_MMLU_SUBJECTS = {
    "islamic_studies": "humanities",
    "driving_test": "other",
    "natural_science": "stem",
    "history": "humanities",
    "general_knowledge": "other",
    "law": "humanities",
    "physics": "stem",
    "social_science": "social_sciences",
    "management_ar": "other",
    "arabic_language": "language",
    "political_science": "social_sciences",
    "philosophy_ar": "humanities",
    "accounting": "social_sciences",
    "computer_science": "stem",
    "geography": "social_sciences",
    "math": "stem",
    "biology": "stem",
    "economics": "social_sciences",
    "arabic_language_(general)": "language",
    "arabic_language_(grammar)": "language",
    "civics": "social_sciences",
}

# Source-benchmark name -> subject table.  The key becomes part of the
# `egymmlu_<source>_tasks` tag on each generated YAML.
DATASETS = {
    "mmlu": MMLU_SUBJECTS,
    "ar_mmlu": ARABIC_MMLU_SUBJECTS,
}
def parse_args():
    """Return the parsed command-line options for the YAML generator.

    Options:
        --base_yaml_path: template YAML each generated task file includes.
        --save_prefix_path: prefix for the per-subject output file names.
    """
    ap = argparse.ArgumentParser()
    for flag, default in (
        ("--base_yaml_path", "_default_egymmlu_template_yaml"),
        ("--save_prefix_path", "egymmlu"),
    ):
        ap.add_argument(flag, default=default)
    return ap.parse_args()
if __name__ == "__main__":
    args = parse_args()
    # Only the template's file name is embedded in the generated YAMLs
    # (they are written into the same directory as the template).
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    ALL_CATEGORIES = []
    for dataset, SUBJECTS in DATASETS.items():
        for subject, category in tqdm(SUBJECTS.items()):
            # Record each distinct category in first-seen order; used below
            # to build the per-category group names.
            if category not in ALL_CATEGORIES:
                ALL_CATEGORIES.append(category)
            # One task config per subject: includes the shared template and
            # is tagged by both its category and its source benchmark.
            yaml_dict = {
                "include": base_yaml_name,
                "tag": [
                    f"egymmlu_{category}_tasks",
                    "egymmlu_" + dataset + "_tasks",
                ],
                "task": f"egymmlu_{subject}",
                "task_alias": subject.replace("_", " "),
                "dataset_name": subject,
            }
            file_save_path = args.save_prefix_path + f"_{subject}.yaml"
            eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
            with open(file_save_path, "w", encoding="utf-8") as yaml_file:
                # default_style='"' double-quotes every scalar, so subject
                # names containing parentheses stay valid YAML.
                yaml.dump(
                    yaml_dict,
                    yaml_file,
                    allow_unicode=True,
                    default_style='"',
                )
    egymmlu_subcategories = [f"egymmlu_{category}" for category in ALL_CATEGORIES]
    file_save_path = args.save_prefix_path + ".yaml"
    eval_logger.info(f"Saving benchmark config to {file_save_path}")
    # NOTE(review): the script appears truncated here — the benchmark config
    # path is computed and logged, but no code in view writes that file or
    # uses `egymmlu_subcategories`. Confirm against the full source file.
"dataset_name": "accounting"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_social_sciences_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_accounting"
"task_alias": "accounting"
"dataset_name": "arabic_language"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_language_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_arabic_language"
"task_alias": "arabic language"
"dataset_name": "arabic_language_(general)"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_language_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_arabic_language_(general)"
"task_alias": "arabic language (general)"
"dataset_name": "arabic_language_(grammar)"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_language_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_arabic_language_(grammar)"
"task_alias": "arabic language (grammar)"
"dataset_name": "biology"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_stem_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_biology"
"task_alias": "biology"
"dataset_name": "civics"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_social_sciences_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_civics"
"task_alias": "civics"
"dataset_name": "computer_science"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_stem_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_computer_science"
"task_alias": "computer science"
"dataset_name": "driving_test"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_other_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_driving_test"
"task_alias": "driving test"
"dataset_name": "economics"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_social_sciences_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_economics"
"task_alias": "economics"
"dataset_name": "general_knowledge"
"include": "_default_egymmlu_template_yaml"
"tag":
- "egymmlu_other_tasks"
- "egymmlu_ar_mmlu_tasks"
"task": "egymmlu_general_knowledge"
"task_alias": "general knowledge"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment