Commit c1e63555 authored by Yu Shi Jie
Browse files

Merge branch 'upstream' into 'mmlu-pro'

add tokenizer logs info (#1731)

See merge request shijie.yu/lm-evaluation-harness!4
parents e361687c 42dc2448
include: aqua-rat.yaml include: aqua-rat.yaml
group:
- agieval
- agieval_nous
- agieval_en
task: agieval_lsat_ar task: agieval_lsat_ar
dataset_path: hails/agieval-lsat-ar dataset_path: hails/agieval-lsat-ar
include: aqua-rat.yaml include: aqua-rat.yaml
group:
- agieval
- agieval_nous
- agieval_en
task: agieval_lsat_lr task: agieval_lsat_lr
dataset_path: hails/agieval-lsat-lr dataset_path: hails/agieval-lsat-lr
include: aqua-rat.yaml include: aqua-rat.yaml
group:
- agieval
- agieval_nous
- agieval_en
task: agieval_lsat_rc task: agieval_lsat_rc
dataset_path: hails/agieval-lsat-rc dataset_path: hails/agieval-lsat-rc
group:
- agieval
- agieval_en
task: agieval_math task: agieval_math
dataset_path: hails/agieval-math dataset_path: hails/agieval-math
dataset_name: null dataset_name: null
......
include: aqua-rat.yaml include: aqua-rat.yaml
group:
- agieval
- agieval_nous
- agieval_en
task: agieval_sat_en_without_passage task: agieval_sat_en_without_passage
dataset_path: hails/agieval-sat-en-without-passage dataset_path: hails/agieval-sat-en-without-passage
include: aqua-rat.yaml include: aqua-rat.yaml
group:
- agieval
- agieval_nous
- agieval_en
task: agieval_sat_en task: agieval_sat_en
dataset_path: hails/agieval-sat-en dataset_path: hails/agieval-sat-en
include: aqua-rat.yaml include: aqua-rat.yaml
group:
- agieval
- agieval_nous
- agieval_en
task: agieval_sat_math task: agieval_sat_math
dataset_path: hails/agieval-sat-math dataset_path: hails/agieval-sat-math
group: tag:
- anli - anli
task: anli_r1 task: anli_r1
dataset_path: anli dataset_path: anli
......
...@@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU ...@@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU
``` ```
@misc{koto2024arabicmmlu, @misc{koto2024arabicmmlu,
title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic}, title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic},
author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin}, author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin},
year={2024}, year={2024},
eprint={2402.12840}, eprint={2402.12840},
...@@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU ...@@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU
* `arabicmmlu_social_science`: evaluates social science ArabicMMLU tasks. * `arabicmmlu_social_science`: evaluates social science ArabicMMLU tasks.
* `arabicmmlu_humanities`: evaluates humanities ArabicMMLU tasks. * `arabicmmlu_humanities`: evaluates humanities ArabicMMLU tasks.
* `arabicmmlu_language`: evaluates Arabic language ArabicMMLU tasks. * `arabicmmlu_language`: evaluates Arabic language ArabicMMLU tasks.
* `arabicmmlu_other`: evaluates other ArabicMMLU tasks. * `arabicmmlu_other`: evaluates other ArabicMMLU tasks.
\ No newline at end of file
...@@ -5,3 +5,8 @@ task: ...@@ -5,3 +5,8 @@ task:
- arabicmmlu_humanities - arabicmmlu_humanities
- arabicmmlu_stem - arabicmmlu_stem
- arabicmmlu_language - arabicmmlu_language
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: arabicmmlu_humanities
group_alias: Humanities
task:
- arabicmmlu_humanities_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: arabicmmlu_language
group_alias: Language
task:
- arabicmmlu_language_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: arabicmmlu_other
group_alias: Other
task:
- arabicmmlu_other_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: arabicmmlu_social_science
group_alias: Social Science
task:
- arabicmmlu_social_science_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
group: arabicmmlu_stem
group_alias: STEM
task:
- arabicmmlu_stem_tasks
aggregate_metric_list:
- metric: acc
weight_by_size: True
metadata:
version: 0
...@@ -11,3 +11,5 @@ metric_list: ...@@ -11,3 +11,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
version: 0.0
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import logging import logging
import os import os
...@@ -58,7 +59,7 @@ SUBJECTS = { ...@@ -58,7 +59,7 @@ SUBJECTS = {
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", default="_default_template_yaml") parser.add_argument("--base_yaml_path", default="_default_arabicmmlu_template_yaml")
parser.add_argument("--save_prefix_path", default="arabicmmlu") parser.add_argument("--save_prefix_path", default="arabicmmlu")
return parser.parse_args() return parser.parse_args()
...@@ -76,20 +77,21 @@ if __name__ == "__main__": ...@@ -76,20 +77,21 @@ if __name__ == "__main__":
if category not in ALL_CATEGORIES: if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category) ALL_CATEGORIES.append(category)
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = { yaml_dict = {
"include": base_yaml_name, "include": base_yaml_name,
"group": f"arabicmmlu_{category}", "tag": f"arabicmmlu_{category}",
"group_alias": category.replace("_", " "),
"task": f"arabicmmlu_{subject.lower().replace(' ', '_')}", "task": f"arabicmmlu_{subject.lower().replace(' ', '_')}",
"task_alias": subject, "task_alias": subject,
"dataset_name": subject, "dataset_name": subject,
# "description": description, # "description": description,
} }
file_save_path = args.save_prefix_path + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" file_save_path = (
args.save_prefix_path
+ f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml"
)
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file: with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump( yaml.dump(
......
"dataset_name": "Arabic Language (General)" "dataset_name": "Arabic Language (General)"
"group": "arabicmmlu_language" "tag": "arabicmmlu_language_tasks"
"group_alias": "language" "include": "_default_arabicmmlu_template_yaml"
"include": "_default_template_yaml"
"task": "arabicmmlu_arabic_language_(general)" "task": "arabicmmlu_arabic_language_(general)"
"task_alias": "Arabic Language (General)" "task_alias": "Arabic Language (General)"
"dataset_name": "Arabic Language (Grammar)" "dataset_name": "Arabic Language (Grammar)"
"group": "arabicmmlu_language" "tag": "arabicmmlu_language_tasks"
"group_alias": "language" "include": "_default_arabicmmlu_template_yaml"
"include": "_default_template_yaml"
"task": "arabicmmlu_arabic_language_(grammar)" "task": "arabicmmlu_arabic_language_(grammar)"
"task_alias": "Arabic Language (Grammar)" "task_alias": "Arabic Language (Grammar)"
"dataset_name": "Driving Test" "dataset_name": "Driving Test"
"group": "arabicmmlu_other" "tag": "arabicmmlu_other_tasks"
"group_alias": "other" "include": "_default_arabicmmlu_template_yaml"
"include": "_default_template_yaml"
"task": "arabicmmlu_driving_test" "task": "arabicmmlu_driving_test"
"task_alias": "Driving Test" "task_alias": "Driving Test"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment