Commit 88486e57 authored by lintangsutawika
Browse files

Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
# Group config for the "other" category: collects the tasks tagged
# `arabicmmlu_other_tasks` and reports an aggregated `acc` for them.
group: arabicmmlu_other
group_alias: other
task:
  - arabicmmlu_other_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to the `acc` entry, not to the top level.
  - metric: acc
    weight_by_size: True
metadata:
  version: 0
# Group config for the "social_science" category: collects the tasks tagged
# `arabicmmlu_social_science_tasks` and reports an aggregated `acc` for them.
group: arabicmmlu_social_science
group_alias: social_science
task:
  - arabicmmlu_social_science_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to the `acc` entry, not to the top level.
  - metric: acc
    weight_by_size: True
metadata:
  version: 0
# Group config for the "stem" category: collects the tasks tagged
# `arabicmmlu_stem_tasks` and reports an aggregated `acc` for them.
group: arabicmmlu_stem
group_alias: stem
task:
  - arabicmmlu_stem_tasks
aggregate_metric_list:
  # `weight_by_size` belongs to the `acc` entry, not to the top level.
  - metric: acc
    weight_by_size: True
metadata:
  version: 0
# Shared task template (presumably the `_default_template_yaml` that the
# per-subject configs pull in via `include`). Each subject config adds its own
# `task`, `tag`, `task_alias` and `dataset_name` on top of these settings.
dataset_path: yazeed7/ArabicMMLU
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
# `!function` is a custom tag resolved by the evaluation harness; a plain
# YAML loader without a registered constructor cannot load these two values.
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
doc_to_target: "Answer Key"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
"""
Take in a base YAML template and write one task YAML per ArabicMMLU subject that includes it, plus a top-level group YAML listing the category groups.
"""
import argparse
import logging
import os
import yaml
from tqdm import tqdm
# Logger obtained under the "lm-eval" name; used for the progress messages
# emitted while the YAML files are written.
eval_logger = logging.getLogger("lm-eval")
# Maps each subject name to its category. The subject string is used verbatim
# as `dataset_name` and `task_alias` in the generated YAML and, lower-cased
# with spaces turned into underscores, as the task name. The category becomes
# part of the `tag` and of the group names listed in the top-level config.
# Insertion order matters: subjects are processed in this order, and the
# category list is built in first-seen order from it.
SUBJECTS = {
"Driving Test": "other",
"High Geography": "social_science",
"High History": "humanities",
"Islamic Studies": "humanities",
"Univ Accounting": "social_science",
"Primary General Knowledge": "other",
"Univ Political Science": "social_science",
"Primary Math": "stem",
"Middle General Knowledge": "other",
"High Biology": "stem",
"Primary Natural Science": "stem",
"High Economics": "social_science",
"Middle Natural Science": "stem",
"Middle Geography": "social_science",
"Primary Social Science": "social_science",
"Middle Computer Science": "stem",
"Middle Islamic Studies": "humanities",
"Primary Computer Science": "stem",
"High Physics": "stem",
"Middle Social Science": "social_science",
"Middle Civics": "social_science",
"High Computer Science": "stem",
"General Knowledge": "other",
"High Civics": "social_science",
"Prof Law": "humanities",
"High Islamic Studies": "humanities",
"Primary Arabic Language": "language",
"High Arabic Language": "language",
"Arabic Language (Grammar)": "language",
"Primary History": "humanities",
"Middle History": "humanities",
"Univ Economics": "social_science",
"Arabic Language (General)": "language",
"Univ Computer Science": "stem",
"Primary Islamic Studies": "humanities",
"Primary Geography": "social_science",
"High Philosophy": "humanities",
"Middle Arabic Language": "language",
"Middle Economics": "social_science",
"Univ Management": "other",
}
def parse_args(argv=None):
    """Parse the command-line options of the config generator.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) keeps the original
            behaviour of reading ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with two string attributes:
        ``base_yaml_path`` (default ``"_default_template_yaml"``) and
        ``save_prefix_path`` (default ``"arabicmmlu"``).
    """
    parser = argparse.ArgumentParser(
        description="Generate per-subject task YAMLs and a top-level group YAML.",
    )
    parser.add_argument(
        "--base_yaml_path",
        default="_default_template_yaml",
        help="Path to the base YAML template that each generated YAML includes.",
    )
    parser.add_argument(
        "--save_prefix_path",
        default="arabicmmlu",
        help="Path prefix for the generated YAML files.",
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: write one task YAML per subject in SUBJECTS, then a
    # top-level group YAML that lists the per-category groups.
    args = parse_args()

    # Only the template's file name is needed, so that each generated YAML
    # can `include` it by name.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]

    # Open the template only to fail fast when it is missing or unreadable.
    # It is deliberately not parsed: its contents were never used, and a
    # template carrying the custom `!function` tag makes `yaml.full_load`
    # raise a ConstructorError because no constructor is registered here.
    with open(args.base_yaml_path, encoding="utf-8"):
        pass

    # Categories in first-seen order (dicts preserve insertion order).
    ALL_CATEGORIES = list(dict.fromkeys(SUBJECTS.values()))

    for subject, category in tqdm(SUBJECTS.items()):
        task_slug = subject.lower().replace(" ", "_")
        yaml_dict = {
            "include": base_yaml_name,
            # Must match the `arabicmmlu_<category>_tasks` entry that the
            # category group configs list under `task`.
            "tag": f"arabicmmlu_{category}_tasks",
            "task": f"arabicmmlu_{task_slug}",
            "task_alias": subject,
            "dataset_name": subject,
        }

        # Parentheses stay in the task name but are dropped from file names.
        file_slug = task_slug.replace("(", "").replace(")", "")
        file_save_path = f"{args.save_prefix_path}_{file_slug}.yaml"

        eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                allow_unicode=True,
                # Force double quotes around every key and value.
                default_style='"',
            )

    # The top-level group refers to the category *groups* (no `_tasks`
    # suffix), which in turn collect the tagged subject tasks.
    arabicmmlu_subcategories = [f"arabicmmlu_{category}" for category in ALL_CATEGORIES]

    file_save_path = args.save_prefix_path + ".yaml"
    eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w", encoding="utf-8") as yaml_file:
        yaml.dump(
            {
                "group": "arabicmmlu",
                "task": arabicmmlu_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )
# Per-subject task configs, one five-key mapping per file. Each inherits the
# shared settings from `_default_template_yaml` through `include`. The file
# names in the markers below follow the generator script's naming rule with
# its default prefix and are not stated in the configs themselves.

# -- arabicmmlu_arabic_language_general.yaml --
"dataset_name": "Arabic Language (General)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_arabic_language_(general)"
"task_alias": "Arabic Language (General)"
# -- arabicmmlu_arabic_language_grammar.yaml --
"dataset_name": "Arabic Language (Grammar)"
"tag": "arabicmmlu_language_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_arabic_language_(grammar)"
"task_alias": "Arabic Language (Grammar)"
# -- arabicmmlu_driving_test.yaml --
"dataset_name": "Driving Test"
"tag": "arabicmmlu_other_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_driving_test"
"task_alias": "Driving Test"
# -- arabicmmlu_general_knowledge.yaml --
"dataset_name": "General Knowledge"
"tag": "arabicmmlu_other_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_general_knowledge"
"task_alias": "General Knowledge"
# -- arabicmmlu_high_arabic_language.yaml --
"dataset_name": "High Arabic Language"
"tag": "arabicmmlu_language_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_arabic_language"
"task_alias": "High Arabic Language"
# -- arabicmmlu_high_biology.yaml --
"dataset_name": "High Biology"
"tag": "arabicmmlu_stem_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_biology"
"task_alias": "High Biology"
# -- arabicmmlu_high_civics.yaml --
"dataset_name": "High Civics"
"tag": "arabicmmlu_social_science_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_civics"
"task_alias": "High Civics"
# -- arabicmmlu_high_computer_science.yaml --
"dataset_name": "High Computer Science"
"tag": "arabicmmlu_stem_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_computer_science"
"task_alias": "High Computer Science"
# -- arabicmmlu_high_economics.yaml --
"dataset_name": "High Economics"
"tag": "arabicmmlu_social_science_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_economics"
"task_alias": "High Economics"
# -- arabicmmlu_high_geography.yaml --
"dataset_name": "High Geography"
"tag": "arabicmmlu_social_science_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_geography"
"task_alias": "High Geography"
# -- arabicmmlu_high_history.yaml --
"dataset_name": "High History"
"tag": "arabicmmlu_humanities_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_history"
"task_alias": "High History"
# -- arabicmmlu_high_islamic_studies.yaml --
"dataset_name": "High Islamic Studies"
"tag": "arabicmmlu_humanities_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_islamic_studies"
"task_alias": "High Islamic Studies"
# -- arabicmmlu_high_philosophy.yaml --
"dataset_name": "High Philosophy"
"tag": "arabicmmlu_humanities_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_philosophy"
"task_alias": "High Philosophy"
# -- arabicmmlu_high_physics.yaml --
"dataset_name": "High Physics"
"tag": "arabicmmlu_stem_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_high_physics"
"task_alias": "High Physics"
# -- arabicmmlu_islamic_studies.yaml --
"dataset_name": "Islamic Studies"
"tag": "arabicmmlu_humanities_tasks"
"include": "_default_template_yaml"
"task": "arabicmmlu_islamic_studies"
"task_alias": "Islamic Studies"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment