Commit 88486e57 authored by lintangsutawika
Browse files

Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
# Task "bertaqa_en_mt_latxa-7b-v1": dataset subset en_mt_latxa-7b-v1; includes _bertaqa_template.
# Prompt lists three candidates labelled A/B/C and ends with "Answer:".
task: bertaqa_en_mt_latxa-7b-v1
include: _bertaqa_template
dataset_name: en_mt_latxa-7b-v1
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
# Task "bertaqa_en_mt_llama-2-13b": dataset subset en_mt_llama-2-13b; includes _bertaqa_template.
task: bertaqa_en_mt_llama-2-13b
include: _bertaqa_template
dataset_name: en_mt_llama-2-13b
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
# Task "bertaqa_en_mt_llama-2-70b": dataset subset en_mt_llama-2-70b; includes _bertaqa_template.
task: bertaqa_en_mt_llama-2-70b
include: _bertaqa_template
dataset_name: en_mt_llama-2-70b
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
# Task "bertaqa_en_mt_llama-2-7b": dataset subset en_mt_llama-2-7b; includes _bertaqa_template.
task: bertaqa_en_mt_llama-2-7b
include: _bertaqa_template
dataset_name: en_mt_llama-2-7b
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
# Task "bertaqa_en_mt_madlad": dataset subset en_mt_madlad; includes _bertaqa_template.
task: bertaqa_en_mt_madlad
include: _bertaqa_template
dataset_name: en_mt_madlad
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
# Task "bertaqa_en_mt_nllb": dataset subset en_mt_nllb; includes _bertaqa_template.
task: bertaqa_en_mt_nllb
include: _bertaqa_template
dataset_name: en_mt_nllb
doc_to_text: "Question: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nAnswer:"
# Task "bertaqa_eu": dataset subset eu; includes _bertaqa_template.
# Same A/B/C layout, but the prompt uses "Galdera:"/"Erantzuna:" instead of "Question:"/"Answer:".
task: bertaqa_eu
include: _bertaqa_template
dataset_name: eu
doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nErantzuna:"
...@@ -8,6 +8,7 @@ Requires the installation of ...@@ -8,6 +8,7 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"` `pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided. and is included so that the bigbench dependency can be avoided.
""" """
import bigbench.api.util as bb_utils import bigbench.api.util as bb_utils
import datasets import datasets
from tqdm import tqdm from tqdm import tqdm
......
# Group config: gathers the BLiMP subtasks listed under `task` into the "blimp" group.
group: blimp
task:
  - "blimp_adjunct_island"
  - "blimp_anaphor_gender_agreement"
  - "blimp_anaphor_number_agreement"
  - "blimp_animate_subject_passive"
  - "blimp_animate_subject_trans"
  - "blimp_causative"
  - "blimp_complex_NP_island"
  - "blimp_coordinate_structure_constraint_complex_left_branch"
  - "blimp_coordinate_structure_constraint_object_extraction"
  - "blimp_determiner_noun_agreement_1"
  - "blimp_determiner_noun_agreement_2"
  - "blimp_determiner_noun_agreement_irregular_1"
  - "blimp_determiner_noun_agreement_irregular_2"
  - "blimp_determiner_noun_agreement_with_adj_2"
  - "blimp_determiner_noun_agreement_with_adj_irregular_1"
  - "blimp_determiner_noun_agreement_with_adj_irregular_2"
  - "blimp_determiner_noun_agreement_with_adjective_1"
  - "blimp_distractor_agreement_relational_noun"
  - "blimp_distractor_agreement_relative_clause"
  - "blimp_drop_argument"
  - "blimp_ellipsis_n_bar_1"
  - "blimp_ellipsis_n_bar_2"
  - "blimp_existential_there_object_raising"
  - "blimp_existential_there_quantifiers_1"
  - "blimp_existential_there_quantifiers_2"
  - "blimp_existential_there_subject_raising"
  - "blimp_expletive_it_object_raising"
  - "blimp_inchoative"
  - "blimp_intransitive"
  - "blimp_irregular_past_participle_adjectives"
  - "blimp_irregular_past_participle_verbs"
  - "blimp_irregular_plural_subject_verb_agreement_1"
  - "blimp_irregular_plural_subject_verb_agreement_2"
  - "blimp_left_branch_island_echo_question"
  - "blimp_left_branch_island_simple_question"
  - "blimp_matrix_question_npi_licensor_present"
  - "blimp_npi_present_1"
  - "blimp_npi_present_2"
  - "blimp_only_npi_licensor_present"
  - "blimp_only_npi_scope"
  - "blimp_passive_1"
  - "blimp_passive_2"
  - "blimp_principle_A_c_command"
  - "blimp_principle_A_case_1"
  - "blimp_principle_A_case_2"
  - "blimp_principle_A_domain_1"
  - "blimp_principle_A_domain_2"
  - "blimp_principle_A_domain_3"
  - "blimp_principle_A_reconstruction"
  - "blimp_regular_plural_subject_verb_agreement_1"
  - "blimp_regular_plural_subject_verb_agreement_2"
  - "blimp_sentential_negation_npi_licensor_present"
  - "blimp_sentential_negation_npi_scope"
  - "blimp_sentential_subject_island"
  - "blimp_superlative_quantifiers_1"
  - "blimp_superlative_quantifiers_2"
  - "blimp_tough_vs_raising_1"
  - "blimp_tough_vs_raising_2"
  - "blimp_transitive"
  - "blimp_wh_island"
  - "blimp_wh_questions_object_gap"
  - "blimp_wh_questions_subject_gap"
  - "blimp_wh_questions_subject_gap_long_distance"
  - "blimp_wh_vs_that_no_gap"
  - "blimp_wh_vs_that_no_gap_long_distance"
  - "blimp_wh_vs_that_with_gap"
  - "blimp_wh_vs_that_with_gap_long_distance"
# Group-level score: mean of the subtasks' `acc`, with size weighting turned off.
# The keys below must stay nested inside the list entry; at column 0 they
# would be read as unrelated top-level keys.
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: False
metadata:
  version: 2.0
group: blimp
dataset_path: blimp dataset_path: blimp
output_type: multiple_choice output_type: multiple_choice
validation_split: train validation_split: train
......
# Group config for "ceval-valid": reports the mean of `acc` and `acc_norm`
# over the subtasks listed under `task`, with size weighting enabled.
# Each metric entry's keys must stay nested under its "- " item; flattened to
# column 0 the second entry is no longer valid YAML.
aggregate_metric_list:
  - aggregation: mean
    metric: acc
    weight_by_size: true
  - aggregation: mean
    metric: acc_norm
    weight_by_size: true
group: ceval-valid
metadata:
  version: 1.0
task:
  - ceval-valid_computer_network
  - ceval-valid_operating_system
  - ceval-valid_computer_architecture
  - ceval-valid_college_programming
  - ceval-valid_college_physics
  - ceval-valid_college_chemistry
  - ceval-valid_advanced_mathematics
  - ceval-valid_probability_and_statistics
  - ceval-valid_discrete_mathematics
  - ceval-valid_electrical_engineer
  - ceval-valid_metrology_engineer
  - ceval-valid_high_school_mathematics
  - ceval-valid_high_school_physics
  - ceval-valid_high_school_chemistry
  - ceval-valid_high_school_biology
  - ceval-valid_middle_school_mathematics
  - ceval-valid_middle_school_biology
  - ceval-valid_middle_school_physics
  - ceval-valid_middle_school_chemistry
  - ceval-valid_veterinary_medicine
  - ceval-valid_college_economics
  - ceval-valid_business_administration
  - ceval-valid_marxism
  - ceval-valid_mao_zedong_thought
  - ceval-valid_education_science
  - ceval-valid_teacher_qualification
  - ceval-valid_high_school_politics
  - ceval-valid_high_school_geography
  - ceval-valid_middle_school_politics
  - ceval-valid_middle_school_geography
  - ceval-valid_modern_chinese_history
  - ceval-valid_ideological_and_moral_cultivation
  - ceval-valid_logic
  - ceval-valid_law
  - ceval-valid_chinese_language_and_literature
  - ceval-valid_art_studies
  - ceval-valid_professional_tour_guide
  - ceval-valid_legal_professional
  - ceval-valid_high_school_chinese
  - ceval-valid_high_school_history
  - ceval-valid_middle_school_history
  - ceval-valid_civil_servant
  - ceval-valid_sports_science
  - ceval-valid_plant_protection
  - ceval-valid_basic_medicine
  - ceval-valid_clinical_medicine
  - ceval-valid_urban_and_rural_planner
  - ceval-valid_accountant
  - ceval-valid_fire_engineer
  - ceval-valid_environmental_impact_assessment_engineer
  - ceval-valid_tax_accountant
  - ceval-valid_physician
group: ceval-valid
dataset_path: ceval/ceval-exam dataset_path: ceval/ceval-exam
validation_split: val validation_split: val
fewshot_split: dev fewshot_split: dev
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
import yaml import yaml
from tqdm import tqdm from tqdm import tqdm
from lm_eval.logger import eval_logger from lm_eval.utils import eval_logger
SUBJECTS = { SUBJECTS = {
...@@ -116,3 +117,26 @@ if __name__ == "__main__": ...@@ -116,3 +117,26 @@ if __name__ == "__main__":
allow_unicode=True, allow_unicode=True,
default_style='"', default_style='"',
) )
# Write the group-level config: one "ceval-valid" group that lists a task per
# SUBJECTS key and averages acc / acc_norm across them, weighted by size.
group_yaml_dict = {
    "group": "ceval-valid",
    "task": [f"ceval-valid_{task_name}" for task_name in SUBJECTS],
    "aggregate_metric_list": [
        {"metric": "acc", "aggregation": "mean", "weight_by_size": True},
        {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True},
    ],
    "metadata": {"version": 1.0},
}

# Put the leading underscore on the file name only. Prepending it to the whole
# prefix would rewrite the first directory of a path-like prefix
# ("out/ceval" -> "_out/ceval.yaml"); a bare prefix yields the same name as before.
prefix_dir, prefix_name = os.path.split(args.save_prefix_path)
file_save_path = os.path.join(prefix_dir, "_" + prefix_name + ".yaml")

with open(file_save_path, "w", encoding="utf-8") as group_yaml_file:
    yaml.dump(
        group_yaml_dict,
        group_yaml_file,
        width=float("inf"),  # never wrap long lines
        allow_unicode=True,
        default_style='"',  # emit every scalar double-quoted
    )
# Group config: gathers the CMMLU subtasks listed under `task` into the "cmmlu" group.
group: cmmlu
task:
  - cmmlu_agronomy
  - cmmlu_anatomy
  - cmmlu_ancient_chinese
  - cmmlu_arts
  - cmmlu_astronomy
  - cmmlu_business_ethics
  - cmmlu_chinese_civil_service_exam
  - cmmlu_chinese_driving_rule
  - cmmlu_chinese_food_culture
  - cmmlu_chinese_foreign_policy
  - cmmlu_chinese_history
  - cmmlu_chinese_literature
  - cmmlu_chinese_teacher_qualification
  - cmmlu_clinical_knowledge
  - cmmlu_college_actuarial_science
  - cmmlu_college_education
  - cmmlu_college_engineering_hydrology
  - cmmlu_college_law
  - cmmlu_college_mathematics
  - cmmlu_college_medical_statistics
  - cmmlu_college_medicine
  - cmmlu_computer_science
  - cmmlu_computer_security
  - cmmlu_conceptual_physics
  - cmmlu_construction_project_management
  - cmmlu_economics
  - cmmlu_education
  - cmmlu_electrical_engineering
  - cmmlu_elementary_chinese
  - cmmlu_elementary_commonsense
  - cmmlu_elementary_information_and_technology
  - cmmlu_elementary_mathematics
  - cmmlu_ethnology
  - cmmlu_food_science
  - cmmlu_genetics
  - cmmlu_global_facts
  - cmmlu_high_school_biology
  - cmmlu_high_school_chemistry
  - cmmlu_high_school_geography
  - cmmlu_high_school_mathematics
  - cmmlu_high_school_physics
  - cmmlu_high_school_politics
  - cmmlu_human_sexuality
  - cmmlu_international_law
  - cmmlu_journalism
  - cmmlu_jurisprudence
  - cmmlu_legal_and_moral_basis
  - cmmlu_logical
  - cmmlu_machine_learning
  - cmmlu_management
  - cmmlu_marketing
  - cmmlu_marxist_theory
  - cmmlu_modern_chinese
  - cmmlu_nutrition
  - cmmlu_philosophy
  - cmmlu_professional_accounting
  - cmmlu_professional_law
  - cmmlu_professional_medicine
  - cmmlu_professional_psychology
  - cmmlu_public_relations
  - cmmlu_security_study
  - cmmlu_sociology
  - cmmlu_sports_science
  - cmmlu_traditional_chinese_medicine
  - cmmlu_virology
  - cmmlu_world_history
  - cmmlu_world_religions
# Group-level scores: mean of `acc` and `acc_norm`, with size weighting enabled.
# Each entry's keys must stay nested under its "- " item; flattened to column 0
# the second entry is no longer valid YAML.
aggregate_metric_list:
  - aggregation: mean
    metric: acc
    weight_by_size: true
  - aggregation: mean
    metric: acc_norm
    weight_by_size: true
metadata:
  version: 0.0
group: cmmlu
dataset_path: haonan-li/cmmlu dataset_path: haonan-li/cmmlu
test_split: test test_split: test
fewshot_split: dev fewshot_split: dev
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
...@@ -131,3 +132,33 @@ if __name__ == "__main__": ...@@ -131,3 +132,33 @@ if __name__ == "__main__":
allow_unicode=True, allow_unicode=True,
default_style='"', default_style='"',
) )
# Write the group-level config: one "cmmlu" group that lists a task per
# SUBJECTS key and averages acc / acc_norm across them, weighted by size.
group_yaml_dict = {
    "group": "cmmlu",
    "task": [
        (
            # Task names carry the optional prefix only when one was supplied.
            f"cmmlu_{args.task_prefix}_{subject_eng}"
            if args.task_prefix != ""
            else f"cmmlu_{subject_eng}"
        )
        for subject_eng in SUBJECTS
    ],
    "aggregate_metric_list": [
        {"metric": "acc", "aggregation": "mean", "weight_by_size": True},
        {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True},
    ],
    "metadata": {"version": 0.0},
}

# Put the leading underscore on the file name only. Prepending it to the whole
# prefix would rewrite the first directory of a path-like prefix
# ("out/cmmlu" -> "_out/cmmlu.yaml"); a bare prefix yields the same name as before.
prefix_dir, prefix_name = os.path.split(args.save_prefix_path)
file_save_path = os.path.join(prefix_dir, "_" + prefix_name + ".yaml")

with open(file_save_path, "w", encoding="utf-8") as group_yaml_file:
    yaml.dump(
        group_yaml_dict,
        group_yaml_file,
        width=float("inf"),  # never wrap long lines
        allow_unicode=True,
        default_style='"',  # emit every scalar double-quoted
    )
"dataset_name": "agronomy"
"description": "以下是关于农学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_agronomy"
"dataset_name": "anatomy"
"description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_anatomy"
"dataset_name": "ancient_chinese"
"description": "以下是关于古汉语的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_ancient_chinese"
"dataset_name": "arts"
"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_arts"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment