Commit 09dd7f6c authored by haileyschoelkopf
Browse files

add more explicit group configs

parent 8fdcbc13
......@@ -4,7 +4,7 @@ task:
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......@@ -56,7 +56,7 @@ task:
# ANLI R2
- group: anli_r2_flan
group_alias: ANLI R2
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......@@ -108,7 +108,7 @@ task:
# ANLI R3
- group: anli_r3_flan
group_alias: ANLI R3
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......@@ -160,7 +160,7 @@ task:
# Arc Easy
- group: arc_easy_flan
group_alias: Arc Easy
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......@@ -202,7 +202,7 @@ task:
# Arc Challenge
- group: arc_challenge_flan
group_alias: Arc Challenge
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......@@ -244,7 +244,7 @@ task:
# BoolQ
- group: boolq_flan
group_alias: BoolQ
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......@@ -301,7 +301,7 @@ task:
# RTE
- group: rte_flan
group_alias: RTE
aggregate_metric:
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
......
......@@ -15,3 +15,7 @@ task:
task_alias: "professional_medicine (mmlu)"
- task: mmlu_college_biology
task_alias: "college_biology (mmlu)"
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: True
......@@ -9,55 +9,55 @@ group: ceval-valid
metadata:
version: 1.0
task:
- ceval-valid_computer_network
- ceval-valid_operating_system
- ceval-valid_computer_architecture
- ceval-valid_college_programming
- ceval-valid_college_physics
- ceval-valid_college_chemistry
- ceval-valid_advanced_mathematics
- ceval-valid_probability_and_statistics
- ceval-valid_discrete_mathematics
- ceval-valid_electrical_engineer
- ceval-valid_metrology_engineer
- ceval-valid_high_school_mathematics
- ceval-valid_high_school_physics
- ceval-valid_high_school_chemistry
- ceval-valid_high_school_biology
- ceval-valid_middle_school_mathematics
- ceval-valid_middle_school_biology
- ceval-valid_middle_school_physics
- ceval-valid_middle_school_chemistry
- ceval-valid_veterinary_medicine
- ceval-valid_college_economics
- ceval-valid_business_administration
- ceval-valid_marxism
- ceval-valid_mao_zedong_thought
- ceval-valid_education_science
- ceval-valid_teacher_qualification
- ceval-valid_high_school_politics
- ceval-valid_high_school_geography
- ceval-valid_middle_school_politics
- ceval-valid_middle_school_geography
- ceval-valid_modern_chinese_history
- ceval-valid_ideological_and_moral_cultivation
- ceval-valid_logic
- ceval-valid_law
- ceval-valid_chinese_language_and_literature
- ceval-valid_art_studies
- ceval-valid_professional_tour_guide
- ceval-valid_legal_professional
- ceval-valid_high_school_chinese
- ceval-valid_high_school_history
- ceval-valid_middle_school_history
- ceval-valid_civil_servant
- ceval-valid_sports_science
- ceval-valid_plant_protection
- ceval-valid_basic_medicine
- ceval-valid_clinical_medicine
- ceval-valid_urban_and_rural_planner
- ceval-valid_accountant
- ceval-valid_fire_engineer
- ceval-valid_environmental_impact_assessment_engineer
- ceval-valid_tax_accountant
- ceval-valid_physician
- ceval-valid_computer_network
- ceval-valid_operating_system
- ceval-valid_computer_architecture
- ceval-valid_college_programming
- ceval-valid_college_physics
- ceval-valid_college_chemistry
- ceval-valid_advanced_mathematics
- ceval-valid_probability_and_statistics
- ceval-valid_discrete_mathematics
- ceval-valid_electrical_engineer
- ceval-valid_metrology_engineer
- ceval-valid_high_school_mathematics
- ceval-valid_high_school_physics
- ceval-valid_high_school_chemistry
- ceval-valid_high_school_biology
- ceval-valid_middle_school_mathematics
- ceval-valid_middle_school_biology
- ceval-valid_middle_school_physics
- ceval-valid_middle_school_chemistry
- ceval-valid_veterinary_medicine
- ceval-valid_college_economics
- ceval-valid_business_administration
- ceval-valid_marxism
- ceval-valid_mao_zedong_thought
- ceval-valid_education_science
- ceval-valid_teacher_qualification
- ceval-valid_high_school_politics
- ceval-valid_high_school_geography
- ceval-valid_middle_school_politics
- ceval-valid_middle_school_geography
- ceval-valid_modern_chinese_history
- ceval-valid_ideological_and_moral_cultivation
- ceval-valid_logic
- ceval-valid_law
- ceval-valid_chinese_language_and_literature
- ceval-valid_art_studies
- ceval-valid_professional_tour_guide
- ceval-valid_legal_professional
- ceval-valid_high_school_chinese
- ceval-valid_high_school_history
- ceval-valid_middle_school_history
- ceval-valid_civil_servant
- ceval-valid_sports_science
- ceval-valid_plant_protection
- ceval-valid_basic_medicine
- ceval-valid_clinical_medicine
- ceval-valid_urban_and_rural_planner
- ceval-valid_accountant
- ceval-valid_fire_engineer
- ceval-valid_environmental_impact_assessment_engineer
- ceval-valid_tax_accountant
- ceval-valid_physician
group: ceval-valid
dataset_path: ceval/ceval-exam
validation_split: val
fewshot_split: dev
......
task: gsm8k
dataset_path: mcaleste/sat_multiple_choice_math_may_23
dataset_name: main
output_type: generate_until
fewshot_split: none
test_split: train
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "({{answer}})"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
process_results: !function _utils.process_results
generation_kwargs:
until:
- "I hope it is correct."
do_sample: false
temperature: 0.0
repeats: 1
num_fewshot: 5
# filter_list:
# - name: "get-answer"
# filter:
# - function: "regex"
# regex_pattern: "### (\\-?[0-9\\.\\,]+)"
# - function: "take_first"
group:
tag:
- math_word_problems
task: mathqa
dataset_path: math_qa
......
group:
- pile
task: pile_arxiv
dataset_path: EleutherAI/pile
dataset_name: pile_arxiv
......
group:
tag:
- polemo2
task: polemo2_in
dataset_path: allegro/klej-polemo2-in
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment