Commit 09dd7f6c authored by haileyschoelkopf
Browse files

add more explicit group configs

parent 8fdcbc13
@@ -4,7 +4,7 @@ task:
   # ANLI R1
   - group: anli_r1_flan
     group_alias: ANLI R1
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
@@ -56,7 +56,7 @@ task:
   # ANLI R2
   - group: anli_r2_flan
     group_alias: ANLI R2
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
@@ -108,7 +108,7 @@ task:
   # ANLI R3
   - group: anli_r3_flan
     group_alias: ANLI R3
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
@@ -160,7 +160,7 @@ task:
   # Arc Easy
   - group: arc_easy_flan
     group_alias: Arc Easy
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
@@ -202,7 +202,7 @@ task:
   # Arc Challenge
   - group: arc_challenge_flan
     group_alias: Arc Challenge
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
@@ -244,7 +244,7 @@ task:
   # BoolQ
   - group: boolq_flan
     group_alias: BoolQ
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
@@ -301,7 +301,7 @@ task:
   # RTE
   - group: rte_flan
     group_alias: RTE
-    aggregate_metric:
+    aggregate_metric_list:
       - metric: acc
         weight_by_size: True
     task:
......
@@ -15,3 +15,7 @@ task:
     task_alias: "professional_medicine (mmlu)"
   - task: mmlu_college_biology
     task_alias: "college_biology (mmlu)"
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: True
...@@ -9,55 +9,55 @@ group: ceval-valid ...@@ -9,55 +9,55 @@ group: ceval-valid
metadata: metadata:
version: 1.0 version: 1.0
task: task:
- ceval-valid_computer_network - ceval-valid_computer_network
- ceval-valid_operating_system - ceval-valid_operating_system
- ceval-valid_computer_architecture - ceval-valid_computer_architecture
- ceval-valid_college_programming - ceval-valid_college_programming
- ceval-valid_college_physics - ceval-valid_college_physics
- ceval-valid_college_chemistry - ceval-valid_college_chemistry
- ceval-valid_advanced_mathematics - ceval-valid_advanced_mathematics
- ceval-valid_probability_and_statistics - ceval-valid_probability_and_statistics
- ceval-valid_discrete_mathematics - ceval-valid_discrete_mathematics
- ceval-valid_electrical_engineer - ceval-valid_electrical_engineer
- ceval-valid_metrology_engineer - ceval-valid_metrology_engineer
- ceval-valid_high_school_mathematics - ceval-valid_high_school_mathematics
- ceval-valid_high_school_physics - ceval-valid_high_school_physics
- ceval-valid_high_school_chemistry - ceval-valid_high_school_chemistry
- ceval-valid_high_school_biology - ceval-valid_high_school_biology
- ceval-valid_middle_school_mathematics - ceval-valid_middle_school_mathematics
- ceval-valid_middle_school_biology - ceval-valid_middle_school_biology
- ceval-valid_middle_school_physics - ceval-valid_middle_school_physics
- ceval-valid_middle_school_chemistry - ceval-valid_middle_school_chemistry
- ceval-valid_veterinary_medicine - ceval-valid_veterinary_medicine
- ceval-valid_college_economics - ceval-valid_college_economics
- ceval-valid_business_administration - ceval-valid_business_administration
- ceval-valid_marxism - ceval-valid_marxism
- ceval-valid_mao_zedong_thought - ceval-valid_mao_zedong_thought
- ceval-valid_education_science - ceval-valid_education_science
- ceval-valid_teacher_qualification - ceval-valid_teacher_qualification
- ceval-valid_high_school_politics - ceval-valid_high_school_politics
- ceval-valid_high_school_geography - ceval-valid_high_school_geography
- ceval-valid_middle_school_politics - ceval-valid_middle_school_politics
- ceval-valid_middle_school_geography - ceval-valid_middle_school_geography
- ceval-valid_modern_chinese_history - ceval-valid_modern_chinese_history
- ceval-valid_ideological_and_moral_cultivation - ceval-valid_ideological_and_moral_cultivation
- ceval-valid_logic - ceval-valid_logic
- ceval-valid_law - ceval-valid_law
- ceval-valid_chinese_language_and_literature - ceval-valid_chinese_language_and_literature
- ceval-valid_art_studies - ceval-valid_art_studies
- ceval-valid_professional_tour_guide - ceval-valid_professional_tour_guide
- ceval-valid_legal_professional - ceval-valid_legal_professional
- ceval-valid_high_school_chinese - ceval-valid_high_school_chinese
- ceval-valid_high_school_history - ceval-valid_high_school_history
- ceval-valid_middle_school_history - ceval-valid_middle_school_history
- ceval-valid_civil_servant - ceval-valid_civil_servant
- ceval-valid_sports_science - ceval-valid_sports_science
- ceval-valid_plant_protection - ceval-valid_plant_protection
- ceval-valid_basic_medicine - ceval-valid_basic_medicine
- ceval-valid_clinical_medicine - ceval-valid_clinical_medicine
- ceval-valid_urban_and_rural_planner - ceval-valid_urban_and_rural_planner
- ceval-valid_accountant - ceval-valid_accountant
- ceval-valid_fire_engineer - ceval-valid_fire_engineer
- ceval-valid_environmental_impact_assessment_engineer - ceval-valid_environmental_impact_assessment_engineer
- ceval-valid_tax_accountant - ceval-valid_tax_accountant
- ceval-valid_physician - ceval-valid_physician
group: ceval-valid
dataset_path: ceval/ceval-exam dataset_path: ceval/ceval-exam
validation_split: val validation_split: val
fewshot_split: dev fewshot_split: dev
......
# Task config: free-form generation (`generate_until`) over a SAT
# multiple-choice math dataset, scored with mean accuracy.
# NOTE(review): the task is named `gsm8k` but `dataset_path` points at a SAT
# math dataset — confirm the name is intentional.
task: gsm8k
dataset_path: mcaleste/sat_multiple_choice_math_may_23
dataset_name: main
output_type: generate_until
# NOTE(review): a plain `none` is the string "none" in YAML, not null, and
# `num_fewshot` below is 5 — confirm which split few-shot examples come from.
fewshot_split: none
# Evaluation documents are read from the `train` split.
test_split: train
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "({{answer}})"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
# `!function` is a custom tag; presumably the consuming loader resolves it to
# `process_results` in a sibling `_utils` module — verify against the loader.
process_results: !function _utils.process_results
generation_kwargs:
  # Generation stops at the first occurrence of this string.
  until:
    - "I hope it is correct."
  do_sample: false
  temperature: 0.0
repeats: 1
num_fewshot: 5
# Disabled answer-extraction filter, kept for reference.
# filter_list:
#   - name: "get-answer"
#     filter:
#       - function: "regex"
#         regex_pattern: "### (\\-?[0-9\\.\\,]+)"
#       - function: "take_first"
-group:
+tag:
   - math_word_problems
 task: mathqa
 dataset_path: math_qa
......
-group:
-  - pile
 task: pile_arxiv
 dataset_path: EleutherAI/pile
 dataset_name: pile_arxiv
......
-group:
+tag:
   - polemo2
 task: polemo2_in
 dataset_path: allegro/klej-polemo2-in
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment