gaoqiong / lm-evaluation-harness / Commits

Commit 7d09b24c, authored Jul 03, 2024 by haileyschoelkopf
Commit message: fix alllll the merge conflicts
Parents: 96dfe976, 6348b947
Changes: 395 files in this commit; this page shows 20 changed files with 387 additions and 70 deletions (+387, -70).
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml (+67, -67)
lm_eval/tasks/benchmarks/minerva_math.yaml (+6, -0)
lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml (+4, -0)
lm_eval/tasks/blimp/_blimp.yaml (+75, -0)
lm_eval/tasks/blimp/_template_yaml (+0, -1)
lm_eval/tasks/ceval/_ceval-valid.yaml (+63, -0)
lm_eval/tasks/ceval/_default_ceval_yaml (+0, -1)
lm_eval/tasks/ceval/_generate_configs.py (+24, -1)
lm_eval/tasks/cmmlu/_cmmlu.yaml (+78, -0)
lm_eval/tasks/cmmlu/_generate_configs.py (+30, -0)
lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_ancient_chinese.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_arts.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_business_ethics.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_chinese_civil_service_exam.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_chinese_driving_rule.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml (+4, -0)
lm_eval/tasks/cmmlu/cmmlu_chinese_foreign_policy.yaml (+4, -0)
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
(This diff is collapsed and not shown.)
lm_eval/tasks/benchmarks/minerva_math.yaml

```diff
@@ -7,3 +7,9 @@ task:
   - minerva_math_num_theory
   - minerva_math_prealgebra
   - minerva_math_precalc
+aggregate_metric_list:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
```
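The `weight_by_size: true` flag asks the harness to weight each subtask's metric by its number of examples when aggregating a group score, rather than averaging subtask scores equally. A minimal sketch of the difference (the `aggregate` helper and the metric values are illustrative, not the harness's actual API; the example counts are the MATH test-split sizes for those two subjects):

```python
# results maps subtask name -> (metric value, number of examples)
results = {
    "minerva_math_prealgebra": (0.40, 871),
    "minerva_math_precalc": (0.15, 546),
}

def aggregate(results, weight_by_size):
    """Plain mean over subtasks, or mean weighted by example count."""
    if weight_by_size:
        total = sum(n for _, n in results.values())
        return sum(v * n for v, n in results.values()) / total
    return sum(v for v, _ in results.values()) / len(results)

unweighted = aggregate(results, weight_by_size=False)  # treats subtasks equally
weighted = aggregate(results, weight_by_size=True)     # larger subtasks count more
```

With unequal subtask sizes the two aggregates differ: here the weighted mean sits closer to the prealgebra score because that subtask has more examples.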
lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml

```diff
@@ -15,3 +15,7 @@ task:
     task_alias: "professional_medicine (mmlu)"
   - task: mmlu_college_biology
     task_alias: "college_biology (mmlu)"
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: True
```
lm_eval/tasks/blimp/_blimp.yaml (new file, mode 100644)

```yaml
group: blimp
task:
  - "blimp_adjunct_island"
  - "blimp_anaphor_gender_agreement"
  - "blimp_anaphor_number_agreement"
  - "blimp_animate_subject_passive"
  - "blimp_animate_subject_trans"
  - "blimp_causative"
  - "blimp_complex_NP_island"
  - "blimp_coordinate_structure_constraint_complex_left_branch"
  - "blimp_coordinate_structure_constraint_object_extraction"
  - "blimp_determiner_noun_agreement_1"
  - "blimp_determiner_noun_agreement_2"
  - "blimp_determiner_noun_agreement_irregular_1"
  - "blimp_determiner_noun_agreement_irregular_2"
  - "blimp_determiner_noun_agreement_with_adj_2"
  - "blimp_determiner_noun_agreement_with_adj_irregular_1"
  - "blimp_determiner_noun_agreement_with_adj_irregular_2"
  - "blimp_determiner_noun_agreement_with_adjective_1"
  - "blimp_distractor_agreement_relational_noun"
  - "blimp_distractor_agreement_relative_clause"
  - "blimp_drop_argument"
  - "blimp_ellipsis_n_bar_1"
  - "blimp_ellipsis_n_bar_2"
  - "blimp_existential_there_object_raising"
  - "blimp_existential_there_quantifiers_1"
  - "blimp_existential_there_quantifiers_2"
  - "blimp_existential_there_subject_raising"
  - "blimp_expletive_it_object_raising"
  - "blimp_inchoative"
  - "blimp_intransitive"
  - "blimp_irregular_past_participle_adjectives"
  - "blimp_irregular_past_participle_verbs"
  - "blimp_irregular_plural_subject_verb_agreement_1"
  - "blimp_irregular_plural_subject_verb_agreement_2"
  - "blimp_left_branch_island_echo_question"
  - "blimp_left_branch_island_simple_question"
  - "blimp_matrix_question_npi_licensor_present"
  - "blimp_npi_present_1"
  - "blimp_npi_present_2"
  - "blimp_only_npi_licensor_present"
  - "blimp_only_npi_scope"
  - "blimp_passive_1"
  - "blimp_passive_2"
  - "blimp_principle_A_c_command"
  - "blimp_principle_A_case_1"
  - "blimp_principle_A_case_2"
  - "blimp_principle_A_domain_1"
  - "blimp_principle_A_domain_2"
  - "blimp_principle_A_domain_3"
  - "blimp_principle_A_reconstruction"
  - "blimp_regular_plural_subject_verb_agreement_1"
  - "blimp_regular_plural_subject_verb_agreement_2"
  - "blimp_sentential_negation_npi_licensor_present"
  - "blimp_sentential_negation_npi_scope"
  - "blimp_sentential_subject_island"
  - "blimp_superlative_quantifiers_1"
  - "blimp_superlative_quantifiers_2"
  - "blimp_tough_vs_raising_1"
  - "blimp_tough_vs_raising_2"
  - "blimp_transitive"
  - "blimp_wh_island"
  - "blimp_wh_questions_object_gap"
  - "blimp_wh_questions_subject_gap"
  - "blimp_wh_questions_subject_gap_long_distance"
  - "blimp_wh_vs_that_no_gap"
  - "blimp_wh_vs_that_no_gap_long_distance"
  - "blimp_wh_vs_that_with_gap"
  - "blimp_wh_vs_that_with_gap_long_distance"
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: False
metadata:
  version: 2.0
```
lm_eval/tasks/blimp/_template_yaml

```diff
-group: blimp
 dataset_path: blimp
 output_type: multiple_choice
 validation_split: train
```

(Only the `group` line is removed; the group is now declared in the new _blimp.yaml.)
lm_eval/tasks/ceval/_ceval-valid.yaml (new file, mode 100644)

```yaml
aggregate_metric_list:
  - aggregation: mean
    metric: acc
    weight_by_size: true
  - aggregation: mean
    metric: acc_norm
    weight_by_size: true
group: ceval-valid
metadata:
  version: 1.0
task:
  - ceval-valid_computer_network
  - ceval-valid_operating_system
  - ceval-valid_computer_architecture
  - ceval-valid_college_programming
  - ceval-valid_college_physics
  - ceval-valid_college_chemistry
  - ceval-valid_advanced_mathematics
  - ceval-valid_probability_and_statistics
  - ceval-valid_discrete_mathematics
  - ceval-valid_electrical_engineer
  - ceval-valid_metrology_engineer
  - ceval-valid_high_school_mathematics
  - ceval-valid_high_school_physics
  - ceval-valid_high_school_chemistry
  - ceval-valid_high_school_biology
  - ceval-valid_middle_school_mathematics
  - ceval-valid_middle_school_biology
  - ceval-valid_middle_school_physics
  - ceval-valid_middle_school_chemistry
  - ceval-valid_veterinary_medicine
  - ceval-valid_college_economics
  - ceval-valid_business_administration
  - ceval-valid_marxism
  - ceval-valid_mao_zedong_thought
  - ceval-valid_education_science
  - ceval-valid_teacher_qualification
  - ceval-valid_high_school_politics
  - ceval-valid_high_school_geography
  - ceval-valid_middle_school_politics
  - ceval-valid_middle_school_geography
  - ceval-valid_modern_chinese_history
  - ceval-valid_ideological_and_moral_cultivation
  - ceval-valid_logic
  - ceval-valid_law
  - ceval-valid_chinese_language_and_literature
  - ceval-valid_art_studies
  - ceval-valid_professional_tour_guide
  - ceval-valid_legal_professional
  - ceval-valid_high_school_chinese
  - ceval-valid_high_school_history
  - ceval-valid_middle_school_history
  - ceval-valid_civil_servant
  - ceval-valid_sports_science
  - ceval-valid_plant_protection
  - ceval-valid_basic_medicine
  - ceval-valid_clinical_medicine
  - ceval-valid_urban_and_rural_planner
  - ceval-valid_accountant
  - ceval-valid_fire_engineer
  - ceval-valid_environmental_impact_assessment_engineer
  - ceval-valid_tax_accountant
  - ceval-valid_physician
```
lm_eval/tasks/ceval/_default_ceval_yaml

```diff
-group: ceval-valid
 dataset_path: ceval/ceval-exam
 validation_split: val
 fewshot_split: dev
```

(Only the `group` line is removed; the group is now declared in the new _ceval-valid.yaml.)
lm_eval/tasks/ceval/_generate_configs.py

```diff
@@ -8,7 +8,7 @@ import os
 import yaml
 from tqdm import tqdm

-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger

 SUBJECTS = {
@@ -117,3 +117,26 @@ if __name__ == "__main__":
             allow_unicode=True,
             default_style='"',
         )
+
+    # write group config out
+    group_yaml_dict = {
+        "group": "ceval-valid",
+        "task": [f"ceval-valid_{task_name}" for task_name in SUBJECTS.keys()],
+        "aggregate_metric_list": [
+            {"metric": "acc", "aggregation": "mean", "weight_by_size": True},
+            {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True},
+        ],
+        "metadata": {"version": 1.0},
+    }
+
+    file_save_path = "_" + args.save_prefix_path + ".yaml"
+    with open(file_save_path, "w", encoding="utf-8") as group_yaml_file:
+        yaml.dump(
+            group_yaml_dict,
+            group_yaml_file,
+            width=float("inf"),
+            allow_unicode=True,
+            default_style='"',
+        )
```
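The group config is written with `yaml.dump(..., default_style='"')`, which forces every scalar in the emitted YAML to be double-quoted, and `width=float("inf")`, which disables line wrapping. A small standalone run of the same call shows the resulting quoting style (the two task names here are just a sample):

```python
import yaml  # PyYAML, as used by _generate_configs.py

group_yaml_dict = {
    "group": "ceval-valid",
    "task": ["ceval-valid_logic", "ceval-valid_law"],
}

# default_style='"' double-quotes every scalar; width=float("inf")
# prevents PyYAML from wrapping long lines.
text = yaml.dump(
    group_yaml_dict,
    width=float("inf"),
    allow_unicode=True,
    default_style='"',
)
print(text)
```

Keys are emitted in sorted order (PyYAML's default `sort_keys=True`), so `"group"` precedes `"task"` in the output, with each list entry quoted on its own line.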
lm_eval/tasks/cmmlu/_cmmlu.yaml (new file, mode 100644)

```yaml
aggregate_metric_list:
  - aggregation: mean
    metric: acc
    weight_by_size: true
  - aggregation: mean
    metric: acc_norm
    weight_by_size: true
group: cmmlu
metadata:
  version: 0.0
task:
  - cmmlu_agronomy
  - cmmlu_anatomy
  - cmmlu_ancient_chinese
  - cmmlu_arts
  - cmmlu_astronomy
  - cmmlu_business_ethics
  - cmmlu_chinese_civil_service_exam
  - cmmlu_chinese_driving_rule
  - cmmlu_chinese_food_culture
  - cmmlu_chinese_foreign_policy
  - cmmlu_chinese_history
  - cmmlu_chinese_literature
  - cmmlu_chinese_teacher_qualification
  - cmmlu_clinical_knowledge
  - cmmlu_college_actuarial_science
  - cmmlu_college_education
  - cmmlu_college_engineering_hydrology
  - cmmlu_college_law
  - cmmlu_college_mathematics
  - cmmlu_college_medical_statistics
  - cmmlu_college_medicine
  - cmmlu_computer_science
  - cmmlu_computer_security
  - cmmlu_conceptual_physics
  - cmmlu_construction_project_management
  - cmmlu_economics
  - cmmlu_education
  - cmmlu_electrical_engineering
  - cmmlu_elementary_chinese
  - cmmlu_elementary_commonsense
  - cmmlu_elementary_information_and_technology
  - cmmlu_elementary_mathematics
  - cmmlu_ethnology
  - cmmlu_food_science
  - cmmlu_genetics
  - cmmlu_global_facts
  - cmmlu_high_school_biology
  - cmmlu_high_school_chemistry
  - cmmlu_high_school_geography
  - cmmlu_high_school_mathematics
  - cmmlu_high_school_physics
  - cmmlu_high_school_politics
  - cmmlu_human_sexuality
  - cmmlu_international_law
  - cmmlu_journalism
  - cmmlu_jurisprudence
  - cmmlu_legal_and_moral_basis
  - cmmlu_logical
  - cmmlu_machine_learning
  - cmmlu_management
  - cmmlu_marketing
  - cmmlu_marxist_theory
  - cmmlu_modern_chinese
  - cmmlu_nutrition
  - cmmlu_philosophy
  - cmmlu_professional_accounting
  - cmmlu_professional_law
  - cmmlu_professional_medicine
  - cmmlu_professional_psychology
  - cmmlu_public_relations
  - cmmlu_security_study
  - cmmlu_sociology
  - cmmlu_sports_science
  - cmmlu_traditional_chinese_medicine
  - cmmlu_virology
  - cmmlu_world_history
  - cmmlu_world_religions
```
lm_eval/tasks/cmmlu/_generate_configs.py

```diff
@@ -132,3 +132,33 @@ if __name__ == "__main__":
             allow_unicode=True,
             default_style='"',
         )
+
+    # write group config out
+    group_yaml_dict = {
+        "group": "cmmlu",
+        "task": [
+            (
+                f"cmmlu_{args.task_prefix}_{subject_eng}"
+                if args.task_prefix != ""
+                else f"cmmlu_{subject_eng}"
+            )
+            for subject_eng in SUBJECTS.keys()
+        ],
+        "aggregate_metric_list": [
+            {"metric": "acc", "aggregation": "mean", "weight_by_size": True},
+            {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": True},
+        ],
+        "metadata": {"version": 0.0},
+    }
+
+    file_save_path = "_" + args.save_prefix_path + ".yaml"
+    with open(file_save_path, "w", encoding="utf-8") as group_yaml_file:
+        yaml.dump(
+            group_yaml_dict,
+            group_yaml_file,
+            width=float("inf"),
+            allow_unicode=True,
+            default_style='"',
+        )
```
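Unlike the ceval generator, the cmmlu generator names each task conditionally on `args.task_prefix`, so runs generated with a prefix get distinct task ids. The naming rule from the diff above, isolated into a small helper for clarity:

```python
def cmmlu_task_name(subject_eng: str, task_prefix: str = "") -> str:
    """Mirror of the naming rule in cmmlu/_generate_configs.py:
    an empty prefix yields cmmlu_<subject>, otherwise
    cmmlu_<prefix>_<subject>."""
    if task_prefix != "":
        return f"cmmlu_{task_prefix}_{subject_eng}"
    return f"cmmlu_{subject_eng}"

print(cmmlu_task_name("agronomy"))            # cmmlu_agronomy
print(cmmlu_task_name("agronomy", "5shot"))   # cmmlu_5shot_agronomy
```

This keeps a prefixed task set (the "5shot" prefix above is just an example value) from colliding with the default `cmmlu_*` names registered by the group config.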
lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml (new file, mode 100644)

```yaml
"dataset_name": "agronomy"
"description": "以下是关于农学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_agronomy"
```

lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml (new file, mode 100644)

```yaml
"dataset_name": "anatomy"
"description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_anatomy"
```

lm_eval/tasks/cmmlu/cmmlu_ancient_chinese.yaml (new file, mode 100644)

```yaml
"dataset_name": "ancient_chinese"
"description": "以下是关于古汉语的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_ancient_chinese"
```

lm_eval/tasks/cmmlu/cmmlu_arts.yaml (new file, mode 100644)

```yaml
"dataset_name": "arts"
"description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_arts"
```

lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml (new file, mode 100644)

```yaml
"dataset_name": "astronomy"
"description": "以下是关于天文学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_astronomy"
```

lm_eval/tasks/cmmlu/cmmlu_business_ethics.yaml (new file, mode 100644)

```yaml
"dataset_name": "business_ethics"
"description": "以下是关于商业伦理的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_business_ethics"
```

lm_eval/tasks/cmmlu/cmmlu_chinese_civil_service_exam.yaml (new file, mode 100644)

```yaml
"dataset_name": "chinese_civil_service_exam"
"description": "以下是关于中国公务员考试的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_chinese_civil_service_exam"
```

lm_eval/tasks/cmmlu/cmmlu_chinese_driving_rule.yaml (new file, mode 100644)

```yaml
"dataset_name": "chinese_driving_rule"
"description": "以下是关于中国驾驶规则的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_chinese_driving_rule"
```

lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml (new file, mode 100644)

```yaml
"dataset_name": "chinese_food_culture"
"description": "以下是关于中国饮食文化的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_chinese_food_culture"
```

lm_eval/tasks/cmmlu/cmmlu_chinese_foreign_policy.yaml (new file, mode 100644)

```yaml
"dataset_name": "chinese_foreign_policy"
"description": "以下是关于中国外交政策的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_chinese_foreign_policy"
```
(Changed-file listing: page 1 of 20; remaining files not shown.)