Unverified Commit aa2dd2b5 authored by Fengzhe Zhou, committed by GitHub

[Format] Add config lints (#892)

parent 3dbba119
@@ -115,36 +115,36 @@ agent_summary_groups = [
other_summary_groups = [
    {
        'name': 'average_cn',
        'subsets': [
            ['language_zh_perf_4_and_non_mcq', 'naive_average'],
            ['knowledge_cn', 'perf_4'],
            ['reasonbench_cn_circular', 'perf_circular'],
            ['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
            ['code_cn', 'naive_average'],
            ['agent_cn', 'naive_average'],
        ],
    },
    {
        'name': 'average_en',
        'subsets': [
            ['language_en_perf_4_and_non_mcq', 'naive_average'],
            ['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
            ['reasonbench_en_circular', 'perf_circular'],
            ['math_perf_4_and_fill_in_blank_en', 'naive_average'],
            ['code_en', 'naive_average'],
            ['agent_en', 'naive_average'],
        ],
    },
    {
        'name': 'average',
        'subsets': [
            ['language_perf_4_and_non_mcq', 'naive_average'],
            ['knowledge_perf_4_and_cloze', 'naive_average'],
            ['reasonbench', 'perf_circular'],
            ['math_perf_4_and_fill_in_blank', 'naive_average'],
            ['code', 'naive_average'],
            ['agent', 'naive_average'],
        ],
    },
]
@@ -223,5 +223,5 @@ summarizer = dict(
        ['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
        ['plugin_eval-mus-p10_one_review', 'naive_average'],
    ],
    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -60,63 +60,63 @@ ceval_category_weights = {
}
mmlu_category_weights = {
    'business_ethics': {'accuracy - clean': 44, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 1},
    'security_studies': {'accuracy - clean': 188, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 47, 'accuracy - not labeled': 0},
    'high_school_us_history': {'accuracy - clean': 42, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 161},
    'moral_disputes': {'accuracy - clean': 105, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 168, 'accuracy - not labeled': 59},
    'philosophy': {'accuracy - clean': 81, 'accuracy - input contaminated': 11, 'accuracy - input-and-label contaminated': 187, 'accuracy - not labeled': 31},
    'public_relations': {'accuracy - clean': 75, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 26, 'accuracy - not labeled': 0},
    'high_school_microeconomics': {'accuracy - clean': 82, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 146, 'accuracy - not labeled': 0},
    'human_sexuality': {'accuracy - clean': 108, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 15, 'accuracy - not labeled': 4},
    'professional_accounting': {'accuracy - clean': 88, 'accuracy - input contaminated': 40, 'accuracy - input-and-label contaminated': 152, 'accuracy - not labeled': 1},
    'high_school_government_and_politics': {'accuracy - clean': 104, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 82, 'accuracy - not labeled': 0},
    'sociology': {'accuracy - clean': 105, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 91, 'accuracy - not labeled': 0},
    'conceptual_physics': {'accuracy - clean': 79, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 147, 'accuracy - not labeled': 0},
    'human_aging': {'accuracy - clean': 208, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 13, 'accuracy - not labeled': 0},
    'high_school_psychology': {'accuracy - clean': 108, 'accuracy - input contaminated': 26, 'accuracy - input-and-label contaminated': 162, 'accuracy - not labeled': 248},
    'jurisprudence': {'accuracy - clean': 59, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 43, 'accuracy - not labeled': 0},
    'moral_scenarios': {'accuracy - clean': 320, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 574},
    'college_medicine': {'accuracy - clean': 107, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 44, 'accuracy - not labeled': 5},
    'high_school_world_history': {'accuracy - clean': 61, 'accuracy - input contaminated': 2, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 173},
    'virology': {'accuracy - clean': 104, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 58, 'accuracy - not labeled': 0},
    'high_school_statistics': {'accuracy - clean': 96, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
    'nutrition': {'accuracy - clean': 172, 'accuracy - input contaminated': 11, 'accuracy - input-and-label contaminated': 98, 'accuracy - not labeled': 24},
    'abstract_algebra': {'accuracy - clean': 84, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 7, 'accuracy - not labeled': 0},
    'high_school_geography': {'accuracy - clean': 91, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 105, 'accuracy - not labeled': 0},
    'econometrics': {'accuracy - clean': 62, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 0},
    'marketing': {'accuracy - clean': 115, 'accuracy - input contaminated': 15, 'accuracy - input-and-label contaminated': 101, 'accuracy - not labeled': 2},
    'high_school_chemistry': {'accuracy - clean': 108, 'accuracy - input contaminated': 25, 'accuracy - input-and-label contaminated': 69, 'accuracy - not labeled': 0},
    'prehistory': {'accuracy - clean': 154, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 107, 'accuracy - not labeled': 57},
    'college_physics': {'accuracy - clean': 25, 'accuracy - input contaminated': 20, 'accuracy - input-and-label contaminated': 57, 'accuracy - not labeled': 0},
    'management': {'accuracy - clean': 35, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 62, 'accuracy - not labeled': 0},
    'college_biology': {'accuracy - clean': 91, 'accuracy - input contaminated': 12, 'accuracy - input-and-label contaminated': 40, 'accuracy - not labeled': 0},
    'high_school_biology': {'accuracy - clean': 128, 'accuracy - input contaminated': 17, 'accuracy - input-and-label contaminated': 135, 'accuracy - not labeled': 29},
    'high_school_physics': {'accuracy - clean': 42, 'accuracy - input contaminated': 28, 'accuracy - input-and-label contaminated': 80, 'accuracy - not labeled': 0},
    'logical_fallacies': {'accuracy - clean': 133, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 24, 'accuracy - not labeled': 0},
    'medical_genetics': {'accuracy - clean': 49, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 43, 'accuracy - not labeled': 1},
    'machine_learning': {'accuracy - clean': 71, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 32, 'accuracy - not labeled': 0},
    'professional_law': {'accuracy - clean': 401, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 5, 'accuracy - not labeled': 1119},
    'professional_psychology': {'accuracy - clean': 265, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 27, 'accuracy - not labeled': 310},
    'global_facts': {'accuracy - clean': 89, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 5, 'accuracy - not labeled': 0},
    'us_foreign_policy': {'accuracy - clean': 71, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 25, 'accuracy - not labeled': 0},
    'international_law': {'accuracy - clean': 73, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 46, 'accuracy - not labeled': 0},
    'clinical_knowledge': {'accuracy - clean': 172, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 86, 'accuracy - not labeled': 0},
    'high_school_mathematics': {'accuracy - clean': 178, 'accuracy - input contaminated': 59, 'accuracy - input-and-label contaminated': 32, 'accuracy - not labeled': 0},
    'high_school_computer_science': {'accuracy - clean': 62, 'accuracy - input contaminated': 7, 'accuracy - input-and-label contaminated': 28, 'accuracy - not labeled': 2},
    'college_computer_science': {'accuracy - clean': 68, 'accuracy - input contaminated': 15, 'accuracy - input-and-label contaminated': 15, 'accuracy - not labeled': 1},
    'electrical_engineering': {'accuracy - clean': 75, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 61, 'accuracy - not labeled': 0},
    'college_mathematics': {'accuracy - clean': 61, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 26, 'accuracy - not labeled': 0},
    'computer_security': {'accuracy - clean': 55, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 36, 'accuracy - not labeled': 0},
    'high_school_macroeconomics': {'accuracy - clean': 102, 'accuracy - input contaminated': 14, 'accuracy - input-and-label contaminated': 173, 'accuracy - not labeled': 100},
    'astronomy': {'accuracy - clean': 112, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 35, 'accuracy - not labeled': 0},
    'college_chemistry': {'accuracy - clean': 46, 'accuracy - input contaminated': 19, 'accuracy - input-and-label contaminated': 34, 'accuracy - not labeled': 0},
    'high_school_european_history': {'accuracy - clean': 41, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 123},
    'miscellaneous': {'accuracy - clean': 256, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 40, 'accuracy - not labeled': 477},
    'formal_logic': {'accuracy - clean': 92, 'accuracy - input contaminated': 12, 'accuracy - input-and-label contaminated': 21, 'accuracy - not labeled': 0},
    'elementary_mathematics': {'accuracy - clean': 155, 'accuracy - input contaminated': 31, 'accuracy - input-and-label contaminated': 103, 'accuracy - not labeled': 88},
    'world_religions': {'accuracy - clean': 130, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 36, 'accuracy - not labeled': 0},
    'professional_medicine': {'accuracy - clean': 191, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 1, 'accuracy - not labeled': 36},
    'anatomy': {'accuracy - clean': 52, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
}
@@ -166,7 +166,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
            'weights': weights,
        }
    )
    for dataset_abbr, subsets in mmlu_name_and_subsets:
        weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in subsets}
        subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in subsets]
@@ -178,7 +178,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
            'weights': weights,
        }
    )
    summary_groups.append(
        {
            'name': 'hellaswag',
...
@@ -14,5 +14,5 @@ with read_base():
    from .groups.mgsm import mgsm_summary_groups
summarizer = dict(
    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
sub_categories = {
    'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
    'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
    'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
    'business': ['business_ethics', 'management', 'marketing'],
    'biology': ['college_biology', 'high_school_biology'],
    'chemistry': ['college_chemistry', 'high_school_chemistry'],
    'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
    'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
    'engineering': ['electrical_engineering'],
    'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
    'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
    'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
    'geography': ['high_school_geography'],
    'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
    'psychology': ['high_school_psychology', 'professional_psychology'],
    'culture': ['human_sexuality', 'sociology'],
    'law': ['international_law', 'jurisprudence', 'professional_law']
}
categories = {
    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
    'humanities': ['history', 'philosophy', 'law'],
    'social_sciences': ['politics', 'culture', 'economics', 'geography', 'psychology'],
    'other': ['other', 'business', 'health'],
}
category2subject = {}
...
@@ -392,4 +392,4 @@ cibench_summary_groups.extend([
        'subsets': [i[:2] for i in cibench_math],
        'weights': {f'{k[0]}@{k[1]}': k[-1] for k in cibench_math},
    },
])
\ No newline at end of file
subcategories = {
    'agronomy': ['other'],
    'anatomy': ['biology'],
    'ancient_chinese': ['linguistics','china specific'],
    'arts': ['arts'],
    'astronomy': ['physics'],
    'business_ethics': ['business'],
    'chinese_civil_service_exam': ['politics','china specific'],
    'chinese_driving_rule': ['other','china specific'],
    'chinese_food_culture': ['culture','china specific'],
    'chinese_foreign_policy': ['politics','china specific'],
    'chinese_history':['history','china specific'],
    'chinese_literature': ['literature','china specific'],
    'chinese_teacher_qualification': ['education','china specific'],
    'college_actuarial_science':['math'],
    'college_education':['education'],
    'college_engineering_hydrology': ['engineering'],
    'college_law': ['law'],
    'college_mathematics': ['math'],
    'college_medical_statistics':['statistics'],
    'clinical_knowledge': ['other'],
    'college_medicine': ['other'],
    'computer_science': ['computer science'],
    'computer_security': ['other'],
    'conceptual_physics': ['physics'],
    'construction_project_management': ['other','china specific'],
    'economics': ['economics'],
    'education': ['education'],
    'elementary_chinese':['linguistics','china specific'],
    'elementary_commonsense':['other','china specific'],
    'elementary_information_and_technology': ['other'],
    'electrical_engineering': ['engineering'],
    'elementary_mathematics': ['math'],
    'ethnology': ['culture','china specific'],
    'food_science': ['other'],
    'genetics': ['biology'],
    'global_facts': ['global'],
    'high_school_biology': ['biology'],
    'high_school_chemistry': ['chemistry'],
    'high_school_geography': ['geography'],
    'high_school_mathematics': ['math'],
    'high_school_physics': ['physics'],
    'high_school_politics': ['politics','china specific'],
    'human_sexuality': ['other'],
    'international_law': ['law'],
    'journalism': ['sociology'],
    'jurisprudence': ['law'],
    'legal_and_moral_basis': ['other'],
    'logical': ['philosophy'],
    'machine_learning': ['computer science'],
    'management': ['business'],
    'marketing': ['business'],
    'marxist_theory': ['philosophy'],
    'modern_chinese': ['linguistics','china specific'],
    'nutrition': ['other'],
    'philosophy': ['philosophy'],
    'professional_accounting': ['business'],
    'professional_law': ['law'],
    'professional_medicine': ['other'],
    'professional_psychology': ['psychology'],
    'public_relations': ['politics'],
    'security_study': ['politics'],
    'sociology': ['culture'],
    'sports_science': ['other'],
    'traditional_chinese_medicine': ['other','china specific'],
    'virology': ['biology'],
    'world_history':['history'],
    'world_religions': ['global'],
}
categories = {
    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering', 'statistics'],
    'Humanities': ['history', 'philosophy', 'law', 'arts', 'literature', 'global'],
    'Social Science': ['linguistics','business', 'politics', 'culture', 'economics', 'geography', 'psychology', 'education', 'sociology'],
    'Other':['other'],
    'China specific': ['china specific'],
}
category2subject = {}
...
names = [
    ['1-1', 'article_recitation'],
    ['1-2', 'knowledge_question_answering'],
    ['2-1', 'document_proofreading'],
    ['2-2', 'dispute_focus_identification'],
    ['2-3', 'marital_disputes_identification'],
    ['2-4', 'issue_topic_identification'],
    ['2-5', 'reading_comprehension'],
    ['2-6', 'named_entity_recognition'],
    ['2-7', 'opinion_summarization'],
    ['2-8', 'argument_mining'],
    ['2-9', 'event_detection'],
    ['2-10', 'trigger_word_extraction'],
    ['3-1', 'fact_based_article_prediction'],
    ['3-2', 'scene_based_article_prediction'],
    ['3-3', 'charge_prediction'],
    ['3-4', 'prison_term_prediction_wo_article'],
    ['3-5', 'prison_term_prediction_w_article'],
    ['3-6', 'case_analysis'],
    ['3-7', 'criminal_damages_calculation'],
    ['3-8', 'consultation'],
]
lawbench_summary_groups = []
...
leval_summary_groups = [
    {'name': 'leval', 'subsets': ['LEval_coursera', 'LEval_gsm100', 'LEval_quality', 'LEval_tpo', 'LEval_topic_retrieval', 'LEval_financialqa', 'LEval_gov_report_summ', 'LEval_legal_contract_qa', 'LEval_meeting_summ', 'LEval_multidocqa', 'LEval_narrativeqa', 'LEval_nq', 'LEval_news_summ', 'LEval_paper_assistant', 'LEval_patent_summ', 'LEval_review_summ', 'LEval_scientificqa', 'LEval_tvshow_summ']},
]
len_levels = ['16k', '32k', '64k', '128k', '256k']
subsets_lveval_loogle_SD_mixup = [
    'LVEval_loogle_SD_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_cmrc_mixup = [
    'LVEval_cmrc_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_multifieldqa_en_mixup = [
    'LVEval_multifieldqa_en_mixup' + '_' + len_level
    for len_level in len_levels
]
subsets_lveval_multifieldqa_zh_mixup = [
    'LVEval_multifieldqa_zh_mixup' + '_' + len_level
    for len_level in len_levels
]
subsets_lveval_dureader_mixup = [
    'LVEval_dureader_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_loogle_CR_mixup = [
    'LVEval_loogle_CR_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_loogle_MIR_mixup = [
    'LVEval_loogle_MIR_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_hotpotwikiqa_mixup = [
    'LVEval_hotpotwikiqa_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_lic_mixup = [
    'LVEval_lic_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_factrecall_en = [
    'LVEval_factrecall_en' + '_' + len_level for len_level in len_levels
]
subsets_lveval_factrecall_zh = [
    'LVEval_factrecall_zh' + '_' + len_level for len_level in len_levels
]
subsets_lveval_single_hop_qa = (
@@ -64,47 +64,47 @@ subsets_lveval_qa = (
lveval_summary_groups = [
    {
        'name': 'LVEval_loogle_SD_mixup',
        'subsets': subsets_lveval_loogle_SD_mixup,
    },
    {'name': 'LVEval_cmrc_mixup', 'subsets': subsets_lveval_cmrc_mixup},
    {
        'name': 'LVEval_multifieldqa_en_mixup',
        'subsets': subsets_lveval_multifieldqa_en_mixup,
    },
    {
        'name': 'LVEval_multifieldqa_zh_mixup',
        'subsets': subsets_lveval_multifieldqa_zh_mixup,
    },
    {
        'name': 'LVEval_dureader_mixup',
        'subsets': subsets_lveval_dureader_mixup,
    },
    {
        'name': 'LVEval_loogle_CR_mixup',
        'subsets': subsets_lveval_loogle_CR_mixup,
    },
    {
        'name': 'LVEval_loogle_MIR_mixup',
        'subsets': subsets_lveval_loogle_MIR_mixup,
    },
    {
        'name': 'LVEval_hotpotwikiqa_mixup',
        'subsets': subsets_lveval_hotpotwikiqa_mixup,
    },
    {'name': 'LVEval_lic_mixup', 'subsets': subsets_lveval_lic_mixup},
    {'name': 'LVEval_factrecall_en', 'subsets': subsets_lveval_factrecall_en},
    {'name': 'LVEval_factrecall_zh', 'subsets': subsets_lveval_factrecall_zh},
    {'name': 'LVEval_single_hop_qa', 'subsets': subsets_lveval_single_hop_qa},
    {
        'name': 'LVEval_single_hop_cqa',
        'subsets': subsets_lveval_single_hop_cqa,
    },
    {'name': 'LVEval_multi_hop_qa', 'subsets': subsets_lveval_multi_hop_qa},
    {'name': 'LVEval_multi_hop_cqa', 'subsets': subsets_lveval_multi_hop_cqa},
    {
        'name': 'LVEval_factrecall_cqa',
        'subsets': subsets_lveval_factrecall_cqa,
    },
    {'name': 'LVEval_qa', 'subsets': subsets_lveval_qa},
]
ALL_LANGUAGES = ['bn', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'sw', 'te', 'th', 'zh']
LATIN_LANGUAGES = ['de', 'en', 'es', 'fr', 'sw']
NON_LATIN_LANGUAGES = ['bn', 'ja', 'ru', 'te', 'th', 'zh']
mgsm_summary_groups = [
    {'name': 'mgsm_latin', 'subsets': [f'mgsm_{lang}' for lang in LATIN_LANGUAGES]},
...
scibench_summary_groups = []
scibench_tasks = ['atkins', 'calculus', 'chemmc', 'class', 'diff', 'fund', 'matter', 'quan', 'stat', 'thermo']
for suffix in ['', '_zs-cot', '_fs', '_fs-cot']:
    subsets = [f'scibench-{subset}{suffix}' for subset in scibench_tasks]
    scibench_summary_groups.append({'name': f'scibench{suffix}', 'subsets': subsets})
@@ -71,4 +71,3 @@ for group in _base_summary_groups:
    group['name'] = group['name'] + '_zh'
    group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
    teval_summary_groups.append(group)
xiezhi_summary_groups = []
_xiezhi = ['xiezhi-spec_eng', 'xiezhi-spec_chn', 'xiezhi-inter_eng', 'xiezhi-inter_chn']
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
@@ -2,7 +2,7 @@ from mmengine.config import read_base
with read_base():
    from .groups.infinitebench import infinitebench_summary_groups
summarizer = dict(
    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -16,5 +16,5 @@ summarizer = dict(
        ['sanitized_mbpp', 'score'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -50,7 +50,7 @@ summarizer = dict(
        'lawbench-3-7-criminal_damages_calculation-1-shot',
        'lawbench-3-8-consultation-1-shot',
    ],
    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
...
@@ -13,11 +13,11 @@ with read_base():
other_summary_groups = []
other_summary_groups.append({'name': 'Exam', 'subsets': ['ceval','agieval','mmlu','cmmlu','GaokaoBench','ARC-c','ARC-e']})
other_summary_groups.append({'name': 'Language', 'subsets': ['WiC','chid-dev','afqmc-dev','WSC','tydiqa-goldp','flores_100']})
other_summary_groups.append({'name': 'Knowledge', 'subsets': ['BoolQ','commonsense_qa','triviaqa','nq']})
other_summary_groups.append({'name': 'Understanding', 'subsets': ['C3','race-middle','race-high','openbookqa_fact','csl_dev','lcsts','Xsum','eprstmt-dev','lambada']})
other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli','ocnli','AX_b','AX_g','RTE','COPA','ReCoRD','hellaswag','piqa','siqa','math','gsm8k','drop','openai_humaneval','mbpp','bbh']})
other_summary_groups.append({'name': 'Overall', 'subsets': ['Exam', 'Language', 'Knowledge', 'Understanding', 'Reasoning']})
summarizer = dict(
@@ -30,11 +30,11 @@ summarizer = dict(
        'Reasoning',
        '--------- 考试 Exam ---------', # category
        # 'Mixed', # subcategory
        'ceval',
        'agieval',
        'mmlu',
        'cmmlu',
        'GaokaoBench',
        'ARC-c',
        'ARC-e',
        '--------- 语言 Language ---------', # category
@@ -92,8 +92,8 @@ summarizer = dict(
        'openai_humaneval',
        'mbpp',
        # '综合推理', # subcategory
        'bbh',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -5,110 +5,110 @@ with read_base():
summarizer = dict(
    dataset_abbrs=[
        '----------------------------------------',
        '--------- LVEval All ---------', # category
        '----------------------------------------',
        'LVEval_qa',
        '----------------------------------------',
        '--------- LVEval Tasks All ---------', # category
        '----------------------------------------',
        'LVEval_single_hop_qa',
        'LVEval_single_hop_cqa',
        'LVEval_multi_hop_qa',
        'LVEval_multi_hop_cqa',
        'LVEval_factrecall_cqa',
        '----------------------------------------',
        '--------- LVEval Datasets All ---------', # category
        '----------------------------------------',
        'LVEval_loogle_SD_mixup',
        'LVEval_cmrc_mixup',
        'LVEval_multifieldqa_en_mixup',
        'LVEval_multifieldqa_zh_mixup',
        'LVEval_dureader_mixup',
        'LVEval_loogle_CR_mixup',
        'LVEval_loogle_MIR_mixup',
        'LVEval_hotpotwikiqa_mixup',
        'LVEval_lic_mixup',
        'LVEval_factrecall_en',
        'LVEval_factrecall_zh',
        '----------------------------------------',
        '--------- LVEval Single_Hop QA ---------', # category
        '----------------------------------------',
        'LVEval_loogle_SD_mixup_16k',
        'LVEval_loogle_SD_mixup_32k',
        'LVEval_loogle_SD_mixup_64k',
        'LVEval_loogle_SD_mixup_128k',
        'LVEval_loogle_SD_mixup_256k',
        '----------------------------------------',
        'LVEval_cmrc_mixup_16k',
        'LVEval_cmrc_mixup_32k',
        'LVEval_cmrc_mixup_64k',
        'LVEval_cmrc_mixup_128k',
        'LVEval_cmrc_mixup_256k',
        '----------------------------------------',
        '--------- LVEval Single_Hop CQA ---------', # category
        '----------------------------------------',
        'LVEval_multifieldqa_en_mixup_16k',
        'LVEval_multifieldqa_en_mixup_32k',
        'LVEval_multifieldqa_en_mixup_64k',
        'LVEval_multifieldqa_en_mixup_128k',
        'LVEval_multifieldqa_en_mixup_256k',
        '----------------------------------------',
        'LVEval_multifieldqa_zh_mixup_16k',
        'LVEval_multifieldqa_zh_mixup_32k',
        'LVEval_multifieldqa_zh_mixup_64k',
        'LVEval_multifieldqa_zh_mixup_128k',
        'LVEval_multifieldqa_zh_mixup_256k',
        '----------------------------------------',
        '--------- LVEval Multi_Hop QA ---------', # category
        '----------------------------------------',
        'LVEval_dureader_mixup_16k',
        'LVEval_dureader_mixup_32k',
        'LVEval_dureader_mixup_64k',
        'LVEval_dureader_mixup_128k',
        'LVEval_dureader_mixup_256k',
        '----------------------------------------',
        'LVEval_loogle_CR_mixup_16k',
        'LVEval_loogle_CR_mixup_32k',
        'LVEval_loogle_CR_mixup_64k',
        'LVEval_loogle_CR_mixup_128k',
        'LVEval_loogle_CR_mixup_256k',
        '----------------------------------------',
        'LVEval_loogle_MIR_mixup_16k',
        'LVEval_loogle_MIR_mixup_32k',
        'LVEval_loogle_MIR_mixup_64k',
        'LVEval_loogle_MIR_mixup_128k',
        'LVEval_loogle_MIR_mixup_256k',
        '----------------------------------------',
        '--------- LVEval Multi_Hop CQA ---------', # category
        '----------------------------------------',
        'LVEval_hotpotwikiqa_mixup_16k',
        'LVEval_hotpotwikiqa_mixup_32k',
        'LVEval_hotpotwikiqa_mixup_64k',
        'LVEval_hotpotwikiqa_mixup_128k',
        'LVEval_hotpotwikiqa_mixup_256k',
        '----------------------------------------',
        'LVEval_lic_mixup_16k',
        'LVEval_lic_mixup_32k',
        'LVEval_lic_mixup_64k',
        'LVEval_lic_mixup_128k',
        'LVEval_lic_mixup_256k',
        '----------------------------------------',
        '--------- LVEval Factrecall CQA ---------', # category
        '----------------------------------------',
        'LVEval_factrecall_en_16k',
        'LVEval_factrecall_en_32k',
        'LVEval_factrecall_en_64k',
        'LVEval_factrecall_en_128k',
        'LVEval_factrecall_en_256k',
        '----------------------------------------',
        'LVEval_factrecall_zh_16k',
        'LVEval_factrecall_zh_32k',
        'LVEval_factrecall_zh_64k',
        'LVEval_factrecall_zh_128k',
        'LVEval_factrecall_zh_256k',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []
    ),
)
@@ -21,5 +21,5 @@ summarizer = dict(
        'mathbench-circular-and-cloze-agent',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)
@@ -15,5 +15,5 @@ summarizer = dict(
        'mathbench-circular-and-cloze',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)