"node.h" did not exist on "f959a475b7d7c5cd7aaac05fe4b944bbf5cd22b1"
Unverified Commit aa2dd2b5 authored by Fengzhe Zhou, committed by GitHub

[Format] Add config lints (#892)

parent 3dbba119
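The lint applied in this commit normalizes string literals in the configs from double quotes to single quotes, as every hunk below shows. Purely as an illustration of that style check (not the project's actual lint tooling, whose configuration is not part of this diff), here is a minimal Python sketch that flags double-quoted literals which could safely be written with single quotes:

import io
import tokenize

def find_double_quoted_strings(source: str):
    # Minimal sketch, not the repository's actual lint: flag plain
    # double-quoted string literals containing no single quote, i.e. the
    # ones this commit rewrites to single-quoted form.
    issues = []
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.STRING:
            body = tok.string.lstrip('rbufRBUF')
            if body.startswith('"') and not body.startswith('"""') and "'" not in body:
                issues.append((tok.start[0], tok.string))
    return issues

print(find_double_quoted_strings('"name": "average_cn",\n'))
# [(1, '"name"'), (1, '"average_cn"')]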
@@ -115,36 +115,36 @@ agent_summary_groups = [
other_summary_groups = [
{
"name": "average_cn",
"subsets": [
["language_zh_perf_4_and_non_mcq", "naive_average"],
["knowledge_cn", "perf_4"],
["reasonbench_cn_circular", "perf_circular"],
["math_perf_4_and_fill_in_blank_cn", "naive_average"],
["code_cn", "naive_average"],
["agent_cn", "naive_average"],
'name': 'average_cn',
'subsets': [
['language_zh_perf_4_and_non_mcq', 'naive_average'],
['knowledge_cn', 'perf_4'],
['reasonbench_cn_circular', 'perf_circular'],
['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
['code_cn', 'naive_average'],
['agent_cn', 'naive_average'],
],
},
{
"name": "average_en",
"subsets": [
["language_en_perf_4_and_non_mcq", "naive_average"],
["compassbench_v1_knowledge-mixed-cloze_en", "score"],
["reasonbench_en_circular", "perf_circular"],
["math_perf_4_and_fill_in_blank_en", "naive_average"],
["code_en", "naive_average"],
["agent_en", "naive_average"],
'name': 'average_en',
'subsets': [
['language_en_perf_4_and_non_mcq', 'naive_average'],
['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
['reasonbench_en_circular', 'perf_circular'],
['math_perf_4_and_fill_in_blank_en', 'naive_average'],
['code_en', 'naive_average'],
['agent_en', 'naive_average'],
],
},
{
"name": "average",
"subsets": [
["language_perf_4_and_non_mcq", "naive_average"],
["knowledge_perf_4_and_cloze", "naive_average"],
["reasonbench", "perf_circular"],
["math_perf_4_and_fill_in_blank", "naive_average"],
["code", "naive_average"],
["agent", "naive_average"],
'name': 'average',
'subsets': [
['language_perf_4_and_non_mcq', 'naive_average'],
['knowledge_perf_4_and_cloze', 'naive_average'],
['reasonbench', 'perf_circular'],
['math_perf_4_and_fill_in_blank', 'naive_average'],
['code', 'naive_average'],
['agent', 'naive_average'],
],
},
]
@@ -223,5 +223,5 @@ summarizer = dict(
['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
['plugin_eval-mus-p10_one_review', 'naive_average'],
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
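The summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []) idiom touched in several hunks of this diff simply concatenates every module-level list whose name ends in _summary_groups. A minimal sketch with trimmed-down, illustrative contents (the subset entries below are placeholders, not the full real configs):

agent_summary_groups = [{'name': 'agent_cn', 'subsets': ['placeholder_agent_subset']}]
other_summary_groups = [{'name': 'average_cn', 'subsets': [['knowledge_cn', 'perf_4']]}]

# sum(list_of_lists, []) concatenates the lists; at module level locals() is
# the module namespace, so every *_summary_groups variable is picked up.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in summary_groups])  # ['agent_cn', 'average_cn']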
@@ -60,63 +60,63 @@ ceval_category_weights = {
}
mmlu_category_weights = {
"business_ethics": {"accuracy - clean": 44, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 1},
"security_studies": {"accuracy - clean": 188, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 47, "accuracy - not labeled": 0},
"high_school_us_history": {"accuracy - clean": 42, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 161},
"moral_disputes": {"accuracy - clean": 105, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 168, "accuracy - not labeled": 59},
"philosophy": {"accuracy - clean": 81, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 187, "accuracy - not labeled": 31},
"public_relations": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
"high_school_microeconomics": {"accuracy - clean": 82, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 146, "accuracy - not labeled": 0},
"human_sexuality": {"accuracy - clean": 108, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 4},
"professional_accounting": {"accuracy - clean": 88, "accuracy - input contaminated": 40, "accuracy - input-and-label contaminated": 152, "accuracy - not labeled": 1},
"high_school_government_and_politics": {"accuracy - clean": 104, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 82, "accuracy - not labeled": 0},
"sociology": {"accuracy - clean": 105, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 91, "accuracy - not labeled": 0},
"conceptual_physics": {"accuracy - clean": 79, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 147, "accuracy - not labeled": 0},
"human_aging": {"accuracy - clean": 208, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 13, "accuracy - not labeled": 0},
"high_school_psychology": {"accuracy - clean": 108, "accuracy - input contaminated": 26, "accuracy - input-and-label contaminated": 162, "accuracy - not labeled": 248},
"jurisprudence": {"accuracy - clean": 59, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 0},
"moral_scenarios": {"accuracy - clean": 320, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 574},
"college_medicine": {"accuracy - clean": 107, "accuracy - input contaminated": 16, "accuracy - input-and-label contaminated": 44, "accuracy - not labeled": 5},
"high_school_world_history": {"accuracy - clean": 61, "accuracy - input contaminated": 2, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 173},
"virology": {"accuracy - clean": 104, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 58, "accuracy - not labeled": 0},
"high_school_statistics": {"accuracy - clean": 96, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
"nutrition": {"accuracy - clean": 172, "accuracy - input contaminated": 11, "accuracy - input-and-label contaminated": 98, "accuracy - not labeled": 24},
"abstract_algebra": {"accuracy - clean": 84, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 7, "accuracy - not labeled": 0},
"high_school_geography": {"accuracy - clean": 91, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 105, "accuracy - not labeled": 0},
"econometrics": {"accuracy - clean": 62, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 38, "accuracy - not labeled": 0},
"marketing": {"accuracy - clean": 115, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 101, "accuracy - not labeled": 2},
"high_school_chemistry": {"accuracy - clean": 108, "accuracy - input contaminated": 25, "accuracy - input-and-label contaminated": 69, "accuracy - not labeled": 0},
"prehistory": {"accuracy - clean": 154, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 107, "accuracy - not labeled": 57},
"college_physics": {"accuracy - clean": 25, "accuracy - input contaminated": 20, "accuracy - input-and-label contaminated": 57, "accuracy - not labeled": 0},
"management": {"accuracy - clean": 35, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 62, "accuracy - not labeled": 0},
"college_biology": {"accuracy - clean": 91, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 0},
"high_school_biology": {"accuracy - clean": 128, "accuracy - input contaminated": 17, "accuracy - input-and-label contaminated": 135, "accuracy - not labeled": 29},
"high_school_physics": {"accuracy - clean": 42, "accuracy - input contaminated": 28, "accuracy - input-and-label contaminated": 80, "accuracy - not labeled": 0},
"logical_fallacies": {"accuracy - clean": 133, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 24, "accuracy - not labeled": 0},
"medical_genetics": {"accuracy - clean": 49, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 43, "accuracy - not labeled": 1},
"machine_learning": {"accuracy - clean": 71, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
"professional_law": {"accuracy - clean": 401, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 1119},
"professional_psychology": {"accuracy - clean": 265, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 27, "accuracy - not labeled": 310},
"global_facts": {"accuracy - clean": 89, "accuracy - input contaminated": 5, "accuracy - input-and-label contaminated": 5, "accuracy - not labeled": 0},
"us_foreign_policy": {"accuracy - clean": 71, "accuracy - input contaminated": 3, "accuracy - input-and-label contaminated": 25, "accuracy - not labeled": 0},
"international_law": {"accuracy - clean": 73, "accuracy - input contaminated": 1, "accuracy - input-and-label contaminated": 46, "accuracy - not labeled": 0},
"clinical_knowledge": {"accuracy - clean": 172, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 86, "accuracy - not labeled": 0},
"high_school_mathematics": {"accuracy - clean": 178, "accuracy - input contaminated": 59, "accuracy - input-and-label contaminated": 32, "accuracy - not labeled": 0},
"high_school_computer_science": {"accuracy - clean": 62, "accuracy - input contaminated": 7, "accuracy - input-and-label contaminated": 28, "accuracy - not labeled": 2},
"college_computer_science": {"accuracy - clean": 68, "accuracy - input contaminated": 15, "accuracy - input-and-label contaminated": 15, "accuracy - not labeled": 1},
"electrical_engineering": {"accuracy - clean": 75, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 61, "accuracy - not labeled": 0},
"college_mathematics": {"accuracy - clean": 61, "accuracy - input contaminated": 13, "accuracy - input-and-label contaminated": 26, "accuracy - not labeled": 0},
"computer_security": {"accuracy - clean": 55, "accuracy - input contaminated": 8, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
"high_school_macroeconomics": {"accuracy - clean": 102, "accuracy - input contaminated": 14, "accuracy - input-and-label contaminated": 173, "accuracy - not labeled": 100},
"astronomy": {"accuracy - clean": 112, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 35, "accuracy - not labeled": 0},
"college_chemistry": {"accuracy - clean": 46, "accuracy - input contaminated": 19, "accuracy - input-and-label contaminated": 34, "accuracy - not labeled": 0},
"high_school_european_history": {"accuracy - clean": 41, "accuracy - input contaminated": 0, "accuracy - input-and-label contaminated": 0, "accuracy - not labeled": 123},
"miscellaneous": {"accuracy - clean": 256, "accuracy - input contaminated": 9, "accuracy - input-and-label contaminated": 40, "accuracy - not labeled": 477},
"formal_logic": {"accuracy - clean": 92, "accuracy - input contaminated": 12, "accuracy - input-and-label contaminated": 21, "accuracy - not labeled": 0},
"elementary_mathematics": {"accuracy - clean": 155, "accuracy - input contaminated": 31, "accuracy - input-and-label contaminated": 103, "accuracy - not labeled": 88},
"world_religions": {"accuracy - clean": 130, "accuracy - input contaminated": 4, "accuracy - input-and-label contaminated": 36, "accuracy - not labeled": 0},
"professional_medicine": {"accuracy - clean": 191, "accuracy - input contaminated": 43, "accuracy - input-and-label contaminated": 1, "accuracy - not labeled": 36},
"anatomy": {"accuracy - clean": 52, "accuracy - input contaminated": 6, "accuracy - input-and-label contaminated": 76, "accuracy - not labeled": 0},
'business_ethics': {'accuracy - clean': 44, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 1},
'security_studies': {'accuracy - clean': 188, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 47, 'accuracy - not labeled': 0},
'high_school_us_history': {'accuracy - clean': 42, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 161},
'moral_disputes': {'accuracy - clean': 105, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 168, 'accuracy - not labeled': 59},
'philosophy': {'accuracy - clean': 81, 'accuracy - input contaminated': 11, 'accuracy - input-and-label contaminated': 187, 'accuracy - not labeled': 31},
'public_relations': {'accuracy - clean': 75, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 26, 'accuracy - not labeled': 0},
'high_school_microeconomics': {'accuracy - clean': 82, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 146, 'accuracy - not labeled': 0},
'human_sexuality': {'accuracy - clean': 108, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 15, 'accuracy - not labeled': 4},
'professional_accounting': {'accuracy - clean': 88, 'accuracy - input contaminated': 40, 'accuracy - input-and-label contaminated': 152, 'accuracy - not labeled': 1},
'high_school_government_and_politics': {'accuracy - clean': 104, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 82, 'accuracy - not labeled': 0},
'sociology': {'accuracy - clean': 105, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 91, 'accuracy - not labeled': 0},
'conceptual_physics': {'accuracy - clean': 79, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 147, 'accuracy - not labeled': 0},
'human_aging': {'accuracy - clean': 208, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 13, 'accuracy - not labeled': 0},
'high_school_psychology': {'accuracy - clean': 108, 'accuracy - input contaminated': 26, 'accuracy - input-and-label contaminated': 162, 'accuracy - not labeled': 248},
'jurisprudence': {'accuracy - clean': 59, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 43, 'accuracy - not labeled': 0},
'moral_scenarios': {'accuracy - clean': 320, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 574},
'college_medicine': {'accuracy - clean': 107, 'accuracy - input contaminated': 16, 'accuracy - input-and-label contaminated': 44, 'accuracy - not labeled': 5},
'high_school_world_history': {'accuracy - clean': 61, 'accuracy - input contaminated': 2, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 173},
'virology': {'accuracy - clean': 104, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 58, 'accuracy - not labeled': 0},
'high_school_statistics': {'accuracy - clean': 96, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
'nutrition': {'accuracy - clean': 172, 'accuracy - input contaminated': 11, 'accuracy - input-and-label contaminated': 98, 'accuracy - not labeled': 24},
'abstract_algebra': {'accuracy - clean': 84, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 7, 'accuracy - not labeled': 0},
'high_school_geography': {'accuracy - clean': 91, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 105, 'accuracy - not labeled': 0},
'econometrics': {'accuracy - clean': 62, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 38, 'accuracy - not labeled': 0},
'marketing': {'accuracy - clean': 115, 'accuracy - input contaminated': 15, 'accuracy - input-and-label contaminated': 101, 'accuracy - not labeled': 2},
'high_school_chemistry': {'accuracy - clean': 108, 'accuracy - input contaminated': 25, 'accuracy - input-and-label contaminated': 69, 'accuracy - not labeled': 0},
'prehistory': {'accuracy - clean': 154, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 107, 'accuracy - not labeled': 57},
'college_physics': {'accuracy - clean': 25, 'accuracy - input contaminated': 20, 'accuracy - input-and-label contaminated': 57, 'accuracy - not labeled': 0},
'management': {'accuracy - clean': 35, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 62, 'accuracy - not labeled': 0},
'college_biology': {'accuracy - clean': 91, 'accuracy - input contaminated': 12, 'accuracy - input-and-label contaminated': 40, 'accuracy - not labeled': 0},
'high_school_biology': {'accuracy - clean': 128, 'accuracy - input contaminated': 17, 'accuracy - input-and-label contaminated': 135, 'accuracy - not labeled': 29},
'high_school_physics': {'accuracy - clean': 42, 'accuracy - input contaminated': 28, 'accuracy - input-and-label contaminated': 80, 'accuracy - not labeled': 0},
'logical_fallacies': {'accuracy - clean': 133, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 24, 'accuracy - not labeled': 0},
'medical_genetics': {'accuracy - clean': 49, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 43, 'accuracy - not labeled': 1},
'machine_learning': {'accuracy - clean': 71, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 32, 'accuracy - not labeled': 0},
'professional_law': {'accuracy - clean': 401, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 5, 'accuracy - not labeled': 1119},
'professional_psychology': {'accuracy - clean': 265, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 27, 'accuracy - not labeled': 310},
'global_facts': {'accuracy - clean': 89, 'accuracy - input contaminated': 5, 'accuracy - input-and-label contaminated': 5, 'accuracy - not labeled': 0},
'us_foreign_policy': {'accuracy - clean': 71, 'accuracy - input contaminated': 3, 'accuracy - input-and-label contaminated': 25, 'accuracy - not labeled': 0},
'international_law': {'accuracy - clean': 73, 'accuracy - input contaminated': 1, 'accuracy - input-and-label contaminated': 46, 'accuracy - not labeled': 0},
'clinical_knowledge': {'accuracy - clean': 172, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 86, 'accuracy - not labeled': 0},
'high_school_mathematics': {'accuracy - clean': 178, 'accuracy - input contaminated': 59, 'accuracy - input-and-label contaminated': 32, 'accuracy - not labeled': 0},
'high_school_computer_science': {'accuracy - clean': 62, 'accuracy - input contaminated': 7, 'accuracy - input-and-label contaminated': 28, 'accuracy - not labeled': 2},
'college_computer_science': {'accuracy - clean': 68, 'accuracy - input contaminated': 15, 'accuracy - input-and-label contaminated': 15, 'accuracy - not labeled': 1},
'electrical_engineering': {'accuracy - clean': 75, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 61, 'accuracy - not labeled': 0},
'college_mathematics': {'accuracy - clean': 61, 'accuracy - input contaminated': 13, 'accuracy - input-and-label contaminated': 26, 'accuracy - not labeled': 0},
'computer_security': {'accuracy - clean': 55, 'accuracy - input contaminated': 8, 'accuracy - input-and-label contaminated': 36, 'accuracy - not labeled': 0},
'high_school_macroeconomics': {'accuracy - clean': 102, 'accuracy - input contaminated': 14, 'accuracy - input-and-label contaminated': 173, 'accuracy - not labeled': 100},
'astronomy': {'accuracy - clean': 112, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 35, 'accuracy - not labeled': 0},
'college_chemistry': {'accuracy - clean': 46, 'accuracy - input contaminated': 19, 'accuracy - input-and-label contaminated': 34, 'accuracy - not labeled': 0},
'high_school_european_history': {'accuracy - clean': 41, 'accuracy - input contaminated': 0, 'accuracy - input-and-label contaminated': 0, 'accuracy - not labeled': 123},
'miscellaneous': {'accuracy - clean': 256, 'accuracy - input contaminated': 9, 'accuracy - input-and-label contaminated': 40, 'accuracy - not labeled': 477},
'formal_logic': {'accuracy - clean': 92, 'accuracy - input contaminated': 12, 'accuracy - input-and-label contaminated': 21, 'accuracy - not labeled': 0},
'elementary_mathematics': {'accuracy - clean': 155, 'accuracy - input contaminated': 31, 'accuracy - input-and-label contaminated': 103, 'accuracy - not labeled': 88},
'world_religions': {'accuracy - clean': 130, 'accuracy - input contaminated': 4, 'accuracy - input-and-label contaminated': 36, 'accuracy - not labeled': 0},
'professional_medicine': {'accuracy - clean': 191, 'accuracy - input contaminated': 43, 'accuracy - input-and-label contaminated': 1, 'accuracy - not labeled': 36},
'anatomy': {'accuracy - clean': 52, 'accuracy - input contaminated': 6, 'accuracy - input-and-label contaminated': 76, 'accuracy - not labeled': 0},
}
@@ -166,7 +166,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
'weights': weights,
}
)
for dataset_abbr, subsets in mmlu_name_and_subsets:
weights = {f'lukaemon_mmlu_{i}': mmlu_category_weights[i][metric_name] for i in subsets}
subsets = [[f'lukaemon_mmlu_{i}', metric_name] for i in subsets]
@@ -178,7 +178,7 @@ for metric_name in ['accuracy - clean', 'accuracy - input contaminated', 'accura
'weights': weights,
}
)
summary_groups.append(
{
'name': 'hellaswag',
......
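The loops above attach a weights dict (per-subset question counts for each contamination bucket) to each MMLU summary group. Assuming the summarizer uses these weights for a weighted mean over subsets, which the per-bucket counts suggest but which is not shown in this diff, a minimal sketch; the accuracy values are made up, while the weights are the 'accuracy - clean' counts for anatomy and virology from the table above:

def weighted_group_score(scores, weights):
    # Weighted mean of per-subset scores; subsets with zero total weight drop out.
    total = sum(weights[name] for name in scores)
    if total == 0:
        return 0.0
    return sum(scores[name] * weights[name] for name in scores) / total

scores = {'lukaemon_mmlu_anatomy': 55.0, 'lukaemon_mmlu_virology': 48.0}  # made-up accuracies
weights = {'lukaemon_mmlu_anatomy': 52, 'lukaemon_mmlu_virology': 104}    # 'accuracy - clean' counts
print(round(weighted_group_score(scores, weights), 2))  # 50.33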
@@ -14,5 +14,5 @@ with read_base():
from .groups.mgsm import mgsm_summary_groups
summarizer = dict(
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
sub_categories = {
'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
'business': ['business_ethics', 'management', 'marketing'],
'biology': ['college_biology', 'high_school_biology'],
'chemistry': ['college_chemistry', 'high_school_chemistry'],
'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
'engineering': ['electrical_engineering'],
'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
'geography': ['high_school_geography'],
'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
'psychology': ['high_school_psychology', 'professional_psychology'],
'culture': ['human_sexuality', 'sociology'],
'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'],
'health': ['anatomy', 'clinical_knowledge', 'college_medicine', 'human_aging', 'medical_genetics', 'nutrition', 'professional_medicine', 'virology'],
'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'],
'business': ['business_ethics', 'management', 'marketing'],
'biology': ['college_biology', 'high_school_biology'],
'chemistry': ['college_chemistry', 'high_school_chemistry'],
'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'],
'economics': ['econometrics', 'high_school_macroeconomics', 'high_school_microeconomics'],
'engineering': ['electrical_engineering'],
'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'],
'other': ['global_facts', 'miscellaneous', 'professional_accounting'],
'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'],
'geography': ['high_school_geography'],
'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'],
'psychology': ['high_school_psychology', 'professional_psychology'],
'culture': ['human_sexuality', 'sociology'],
'law': ['international_law', 'jurisprudence', 'professional_law']
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
"humanities": ["history", "philosophy", "law"],
"social_sciences": ["politics", "culture", "economics", "geography", "psychology"],
"other": ["other", "business", "health"],
'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
'humanities': ['history', 'philosophy', 'law'],
'social_sciences': ['politics', 'culture', 'economics', 'geography', 'psychology'],
'other': ['other', 'business', 'health'],
}
category2subject = {}
......
@@ -392,4 +392,4 @@ cibench_summary_groups.extend([
'subsets': [i[:2] for i in cibench_math],
'weights': {f'{k[0]}@{k[1]}': k[-1] for k in cibench_math},
},
])
\ No newline at end of file
])
subcategories = {
"agronomy": ['other'],
"anatomy": ['biology'],
"ancient_chinese": ['linguistics','china specific'],
"arts": ['arts'],
"astronomy": ['physics'],
"business_ethics": ['business'],
"chinese_civil_service_exam": ['politics','china specific'],
"chinese_driving_rule": ['other','china specific'],
"chinese_food_culture": ['culture','china specific'],
"chinese_foreign_policy": ['politics','china specific'],
"chinese_history":['history','china specific'],
"chinese_literature": ['literature','china specific'],
"chinese_teacher_qualification": ['education','china specific'],
"college_actuarial_science":['math'],
"college_education":['education'],
"college_engineering_hydrology": ['engineering'],
"college_law": ['law'],
"college_mathematics": ['math'],
"college_medical_statistics":['statistics'],
"clinical_knowledge": ['other'],
"college_medicine": ['other'],
"computer_science": ['computer science'],
"computer_security": ['other'],
"conceptual_physics": ['physics'],
"construction_project_management": ['other','china specific'],
"economics": ['economics'],
"education": ['education'],
"elementary_chinese":['linguistics','china specific'],
"elementary_commonsense":['other','china specific'],
"elementary_information_and_technology": ['other'],
"electrical_engineering": ['engineering'],
"elementary_mathematics": ['math'],
"ethnology": ['culture','china specific'],
"food_science": ['other'],
"genetics": ['biology'],
"global_facts": ['global'],
"high_school_biology": ['biology'],
"high_school_chemistry": ['chemistry'],
"high_school_geography": ['geography'],
"high_school_mathematics": ['math'],
"high_school_physics": ['physics'],
"high_school_politics": ['politics','china specific'],
"human_sexuality": ['other'],
"international_law": ['law'],
"journalism": ['sociology'],
"jurisprudence": ['law'],
"legal_and_moral_basis": ['other'],
"logical": ['philosophy'],
"machine_learning": ['computer science'],
"management": ['business'],
"marketing": ['business'],
"marxist_theory": ['philosophy'],
"modern_chinese": ['linguistics','china specific'],
"nutrition": ['other'],
"philosophy": ['philosophy'],
"professional_accounting": ['business'],
"professional_law": ['law'],
"professional_medicine": ['other'],
"professional_psychology": ['psychology'],
"public_relations": ['politics'],
"security_study": ['politics'],
"sociology": ['culture'],
"sports_science": ['other'],
"traditional_chinese_medicine": ['other','china specific'],
"virology": ['biology'],
"world_history":['history'],
"world_religions": ['global'],
'agronomy': ['other'],
'anatomy': ['biology'],
'ancient_chinese': ['linguistics','china specific'],
'arts': ['arts'],
'astronomy': ['physics'],
'business_ethics': ['business'],
'chinese_civil_service_exam': ['politics','china specific'],
'chinese_driving_rule': ['other','china specific'],
'chinese_food_culture': ['culture','china specific'],
'chinese_foreign_policy': ['politics','china specific'],
'chinese_history':['history','china specific'],
'chinese_literature': ['literature','china specific'],
'chinese_teacher_qualification': ['education','china specific'],
'college_actuarial_science':['math'],
'college_education':['education'],
'college_engineering_hydrology': ['engineering'],
'college_law': ['law'],
'college_mathematics': ['math'],
'college_medical_statistics':['statistics'],
'clinical_knowledge': ['other'],
'college_medicine': ['other'],
'computer_science': ['computer science'],
'computer_security': ['other'],
'conceptual_physics': ['physics'],
'construction_project_management': ['other','china specific'],
'economics': ['economics'],
'education': ['education'],
'elementary_chinese':['linguistics','china specific'],
'elementary_commonsense':['other','china specific'],
'elementary_information_and_technology': ['other'],
'electrical_engineering': ['engineering'],
'elementary_mathematics': ['math'],
'ethnology': ['culture','china specific'],
'food_science': ['other'],
'genetics': ['biology'],
'global_facts': ['global'],
'high_school_biology': ['biology'],
'high_school_chemistry': ['chemistry'],
'high_school_geography': ['geography'],
'high_school_mathematics': ['math'],
'high_school_physics': ['physics'],
'high_school_politics': ['politics','china specific'],
'human_sexuality': ['other'],
'international_law': ['law'],
'journalism': ['sociology'],
'jurisprudence': ['law'],
'legal_and_moral_basis': ['other'],
'logical': ['philosophy'],
'machine_learning': ['computer science'],
'management': ['business'],
'marketing': ['business'],
'marxist_theory': ['philosophy'],
'modern_chinese': ['linguistics','china specific'],
'nutrition': ['other'],
'philosophy': ['philosophy'],
'professional_accounting': ['business'],
'professional_law': ['law'],
'professional_medicine': ['other'],
'professional_psychology': ['psychology'],
'public_relations': ['politics'],
'security_study': ['politics'],
'sociology': ['culture'],
'sports_science': ['other'],
'traditional_chinese_medicine': ['other','china specific'],
'virology': ['biology'],
'world_history':['history'],
'world_religions': ['global'],
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
"Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
"Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
"Other":["other"],
"China specific": ["china specific"],
'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering', 'statistics'],
'Humanities': ['history', 'philosophy', 'law', 'arts', 'literature', 'global'],
'Social Science': ['linguistics','business', 'politics', 'culture', 'economics', 'geography', 'psychology', 'education', 'sociology'],
'Other':['other'],
'China specific': ['china specific'],
}
category2subject = {}
......
names = [
["1-1", "article_recitation"],
["1-2", "knowledge_question_answering"],
["2-1", "document_proofreading"],
["2-2", "dispute_focus_identification"],
["2-3", "marital_disputes_identification"],
["2-4", "issue_topic_identification"],
["2-5", "reading_comprehension"],
["2-6", "named_entity_recognition"],
["2-7", "opinion_summarization"],
["2-8", "argument_mining"],
["2-9", "event_detection"],
["2-10", "trigger_word_extraction"],
["3-1", "fact_based_article_prediction"],
["3-2", "scene_based_article_prediction"],
["3-3", "charge_prediction"],
["3-4", "prison_term_prediction_wo_article"],
["3-5", "prison_term_prediction_w_article"],
["3-6", "case_analysis"],
["3-7", "criminal_damages_calculation"],
["3-8", "consultation"],
['1-1', 'article_recitation'],
['1-2', 'knowledge_question_answering'],
['2-1', 'document_proofreading'],
['2-2', 'dispute_focus_identification'],
['2-3', 'marital_disputes_identification'],
['2-4', 'issue_topic_identification'],
['2-5', 'reading_comprehension'],
['2-6', 'named_entity_recognition'],
['2-7', 'opinion_summarization'],
['2-8', 'argument_mining'],
['2-9', 'event_detection'],
['2-10', 'trigger_word_extraction'],
['3-1', 'fact_based_article_prediction'],
['3-2', 'scene_based_article_prediction'],
['3-3', 'charge_prediction'],
['3-4', 'prison_term_prediction_wo_article'],
['3-5', 'prison_term_prediction_w_article'],
['3-6', 'case_analysis'],
['3-7', 'criminal_damages_calculation'],
['3-8', 'consultation'],
]
lawbench_summary_groups = []
......
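The code that actually builds lawbench_summary_groups is elided here. Purely as a hedged guess at how the per-task abbreviations that appear later in this diff (e.g. 'lawbench-3-7-criminal_damages_calculation-1-shot') could be derived from the (index, task) pairs above, a hypothetical sketch:

# Hypothetical sketch only -- the real construction lives in the elided part of this file.
pairs = [
    ['3-7', 'criminal_damages_calculation'],
    ['3-8', 'consultation'],
]
one_shot_abbrs = [f'lawbench-{index}-{task}-1-shot' for index, task in pairs]
print(one_shot_abbrs)
# ['lawbench-3-7-criminal_damages_calculation-1-shot', 'lawbench-3-8-consultation-1-shot']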
leval_summary_groups = [
{"name": "leval", "subsets": ["LEval_coursera", "LEval_gsm100", "LEval_quality", "LEval_tpo", "LEval_topic_retrieval", "LEval_financialqa", "LEval_gov_report_summ", "LEval_legal_contract_qa", "LEval_meeting_summ", "LEval_multidocqa", "LEval_narrativeqa", "LEval_nq", "LEval_news_summ", "LEval_paper_assistant", "LEval_patent_summ", "LEval_review_summ", "LEval_scientificqa", "LEval_tvshow_summ"]},
{'name': 'leval', 'subsets': ['LEval_coursera', 'LEval_gsm100', 'LEval_quality', 'LEval_tpo', 'LEval_topic_retrieval', 'LEval_financialqa', 'LEval_gov_report_summ', 'LEval_legal_contract_qa', 'LEval_meeting_summ', 'LEval_multidocqa', 'LEval_narrativeqa', 'LEval_nq', 'LEval_news_summ', 'LEval_paper_assistant', 'LEval_patent_summ', 'LEval_review_summ', 'LEval_scientificqa', 'LEval_tvshow_summ']},
]
len_levels = ["16k", "32k", "64k", "128k", "256k"]
len_levels = ['16k', '32k', '64k', '128k', '256k']
subsets_lveval_loogle_SD_mixup = [
"LVEval_loogle_SD_mixup" + "_" + len_level for len_level in len_levels
'LVEval_loogle_SD_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_cmrc_mixup = [
"LVEval_cmrc_mixup" + "_" + len_level for len_level in len_levels
'LVEval_cmrc_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_multifieldqa_en_mixup = [
"LVEval_multifieldqa_en_mixup" + "_" + len_level
'LVEval_multifieldqa_en_mixup' + '_' + len_level
for len_level in len_levels
]
subsets_lveval_multifieldqa_zh_mixup = [
"LVEval_multifieldqa_zh_mixup" + "_" + len_level
'LVEval_multifieldqa_zh_mixup' + '_' + len_level
for len_level in len_levels
]
subsets_lveval_dureader_mixup = [
"LVEval_dureader_mixup" + "_" + len_level for len_level in len_levels
'LVEval_dureader_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_loogle_CR_mixup = [
"LVEval_loogle_CR_mixup" + "_" + len_level for len_level in len_levels
'LVEval_loogle_CR_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_loogle_MIR_mixup = [
"LVEval_loogle_MIR_mixup" + "_" + len_level for len_level in len_levels
'LVEval_loogle_MIR_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_hotpotwikiqa_mixup = [
"LVEval_hotpotwikiqa_mixup" + "_" + len_level for len_level in len_levels
'LVEval_hotpotwikiqa_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_lic_mixup = [
"LVEval_lic_mixup" + "_" + len_level for len_level in len_levels
'LVEval_lic_mixup' + '_' + len_level for len_level in len_levels
]
subsets_lveval_factrecall_en = [
"LVEval_factrecall_en" + "_" + len_level for len_level in len_levels
'LVEval_factrecall_en' + '_' + len_level for len_level in len_levels
]
subsets_lveval_factrecall_zh = [
"LVEval_factrecall_zh" + "_" + len_level for len_level in len_levels
'LVEval_factrecall_zh' + '_' + len_level for len_level in len_levels
]
subsets_lveval_single_hop_qa = (
@@ -64,47 +64,47 @@ subsets_lveval_qa = (
lveval_summary_groups = [
{
"name": "LVEval_loogle_SD_mixup",
"subsets": subsets_lveval_loogle_SD_mixup,
'name': 'LVEval_loogle_SD_mixup',
'subsets': subsets_lveval_loogle_SD_mixup,
},
{"name": "LVEval_cmrc_mixup", "subsets": subsets_lveval_cmrc_mixup},
{'name': 'LVEval_cmrc_mixup', 'subsets': subsets_lveval_cmrc_mixup},
{
"name": "LVEval_multifieldqa_en_mixup",
"subsets": subsets_lveval_multifieldqa_en_mixup,
'name': 'LVEval_multifieldqa_en_mixup',
'subsets': subsets_lveval_multifieldqa_en_mixup,
},
{
"name": "LVEval_multifieldqa_zh_mixup",
"subsets": subsets_lveval_multifieldqa_zh_mixup,
'name': 'LVEval_multifieldqa_zh_mixup',
'subsets': subsets_lveval_multifieldqa_zh_mixup,
},
{
"name": "LVEval_dureader_mixup",
"subsets": subsets_lveval_dureader_mixup,
'name': 'LVEval_dureader_mixup',
'subsets': subsets_lveval_dureader_mixup,
},
{
"name": "LVEval_loogle_CR_mixup",
"subsets": subsets_lveval_loogle_CR_mixup,
'name': 'LVEval_loogle_CR_mixup',
'subsets': subsets_lveval_loogle_CR_mixup,
},
{
"name": "LVEval_loogle_MIR_mixup",
"subsets": subsets_lveval_loogle_MIR_mixup,
'name': 'LVEval_loogle_MIR_mixup',
'subsets': subsets_lveval_loogle_MIR_mixup,
},
{
"name": "LVEval_hotpotwikiqa_mixup",
"subsets": subsets_lveval_hotpotwikiqa_mixup,
'name': 'LVEval_hotpotwikiqa_mixup',
'subsets': subsets_lveval_hotpotwikiqa_mixup,
},
{"name": "LVEval_lic_mixup", "subsets": subsets_lveval_lic_mixup},
{"name": "LVEval_factrecall_en", "subsets": subsets_lveval_factrecall_en},
{"name": "LVEval_factrecall_zh", "subsets": subsets_lveval_factrecall_zh},
{"name": "LVEval_single_hop_qa", "subsets": subsets_lveval_single_hop_qa},
{'name': 'LVEval_lic_mixup', 'subsets': subsets_lveval_lic_mixup},
{'name': 'LVEval_factrecall_en', 'subsets': subsets_lveval_factrecall_en},
{'name': 'LVEval_factrecall_zh', 'subsets': subsets_lveval_factrecall_zh},
{'name': 'LVEval_single_hop_qa', 'subsets': subsets_lveval_single_hop_qa},
{
"name": "LVEval_single_hop_cqa",
"subsets": subsets_lveval_single_hop_cqa,
'name': 'LVEval_single_hop_cqa',
'subsets': subsets_lveval_single_hop_cqa,
},
{"name": "LVEval_multi_hop_qa", "subsets": subsets_lveval_multi_hop_qa},
{"name": "LVEval_multi_hop_cqa", "subsets": subsets_lveval_multi_hop_cqa},
{'name': 'LVEval_multi_hop_qa', 'subsets': subsets_lveval_multi_hop_qa},
{'name': 'LVEval_multi_hop_cqa', 'subsets': subsets_lveval_multi_hop_cqa},
{
"name": "LVEval_factrecall_cqa",
"subsets": subsets_lveval_factrecall_cqa,
'name': 'LVEval_factrecall_cqa',
'subsets': subsets_lveval_factrecall_cqa,
},
{"name": "LVEval_qa", "subsets": subsets_lveval_qa},
{'name': 'LVEval_qa', 'subsets': subsets_lveval_qa},
]
ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"]
ALL_LANGUAGES = ['bn', 'de', 'en', 'es', 'fr', 'ja', 'ru', 'sw', 'te', 'th', 'zh']
LATIN_LANGUAGES = ['de', 'en', 'es', 'fr', 'sw']
NON_LATIN_LANGUAGES = ['bn', 'ja', 'ru', 'te', 'th', 'zh']
mgsm_summary_groups = [
{'name': 'mgsm_latin', 'subsets': [f'mgsm_{lang}' for lang in LATIN_LANGUAGES]},
......
scibench_summary_groups = []
scibench_tasks = ["atkins", "calculus", "chemmc", "class", "diff", "fund", "matter", "quan", "stat", "thermo"]
for suffix in ["", "_zs-cot", "_fs", "_fs-cot"]:
subsets = [f"scibench-{subset}{suffix}" for subset in scibench_tasks]
scibench_tasks = ['atkins', 'calculus', 'chemmc', 'class', 'diff', 'fund', 'matter', 'quan', 'stat', 'thermo']
for suffix in ['', '_zs-cot', '_fs', '_fs-cot']:
subsets = [f'scibench-{subset}{suffix}' for subset in scibench_tasks]
scibench_summary_groups.append({'name': f'scibench{suffix}', 'subsets': subsets})
@@ -71,4 +71,3 @@ for group in _base_summary_groups:
group['name'] = group['name'] + '_zh'
group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
teval_summary_groups.append(group)
xiezhi_summary_groups = []
_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
_xiezhi = ['xiezhi-spec_eng', 'xiezhi-spec_chn', 'xiezhi-inter_eng', 'xiezhi-inter_chn']
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
@@ -2,7 +2,7 @@ from mmengine.config import read_base
with read_base():
from .groups.infinitebench import infinitebench_summary_groups
summarizer = dict(
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -16,5 +16,5 @@ summarizer = dict(
['sanitized_mbpp', 'score'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []),
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -50,7 +50,7 @@ summarizer = dict(
'lawbench-3-7-criminal_damages_calculation-1-shot',
'lawbench-3-8-consultation-1-shot',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
prompt_db=dict(
database_path='configs/datasets/log.json',
config_dir='configs/datasets',
......
@@ -13,11 +13,11 @@ with read_base():
other_summary_groups = []
other_summary_groups.append({'name': 'Exam', 'subsets': ["ceval",'agieval','mmlu','cmmlu',"GaokaoBench",'ARC-c','ARC-e']})
other_summary_groups.append({'name': 'Exam', 'subsets': ['ceval','agieval','mmlu','cmmlu','GaokaoBench','ARC-c','ARC-e']})
other_summary_groups.append({'name': 'Language', 'subsets': ['WiC','chid-dev','afqmc-dev','WSC','tydiqa-goldp','flores_100']})
other_summary_groups.append({'name': 'Knowledge', 'subsets': ['BoolQ','commonsense_qa','triviaqa','nq']})
other_summary_groups.append({'name': 'Understanding', 'subsets': ['C3','race-middle','race-high','openbookqa_fact','csl_dev','lcsts','Xsum','eprstmt-dev','lambada']})
other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli','ocnli','AX_b','AX_g','RTE','COPA','ReCoRD','hellaswag','piqa','siqa','math','gsm8k','drop','openai_humaneval','mbpp',"bbh"]})
other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli','ocnli','AX_b','AX_g','RTE','COPA','ReCoRD','hellaswag','piqa','siqa','math','gsm8k','drop','openai_humaneval','mbpp','bbh']})
other_summary_groups.append({'name': 'Overall', 'subsets': ['Exam', 'Language', 'Knowledge', 'Understanding', 'Reasoning']})
summarizer = dict(
@@ -30,11 +30,11 @@ summarizer = dict(
'Reasoning',
'--------- 考试 Exam ---------', # category
# 'Mixed', # subcategory
"ceval",
'ceval',
'agieval',
'mmlu',
'cmmlu',
"GaokaoBench",
'GaokaoBench',
'ARC-c',
'ARC-e',
'--------- 语言 Language ---------', # category
@@ -92,8 +92,8 @@ summarizer = dict(
'openai_humaneval',
'mbpp',
# '综合推理', # subcategory
"bbh",
'bbh',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []),
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
@@ -5,110 +5,110 @@ with read_base():
summarizer = dict(
dataset_abbrs=[
"----------------------------------------",
"--------- LVEval All ---------", # category
"----------------------------------------",
"LVEval_qa",
"----------------------------------------",
"--------- LVEval Tasks All ---------", # category
"----------------------------------------",
"LVEval_single_hop_qa",
"LVEval_single_hop_cqa",
"LVEval_multi_hop_qa",
"LVEval_multi_hop_cqa",
"LVEval_factrecall_cqa",
"----------------------------------------",
"--------- LVEval Datasets All ---------", # category
"----------------------------------------",
"LVEval_loogle_SD_mixup",
"LVEval_cmrc_mixup",
"LVEval_multifieldqa_en_mixup",
"LVEval_multifieldqa_zh_mixup",
"LVEval_dureader_mixup",
"LVEval_loogle_CR_mixup",
"LVEval_loogle_MIR_mixup",
"LVEval_hotpotwikiqa_mixup",
"LVEval_lic_mixup",
"LVEval_factrecall_en",
"LVEval_factrecall_zh",
"----------------------------------------",
"--------- LVEval Single_Hop QA ---------", # category
"----------------------------------------",
"LVEval_loogle_SD_mixup_16k",
"LVEval_loogle_SD_mixup_32k",
"LVEval_loogle_SD_mixup_64k",
"LVEval_loogle_SD_mixup_128k",
"LVEval_loogle_SD_mixup_256k",
"----------------------------------------",
"LVEval_cmrc_mixup_16k",
"LVEval_cmrc_mixup_32k",
"LVEval_cmrc_mixup_64k",
"LVEval_cmrc_mixup_128k",
"LVEval_cmrc_mixup_256k",
"----------------------------------------",
"--------- LVEval Single_Hop CQA ---------", # category
"----------------------------------------",
"LVEval_multifieldqa_en_mixup_16k",
"LVEval_multifieldqa_en_mixup_32k",
"LVEval_multifieldqa_en_mixup_64k",
"LVEval_multifieldqa_en_mixup_128k",
"LVEval_multifieldqa_en_mixup_256k",
"----------------------------------------",
"LVEval_multifieldqa_zh_mixup_16k",
"LVEval_multifieldqa_zh_mixup_32k",
"LVEval_multifieldqa_zh_mixup_64k",
"LVEval_multifieldqa_zh_mixup_128k",
"LVEval_multifieldqa_zh_mixup_256k",
"----------------------------------------",
"--------- LVEval Multi_Hop QA ---------", # category
"----------------------------------------",
"LVEval_dureader_mixup_16k",
"LVEval_dureader_mixup_32k",
"LVEval_dureader_mixup_64k",
"LVEval_dureader_mixup_128k",
"LVEval_dureader_mixup_256k",
"----------------------------------------",
"LVEval_loogle_CR_mixup_16k",
"LVEval_loogle_CR_mixup_32k",
"LVEval_loogle_CR_mixup_64k",
"LVEval_loogle_CR_mixup_128k",
"LVEval_loogle_CR_mixup_256k",
"----------------------------------------",
"LVEval_loogle_MIR_mixup_16k",
"LVEval_loogle_MIR_mixup_32k",
"LVEval_loogle_MIR_mixup_64k",
"LVEval_loogle_MIR_mixup_128k",
"LVEval_loogle_MIR_mixup_256k",
"----------------------------------------",
"--------- LVEval Multi_Hop CQA ---------", # category
"----------------------------------------",
"LVEval_hotpotwikiqa_mixup_16k",
"LVEval_hotpotwikiqa_mixup_32k",
"LVEval_hotpotwikiqa_mixup_64k",
"LVEval_hotpotwikiqa_mixup_128k",
"LVEval_hotpotwikiqa_mixup_256k",
"----------------------------------------",
"LVEval_lic_mixup_16k",
"LVEval_lic_mixup_32k",
"LVEval_lic_mixup_64k",
"LVEval_lic_mixup_128k",
"LVEval_lic_mixup_256k",
"----------------------------------------",
"--------- LVEval Factrecall CQA ---------", # category
"----------------------------------------",
"LVEval_factrecall_en_16k",
"LVEval_factrecall_en_32k",
"LVEval_factrecall_en_64k",
"LVEval_factrecall_en_128k",
"LVEval_factrecall_en_256k",
"----------------------------------------",
"LVEval_factrecall_zh_16k",
"LVEval_factrecall_zh_32k",
"LVEval_factrecall_zh_64k",
"LVEval_factrecall_zh_128k",
"LVEval_factrecall_zh_256k",
'----------------------------------------',
'--------- LVEval All ---------', # category
'----------------------------------------',
'LVEval_qa',
'----------------------------------------',
'--------- LVEval Tasks All ---------', # category
'----------------------------------------',
'LVEval_single_hop_qa',
'LVEval_single_hop_cqa',
'LVEval_multi_hop_qa',
'LVEval_multi_hop_cqa',
'LVEval_factrecall_cqa',
'----------------------------------------',
'--------- LVEval Datasets All ---------', # category
'----------------------------------------',
'LVEval_loogle_SD_mixup',
'LVEval_cmrc_mixup',
'LVEval_multifieldqa_en_mixup',
'LVEval_multifieldqa_zh_mixup',
'LVEval_dureader_mixup',
'LVEval_loogle_CR_mixup',
'LVEval_loogle_MIR_mixup',
'LVEval_hotpotwikiqa_mixup',
'LVEval_lic_mixup',
'LVEval_factrecall_en',
'LVEval_factrecall_zh',
'----------------------------------------',
'--------- LVEval Single_Hop QA ---------', # category
'----------------------------------------',
'LVEval_loogle_SD_mixup_16k',
'LVEval_loogle_SD_mixup_32k',
'LVEval_loogle_SD_mixup_64k',
'LVEval_loogle_SD_mixup_128k',
'LVEval_loogle_SD_mixup_256k',
'----------------------------------------',
'LVEval_cmrc_mixup_16k',
'LVEval_cmrc_mixup_32k',
'LVEval_cmrc_mixup_64k',
'LVEval_cmrc_mixup_128k',
'LVEval_cmrc_mixup_256k',
'----------------------------------------',
'--------- LVEval Single_Hop CQA ---------', # category
'----------------------------------------',
'LVEval_multifieldqa_en_mixup_16k',
'LVEval_multifieldqa_en_mixup_32k',
'LVEval_multifieldqa_en_mixup_64k',
'LVEval_multifieldqa_en_mixup_128k',
'LVEval_multifieldqa_en_mixup_256k',
'----------------------------------------',
'LVEval_multifieldqa_zh_mixup_16k',
'LVEval_multifieldqa_zh_mixup_32k',
'LVEval_multifieldqa_zh_mixup_64k',
'LVEval_multifieldqa_zh_mixup_128k',
'LVEval_multifieldqa_zh_mixup_256k',
'----------------------------------------',
'--------- LVEval Multi_Hop QA ---------', # category
'----------------------------------------',
'LVEval_dureader_mixup_16k',
'LVEval_dureader_mixup_32k',
'LVEval_dureader_mixup_64k',
'LVEval_dureader_mixup_128k',
'LVEval_dureader_mixup_256k',
'----------------------------------------',
'LVEval_loogle_CR_mixup_16k',
'LVEval_loogle_CR_mixup_32k',
'LVEval_loogle_CR_mixup_64k',
'LVEval_loogle_CR_mixup_128k',
'LVEval_loogle_CR_mixup_256k',
'----------------------------------------',
'LVEval_loogle_MIR_mixup_16k',
'LVEval_loogle_MIR_mixup_32k',
'LVEval_loogle_MIR_mixup_64k',
'LVEval_loogle_MIR_mixup_128k',
'LVEval_loogle_MIR_mixup_256k',
'----------------------------------------',
'--------- LVEval Multi_Hop CQA ---------', # category
'----------------------------------------',
'LVEval_hotpotwikiqa_mixup_16k',
'LVEval_hotpotwikiqa_mixup_32k',
'LVEval_hotpotwikiqa_mixup_64k',
'LVEval_hotpotwikiqa_mixup_128k',
'LVEval_hotpotwikiqa_mixup_256k',
'----------------------------------------',
'LVEval_lic_mixup_16k',
'LVEval_lic_mixup_32k',
'LVEval_lic_mixup_64k',
'LVEval_lic_mixup_128k',
'LVEval_lic_mixup_256k',
'----------------------------------------',
'--------- LVEval Factrecall CQA ---------', # category
'----------------------------------------',
'LVEval_factrecall_en_16k',
'LVEval_factrecall_en_32k',
'LVEval_factrecall_en_64k',
'LVEval_factrecall_en_128k',
'LVEval_factrecall_en_256k',
'----------------------------------------',
'LVEval_factrecall_zh_16k',
'LVEval_factrecall_zh_32k',
'LVEval_factrecall_zh_64k',
'LVEval_factrecall_zh_128k',
'LVEval_factrecall_zh_256k',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []
[v for k, v in locals().items() if k.endswith('_summary_groups')], []
),
)
@@ -21,5 +21,5 @@ summarizer = dict(
'mathbench-circular-and-cloze-agent',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)
@@ -15,5 +15,5 @@ summarizer = dict(
'mathbench-circular-and-cloze',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)