Unverified commit d34ba111 authored by Fengzhe Zhou, committed by GitHub

[Sync] Merge branch 'dev' into zfz/update-keyset-demo (#876)

parent 32b5948f
@@ -18,30 +18,30 @@ agent_summary_groups = [
 summarizer = dict(
     dataset_abbrs=[
-        'agent',
-        'math_acc_1_and_fill_in_blank-native',
-        'math_perf_4_and_fill_in_blank-native',
-        # '######## MathBench-Agent Accuracy ########', # category
-        'math_acc_1_and_fill_in_blank-agent',
-        'math_perf_4_and_fill_in_blank-agent',
-        # '######## CIBench Template ########', # category
-        'cibench_template:executable',
-        'cibench_template:numeric_correct',
-        'cibench_template:text_score',
-        'cibench_template:vis_sim',
-        # '######## CIBench Template Chinese ########', # category
-        'cibench_template_cn:executable',
-        'cibench_template_cn:numeric_correct',
-        'cibench_template_cn:text_score',
-        'cibench_template_cn:vis_sim',
-        # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
-        'cibench_template_wo_nltk:executable',
-        'cibench_template_wo_nltk:numeric_correct',
-        'cibench_template_wo_nltk:vis_sim',
-        # '######## CIBench Template Chinese w/o NLTK ########', # category
-        'cibench_template_cn_wo_nltk:executable',
-        'cibench_template_cn_wo_nltk:numeric_correct',
-        'cibench_template_cn_wo_nltk:vis_sim',
+        # 'agent',
+        # 'math_acc_1_and_fill_in_blank-native',
+        # 'math_perf_4_and_fill_in_blank-native',
+        # # '######## MathBench-Agent Accuracy ########', # category
+        # 'math_acc_1_and_fill_in_blank-agent',
+        # 'math_perf_4_and_fill_in_blank-agent',
+        # # '######## CIBench Template ########', # category
+        # 'cibench_template:executable',
+        # 'cibench_template:numeric_correct',
+        # 'cibench_template:text_score',
+        # 'cibench_template:vis_sim',
+        # # '######## CIBench Template Chinese ########', # category
+        # 'cibench_template_cn:executable',
+        # 'cibench_template_cn:numeric_correct',
+        # 'cibench_template_cn:text_score',
+        # 'cibench_template_cn:vis_sim',
+        # # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
+        # 'cibench_template_wo_nltk:executable',
+        # 'cibench_template_wo_nltk:numeric_correct',
+        # 'cibench_template_wo_nltk:vis_sim',
+        # # '######## CIBench Template Chinese w/o NLTK ########', # category
+        # 'cibench_template_cn_wo_nltk:executable',
+        # 'cibench_template_cn_wo_nltk:numeric_correct',
+        # 'cibench_template_cn_wo_nltk:vis_sim',
         # '######## T-Eval ########', # category
         ['plugin_eval-p10', 'naive_average'],
         ['plugin_eval-p10-instruct_v1', 'format_metric'],
@@ -68,6 +68,38 @@ summarizer = dict(
         ['plugin_eval-p10-understand_str_v1_zh', 'args'],
         ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'],
         ['plugin_eval-p10-review_str_v1_zh', 'review_quality'],
+        # '######## MUS-T-Eval ########', # category
+        ['plugin_eval-mus-p10', 'naive_average'],
+        ['plugin_eval-mus-p10-instruct_v1', 'format_metric'],
+        ['plugin_eval-mus-p10-instruct_v1', 'args_em_metric'],
+        ['plugin_eval-mus-p10-plan_str_v1', 'f1_score'],
+        ['plugin_eval-mus-p10-plan_json_v1', 'f1_score'],
+        ['plugin_eval-mus-p10-reason_str_v1', 'thought'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'thought'],
+        ['plugin_eval-mus-p10-retrieve_str_v1', 'name'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'name'],
+        ['plugin_eval-mus-p10-understand_str_v1', 'args'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'args'],
+        ['plugin_eval-mus-p10-review_str_v1', 'review_quality'],
+        ['plugin_eval-mus-p10_zh', 'naive_average'],
+        ['plugin_eval-mus-p10-instruct_v1_zh', 'format_metric'],
+        ['plugin_eval-mus-p10-instruct_v1_zh', 'args_em_metric'],
+        ['plugin_eval-mus-p10-plan_str_v1_zh', 'f1_score'],
+        ['plugin_eval-mus-p10-plan_json_v1_zh', 'f1_score'],
+        ['plugin_eval-mus-p10-reason_str_v1_zh', 'thought'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
+        ['plugin_eval-mus-p10-retrieve_str_v1_zh', 'name'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'name'],
+        ['plugin_eval-mus-p10-understand_str_v1_zh', 'args'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'args'],
+        ['plugin_eval-mus-p10-review_str_v1_zh', 'review_quality'],
+        # ['plugin_eval-p10', 'naive_average'],
+        # ['plugin_eval-mus-p10', 'naive_average'],
+        # ['plugin_eval-p10_zh', 'naive_average'],
+        # ['plugin_eval-mus-p10_zh', 'naive_average'],
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
...
 # This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
 compassbench_v1_knowledge_names = [
     'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
-    'compassbench_v1_knowledge-engineering-single_choice_cn_circular',
     'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
     'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
     'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
@@ -19,7 +18,6 @@ summarizer = dict(
         'knowledge_acc_1_and_cloze',
         ['knowledge_cn', 'acc_1'],
         ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
-        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'acc_1'],
         ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
         ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
         ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
@@ -28,7 +26,6 @@ summarizer = dict(
         'knowledge_perf_4_and_cloze',
         ['knowledge_cn', 'perf_4'],
         ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
-        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'perf_4'],
         ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
         ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
         ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
...
@@ -37,8 +37,8 @@ summarizer = dict(
         'language_acc_1_and_non_mcq',
         'language_en_acc_1_and_non_mcq',
         'language_zh_acc_1_and_non_mcq',
-        ['information_retrieval_en', 'score'],
-        ['information_retrieval_zh', 'score'],
+        # ['information_retrieval_en', 'score'],
+        # ['information_retrieval_zh', 'score'],
         ['intention_recognition_en_circular', 'acc_origin'],
         ['intention_recognition_zh_circular', 'acc_origin'],
         ['sentiment_analysis_en_circular', 'acc_origin'],
@@ -54,8 +54,8 @@ summarizer = dict(
         'language_perf_4_and_non_mcq',
         'language_en_perf_4_and_non_mcq',
         'language_zh_perf_4_and_non_mcq',
-        ['information_retrieval_en', 'score'],
-        ['information_retrieval_zh', 'score'],
+        # ['information_retrieval_en', 'score'],
+        # ['information_retrieval_zh', 'score'],
         ['intention_recognition_en_circular', 'perf_circular'],
         ['intention_recognition_zh_circular', 'perf_circular'],
         ['sentiment_analysis_en_circular', 'perf_circular'],
...
 compassbench_v1_reason_groups = [
-    {'name': 'reasonbench_cn_logic_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular', 'reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular', 'reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
-    {'name': 'reasonbench_en_logic_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular', 'reasonbench_en_deductive_bbh7obj_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_deductive_ocnli_translated_circular', 'reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
-    {'name': 'reasonbench', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_logic_circular', 'reasonbench_en_commonsense_circular', 'reasonbench_en_logic_circular']},
+    {'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']},
+    {'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']},
+    {'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']},
+    {'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
+    {'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
+    {'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']},
+    {'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']},
+    {'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']},
 ]
 summarizer = dict(
     dataset_abbrs=[
         ['reasonbench', 'acc_origin'],
+        ['reasonbench_cn_circular', 'acc_origin'],
+        ['reasonbench_en_circular', 'acc_origin'],
         ['reasonbench_cn_commonsense_circular', 'acc_origin'],
+        ['reasonbench_cn_abductive_circular', 'acc_origin'],
+        ['reasonbench_cn_deductive_circular', 'acc_origin'],
+        ['reasonbench_cn_inductive_circular', 'acc_origin'],
         ['reasonbench_en_commonsense_circular', 'acc_origin'],
-        ['reasonbench_cn_logic_circular', 'acc_origin'],
-        ['reasonbench_en_logic_circular', 'acc_origin'],
+        ['reasonbench_en_abductive_circular', 'acc_origin'],
+        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
+        ['reasonbench_en_inductive_circular', 'acc_origin'],
+        ['reasonbench_cn_commonsense_circular', 'acc_origin'],
         ['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
         ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
         ['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
         ['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
         ['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
+        ['reasonbench_en_commonsense_circular', 'acc_origin'],
         ['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
-        ['reasonbench_en_deductive_bbh7obj_circular', 'acc_origin'],
         ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
-        ['reasonbench_en_deductive_ocnli_translated_circular', 'acc_origin'],
         ['reasonbench_en_inductive_deer_circular', 'acc_origin'],
         ['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],
         ['reasonbench', 'perf_circular'],
+        ['reasonbench_cn_circular', 'perf_circular'],
+        ['reasonbench_en_circular', 'perf_circular'],
         ['reasonbench_cn_commonsense_circular', 'perf_circular'],
+        ['reasonbench_cn_abductive_circular', 'perf_circular'],
+        ['reasonbench_cn_deductive_circular', 'perf_circular'],
+        ['reasonbench_cn_inductive_circular', 'perf_circular'],
         ['reasonbench_en_commonsense_circular', 'perf_circular'],
-        ['reasonbench_cn_logic_circular', 'perf_circular'],
-        ['reasonbench_en_logic_circular', 'perf_circular'],
+        ['reasonbench_en_abductive_circular', 'perf_circular'],
+        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
+        ['reasonbench_en_inductive_circular', 'perf_circular'],
+        ['reasonbench_cn_commonsense_circular', 'perf_circular'],
         ['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
         ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
         ['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
         ['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
         ['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
+        ['reasonbench_en_commonsense_circular', 'perf_circular'],
         ['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
-        ['reasonbench_en_deductive_bbh7obj_circular', 'perf_circular'],
         ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
-        ['reasonbench_en_deductive_ocnli_translated_circular', 'perf_circular'],
         ['reasonbench_en_inductive_deer_circular', 'perf_circular'],
         ['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
     ],
...
@@ -17,6 +17,28 @@ _base_summary_groups = [
             ['plugin_eval-instruct_v1', 'json_args_em_metric'],
         ]
     },
+    {
+        'name': 'plugin_eval-instruct_v1',
+        'metric': 'string_metric',
+        'subsets': [
+            ['plugin_eval-instruct_v1', 'string_format_metric'],
+            ['plugin_eval-instruct_v1', 'string_args_em_metric'],
+        ]
+    },
+    {
+        'name': 'plugin_eval-instruct_v1',
+        'metric': 'json_metric',
+        'subsets': [
+            ['plugin_eval-instruct_v1', 'json_format_metric'],
+            ['plugin_eval-instruct_v1', 'json_args_em_metric'],
+        ]
+    },
+    {
+        'name': 'copy_plugin_eval-review_str_v1',
+        'subsets': [
+            ['plugin_eval-review_str_v1', 'review_quality'],
+        ],
+    },
     {
         'name': 'plugin_eval',
         'subsets': [
@@ -31,6 +53,7 @@ _base_summary_groups = [
             ['plugin_eval-understand_str_v1', 'args'],
             ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
             ['plugin_eval-review_str_v1', 'review_quality'],
+            ['copy_plugin_eval-review_str_v1', 'naive_average'], # a hack for review * 2
         ]
     },
 ]
@@ -62,3 +85,17 @@ for group in _base_summary_groups:
     group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-p10') + '_zh'
     group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-p10') + '_zh', subset[1]] for subset in group['subsets']]
     plugineval_summary_groups.append(group)
+
+# base -mus-p10-
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-mus-p10')
+    group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-mus-p10'), subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
+
+# base -mus-p10- _zh
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh'
+    group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh', subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
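The loops above (like the `-p10`/`_zh` ones before them) only deep-copy `_base_summary_groups` and rewrite the dataset abbreviations. A standalone sketch of what one pass produces, using the `instruct_v1` group copied from the base definitions; nothing here is new configuration:

```python
# Standalone sketch of the renaming done by the "-mus-p10- _zh" loop above;
# the group literal is copied from _base_summary_groups.
from copy import deepcopy

base_group = {
    'name': 'plugin_eval-instruct_v1',
    'metric': 'format_metric',
    'subsets': [
        ['plugin_eval-instruct_v1', 'string_format_metric'],
        ['plugin_eval-instruct_v1', 'json_format_metric'],
    ],
}

group = deepcopy(base_group)
group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh'
group['subsets'] = [[s[0].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh', s[1]]
                    for s in group['subsets']]
print(group['name'])         # plugin_eval-mus-p10-instruct_v1_zh
print(group['subsets'][0])   # ['plugin_eval-mus-p10-instruct_v1_zh', 'string_format_metric']
```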
from copy import deepcopy
_base_summary_groups = [
{
'name': 'teval-instruct_v1',
'metric': 'format_metric',
'subsets': [
['teval-instruct_v1', 'string_format_metric'],
['teval-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'teval-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['teval-instruct_v1', 'string_args_em_metric'],
['teval-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'teval-instruct_v1',
'metric': 'string_metric',
'subsets': [
['teval-instruct_v1', 'string_format_metric'],
['teval-instruct_v1', 'string_args_em_metric'],
]
},
{
'name': 'teval-instruct_v1',
'metric': 'json_metric',
'subsets': [
['teval-instruct_v1', 'json_format_metric'],
['teval-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'copy_teval-review_str_v1',
'subsets': [
['teval-review_str_v1', 'review_quality'],
],
},
{
'name': 'teval',
'subsets': [
['teval-instruct_v1', 'format_metric'],
['teval-instruct_v1', 'args_em_metric'],
['teval-plan_str_v1', 'f1_score'],
['teval-plan_json_v1', 'f1_score'],
['teval-reason_str_v1', 'thought'],
['teval-reason_retrieve_understand_json_v1', 'thought'],
['teval-retrieve_str_v1', 'name'],
['teval-reason_retrieve_understand_json_v1', 'name'],
['teval-understand_str_v1', 'args'],
['teval-reason_retrieve_understand_json_v1', 'args'],
['teval-review_str_v1', 'review_quality'],
['copy_teval-review_str_v1', 'naive_average'], # a hack for review * 2
]
},
]
teval_summary_groups = []
# base
for group in _base_summary_groups:
group = deepcopy(group)
teval_summary_groups.append(group)
# base _zh
for group in _base_summary_groups:
group = deepcopy(group)
group['name'] = group['name'] + '_zh'
group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
teval_summary_groups.append(group)
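The `copy_teval-review_str_v1` group exists only so that `review_quality` is counted twice when the top-level `teval` group is averaged, which is what the `# a hack for review * 2` comment refers to. A toy illustration, assuming `naive_average` gives every subset entry equal weight; the scores are made up:

```python
# Toy illustration of the "review * 2" hack, assuming equal-weight averaging
# over subset entries; the three scores are made up.
scores = {'format_metric': 80.0, 'args_em_metric': 70.0, 'review_quality': 60.0}

without_copy = sum(scores.values()) / 3                    # review counts once -> 70.0
with_copy = (scores['format_metric'] + scores['args_em_metric']
             + 2 * scores['review_quality']) / 4           # review counts twice -> 67.5
print(without_copy, with_copy)
```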
@@ -12,8 +12,22 @@ with read_base():
     from .groups.xiezhi import xiezhi_summary_groups
 
+other_summary_groups = []
+other_summary_groups.append({'name': 'Exam', 'subsets': ["ceval",'agieval','mmlu','cmmlu',"GaokaoBench",'ARC-c','ARC-e']})
+other_summary_groups.append({'name': 'Language', 'subsets': ['WiC','chid-dev','afqmc-dev','WSC','tydiqa-goldp','flores_100']})
+other_summary_groups.append({'name': 'Knowledge', 'subsets': ['BoolQ','commonsense_qa','triviaqa','nq']})
+other_summary_groups.append({'name': 'Understanding', 'subsets': ['C3','race-middle','race-high','openbookqa_fact','csl_dev','lcsts','Xsum','eprstmt-dev','lambada']})
+other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli','ocnli','AX_b','AX_g','RTE','COPA','ReCoRD','hellaswag','piqa','siqa','math','gsm8k','drop','openai_humaneval','mbpp',"bbh"]})
+other_summary_groups.append({'name': 'Overall', 'subsets': ['Exam', 'Language', 'Knowledge', 'Understanding', 'Reasoning']})
+
 summarizer = dict(
     dataset_abbrs=[
+        'Overall',
+        'Exam',
+        'Language',
+        'Knowledge',
+        'Understanding',
+        'Reasoning',
         '--------- 考试 Exam ---------', # category
         # 'Mixed', # subcategory
         "ceval",
...
summarizer = dict(
dataset_abbrs=[
'######## MathBench Accuracy ########', # category
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-single_choice_en', 'acc_1'],
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
['mathbench-middle-single_choice_cn', 'acc_1'],
['mathbench-middle-single_choice_en', 'acc_1'],
['mathbench-primary-cloze_cn', 'accuracy'],
['mathbench-primary-cloze_en', 'accuracy'],
['mathbench-calculate-cloze_en', 'accuracy'],
'######## MathBench CircularEval ########', # category
['mathbench-college-single_choice_cn', 'perf_4'],
['mathbench-college-single_choice_en', 'perf_4'],
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
['mathbench-middle-single_choice_cn', 'perf_4'],
['mathbench-middle-single_choice_en', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
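The `summary_groups=sum(...)` line is the collection idiom used throughout these summarizer configs: every module-level variable whose name ends in `_summary_groups` is concatenated into a single list. A minimal sketch with two made-up lists:

```python
# Minimal sketch of the locals()-based collection idiom; both lists are made up.
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['a', 'b']}]
extra_summary_groups = [{'name': 'extra', 'subsets': ['c']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
print([g['name'] for g in summary_groups])  # ['mathbench', 'extra']
```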
from mmengine.config import read_base
with read_base():
from .groups.plugineval import plugineval_summary_groups
summarizer = dict(
dataset_abbrs=[
['plugin_eval', 'naive_average'],
['plugin_eval-instruct_v1', 'string_metric'],  # instruction following (string format)
['plugin_eval-instruct_v1', 'json_metric'],  # instruction following (JSON format)
['plugin_eval-plan_str_v1', 'f1_score'],  # planning (string format)
['plugin_eval-plan_json_v1', 'f1_score'],  # planning (JSON format)
['plugin_eval-reason_str_v1', 'thought'],  # reasoning (string format)
['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],  # reasoning (JSON format)
['plugin_eval-retrieve_str_v1', 'name'],  # retrieval (string format)
['plugin_eval-reason_retrieve_understand_json_v1', 'name'],  # retrieval (JSON format)
['plugin_eval-understand_str_v1', 'args'],  # understanding (string format)
['plugin_eval-reason_retrieve_understand_json_v1', 'args'],  # understanding (JSON format)
['plugin_eval-review_str_v1', 'review_quality'],  # review (string format)
['plugin_eval_zh', 'naive_average'],
['plugin_eval-instruct_v1_zh', 'string_metric'],
['plugin_eval-instruct_v1_zh', 'json_metric'],
['plugin_eval-plan_str_v1_zh', 'f1_score'],
['plugin_eval-plan_json_v1_zh', 'f1_score'],
['plugin_eval-reason_str_v1_zh', 'thought'],
['plugin_eval-reason_retrieve_understand_json_v1_zh', 'thought'],
['plugin_eval-retrieve_str_v1_zh', 'name'],
['plugin_eval-reason_retrieve_understand_json_v1_zh', 'name'],
['plugin_eval-understand_str_v1_zh', 'args'],
['plugin_eval-reason_retrieve_understand_json_v1_zh', 'args'],
['plugin_eval-review_str_v1_zh', 'review_quality'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
from mmengine.config import read_base
with read_base():
from .groups.teval import teval_summary_groups
summarizer = dict(
dataset_abbrs=[
['teval', 'naive_average'],
['teval-instruct_v1', 'string_metric'],  # instruction following (string format)
['teval-instruct_v1', 'json_metric'],  # instruction following (JSON format)
['teval-plan_str_v1', 'f1_score'],  # planning (string format)
['teval-plan_json_v1', 'f1_score'],  # planning (JSON format)
['teval-reason_str_v1', 'thought'],  # reasoning (string format)
['teval-reason_retrieve_understand_json_v1', 'thought'],  # reasoning (JSON format)
['teval-retrieve_str_v1', 'name'],  # retrieval (string format)
['teval-reason_retrieve_understand_json_v1', 'name'],  # retrieval (JSON format)
['teval-understand_str_v1', 'args'],  # understanding (string format)
['teval-reason_retrieve_understand_json_v1', 'args'],  # understanding (JSON format)
['teval-review_str_v1', 'review_quality'],  # review (string format)
['teval_zh', 'naive_average'],
['teval-instruct_v1_zh', 'string_metric'],
['teval-instruct_v1_zh', 'json_metric'],
['teval-plan_str_v1_zh', 'f1_score'],
['teval-plan_json_v1_zh', 'f1_score'],
['teval-reason_str_v1_zh', 'thought'],
['teval-reason_retrieve_understand_json_v1_zh', 'thought'],
['teval-retrieve_str_v1_zh', 'name'],
['teval-reason_retrieve_understand_json_v1_zh', 'name'],
['teval-understand_str_v1_zh', 'args'],
['teval-reason_retrieve_understand_json_v1_zh', 'args'],
['teval-review_str_v1_zh', 'review_quality'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
@@ -91,6 +91,7 @@ from .summedits import * # noqa: F401, F403
 from .summscreen import * # noqa: F401, F403
 from .svamp import * # noqa: F401, F403
 from .tabmwp import * # noqa: F401, F403
+from .teval import * # noqa: F401, F403
 from .TheoremQA import * # noqa: F401, F403
 from .tnews import * # noqa: F401, F403
 from .triviaqa import * # noqa: F401, F403
...
@@ -33,7 +33,7 @@ def gsm8k_dataset_postprocess(text: str) -> str:
 @TEXT_POSTPROCESSORS.register_module('gsm8k')
 def gsm8k_postprocess(text: str) -> str:
-    text = text.split('\n\n')[0]
+    text = text.split('Question:')[0]
     text = text.split(' ')[::-1]
     flag = False
     ret = ''
...
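The only change above is the first split: the extractor now cuts the completion at the next `Question:` marker instead of at the first blank line, presumably so that few-shot continuations are dropped even when a model runs on without emitting a blank line. A made-up completion showing the difference:

```python
# Made-up completion illustrating the changed first line of gsm8k_postprocess.
completion = ("16 - 3 - 4 = 9 eggs, so she makes 9 * 2 = 18 dollars. The answer is 18. "
              "Question: a follow-up question the model invented ...")
print(completion.split('\n\n')[0])        # old behaviour: the invented question survives
print(completion.split('Question:')[0])   # new behaviour: everything after "Question:" is dropped
```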
@@ -263,9 +263,12 @@ class MBPPEvaluator(BaseEvaluator):
     def _process_answer(self, text):
         try:
             # for chatGLM related text
-            text = eval(text)
+            eval_text = eval(text)
         except Exception:
             pass
+        else:
+            if isinstance(eval_text, str):
+                text = eval_text
         # deal with code block
         if '```' in text:
             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
...
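The change keeps the result of `eval(text)` only when it is still a string: the old code could silently replace `text` with an int, list, or other object whenever a prediction happened to be a valid Python expression, which then breaks the string handling that follows. A small standalone sketch of the guarded logic (the `unwrap` helper name and the inputs are just for illustration):

```python
# Sketch of the guarded eval from the new MBPP _process_answer lines; inputs are made up.
def unwrap(text):
    try:
        eval_text = eval(text)          # chatGLM-style outputs sometimes quote the whole answer
    except Exception:
        pass
    else:
        if isinstance(eval_text, str):  # only keep the result if it is still a string
            text = eval_text
    return text

print(unwrap("'def add(a, b): return a + b'"))  # extra quotes stripped
print(unwrap("3 + 4"))                           # stays '3 + 4' instead of becoming int 7
```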
import json
import os.path as osp
from typing import Dict, Optional
import mmengine
from datasets import Dataset, DatasetDict
from opencompass.registry import TEXT_POSTPROCESSORS
from ..base import BaseDataset
class TEvalDataset(BaseDataset):
def __init__(self, reader_cfg: Optional[Dict] = {}, **kwargs):
super().__init__(reader_cfg=reader_cfg, **kwargs)
def load(self, path: str, name: str):
dataset = DatasetDict()
data = mmengine.load(osp.join(path, f'{name}.json'))
raw_data = []
for i in data.keys():
origin_prompt = data[i]['origin_prompt']
if isinstance(origin_prompt, str):
origin_prompt = json.loads(origin_prompt)
# Aligning the default roles of opencompass
prompt = origin_prompt + [
dict(role='assistant',
content=str(data[i].get('ground_truth')))
]
raw_data.append({
'prompt': prompt,
'ground_truth': json.dumps(data[i])
})
dataset['test'] = Dataset.from_list(raw_data)
dataset['train'] = Dataset.from_list(raw_data)
return dataset
@TEXT_POSTPROCESSORS.register_module('teval')
def teval_postprocess(text: str) -> str:
if isinstance(text, str):
text = text.split('<eoa>')[0]
text = text.split('<TOKENS_UNUSED_1>')[0]
text = text.split('<|im_end|>')[0]
text = text.split('\nuser')[0]
text = text.split('\nUSER')[0]
text = text.split('[INST]')[0]
text = text.strip()
if text.startswith('```json'):
text = text[len('```json'):]
text = text.strip('`').strip()
if text[:2] == '{{' and text[-2:] == '}}':
text = text[1:-1]
return str(text)
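A quick example of what `teval_postprocess` strips from a raw completion. The import path is an assumption inferred from the `from .teval import *` line added to `opencompass/datasets/__init__.py` in this commit:

```python
# Example input/output for teval_postprocess; the import path is an assumption
# based on the datasets __init__ change in this commit.
from opencompass.datasets.teval import teval_postprocess

raw = ('```json\n'
       '{"thought": "look up the weather", "name": "weather_api"}\n'
       '```<|im_end|>\nuser please continue')
print(teval_postprocess(raw))
# {"thought": "look up the weather", "name": "weather_api"}
```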
from .instruct_evaluator import InstructEvaluator
from .planning_evaluator import PlanningEvaluator
from .review_evaluator import ReviewEvaluator
from .reason_retrieve_understand_evaluator import ReasonRetrieveUnderstandEvaluator
__all__ = ['InstructEvaluator', 'PlanningEvaluator', 'ReviewEvaluator', 'ReasonRetrieveUnderstandEvaluator']
from collections import defaultdict
from mmengine import load
from ..utils.template import parse_string
from ..utils.format_load import format_load
from ..schema import ResponseDataSample
import ast
import numpy as np
class InstructEvaluator:
"""Instruct Following Evaluation
Args:
dataset_path(str): File path of evaluation dataset.
"""
def __init__(
self,
dataset_path: str,
**kwargs,
) -> None:
self.dataset_path = dataset_path
def _load_dataset(self):
self.dataset = []
dataset = load(self.dataset_path)
for key in dataset.keys():
datum = dataset[key]
data_sample = self._process_response(datum)
self.dataset.append(
dict(
origin_prompt=datum["origin_prompt"],
response_data_sample=data_sample))
self.num_samples = len(self.dataset)
def _process_response(
self,
datum: dict,
) -> ResponseDataSample:
"""Process the response to needed format.
Args:
datum(dict): inputs.
Returns:
dict: Processed response data sample.
"""
# Dict with keyword-only arguments.
template = datum['template']
# Generated response.
pred_data = datum['prediction']
# Response of ground truth.
gt_data = datum['ground_truth']
meta_data = datum['meta_data']
return ResponseDataSample(
template=template, pred=pred_data, gt=gt_data, meta_data=meta_data)
def _evaluate(self, data_sample: dict) -> dict:
metrics_result = dict()
response_format = data_sample.meta_data['response_format']
if response_format == 'json':
pred_data = self.json_format_parse(data_sample)
else:
pred_data = self.string_format_parse(data_sample)
if pred_data is None:
# directly set to 0 for all metrics
metrics_result[f'{response_format}_format_metric'] = 0
metrics_result[f'{response_format}_args_em_metric'] = 0
return metrics_result
# Exact matching
metrics_result[f'{response_format}_format_metric'] = 1
metrics_result[f'{response_format}_args_em_metric'] = self.compute_args_em_metric(
gt_action=data_sample.gt['action'], pred_action=pred_data['action'],
gt_args=data_sample.gt['args'], pred_args=pred_data['args']
)
return metrics_result
def compute_args_em_metric(self, gt_action, pred_action, gt_args, pred_args):
cnt = 0.
if gt_action == pred_action:
cnt += 1.
num_args = len(gt_args) + 1 # 1 means action name match
for gt_key in gt_args:
pred_val = pred_args.get(gt_key, "")
if pred_val == gt_args[gt_key]:
cnt += 1.
return cnt / num_args
def string_format_parse(self, data_sample):
pred_data = data_sample.pred
template = data_sample.template
thought_start = template['thought_start']
thought_end = template['thought_end']
action_start = template['action_start']
action_end = template['action_end']
args_start = template['args_start']
args_end = template['args_end']
parse_template = thought_start + "{thought}" + thought_end \
+ action_start + "{action}" + action_end \
+ args_start + "{args}" + args_end
res = parse_string(parse_template, pred_data, allow_newline=True)
try:
if res is not None:
args = ast.literal_eval(res['args'].strip())
res['args'] = args if isinstance(args, dict) else {}
res['action'] = res['action'].strip()
return res
except Exception:
return dict(thought=res['thought'], action=res['action'].strip(), args=dict())
def json_format_parse(self, data_sample):
try:
pred_data = format_load(data_sample.pred)
template = data_sample.template
new_data = dict()
new_data['thought'] = pred_data[template['thought']]
new_data['action'] = pred_data[template['action']]
args = pred_data[template['args']]
new_data['args'] = args if isinstance(args, dict) else {}
except Exception as e:
return None
return new_data
def evaluate(self):
self._load_dataset()
results_list = []
for data_sample in self.dataset:
metrics_result = self._evaluate(data_sample['response_data_sample'])
results_list.append(metrics_result)
return self._post_process(results_list)
def _post_process(self, results_list):
# list of dict to dict of list
results_dict = defaultdict(list)
for sub in results_list:
    for key in sub:
        results_dict[key].append(sub[key])
metric_list = ['json_format_metric', 'json_args_em_metric',
'string_format_metric', 'string_args_em_metric']
for metric in metric_list:
results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4)
return results_dict
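A worked example of `compute_args_em_metric` above: one point for the action name plus one point per ground-truth argument reproduced exactly, normalised by `len(gt_args) + 1`. The action and arguments below are made up:

```python
# Worked example of compute_args_em_metric; action and arguments are made up.
gt_action, pred_action = 'weather_api.query', 'weather_api.query'
gt_args = {'city': 'Berlin', 'unit': 'celsius'}
pred_args = {'city': 'Berlin', 'unit': 'fahrenheit'}

cnt = 0.0
if gt_action == pred_action:
    cnt += 1.0                          # the action name matches
for key, value in gt_args.items():
    if pred_args.get(key, "") == value:
        cnt += 1.0                      # only 'city' matches exactly
print(cnt / (len(gt_args) + 1))         # 2.0 / 3 = 0.666...
```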
from numpy import mean
from mmengine import load
from ..utils.format_load import format_load
import itertools
import networkx as nx
import numpy as np
import copy
import re
from tqdm import tqdm
from ..schema import ResponseDataSample
from sentence_transformers import SentenceTransformer, util
class PlanningEvaluator:
"""Planning Evaluation
Args:
dataset_path(str): File path of evaluation dataset
name_weight(float): the weight of action_name in bert_score match, default = 0.9
args_weight(float): the weight of action_args in bert_score match, default = 0.1
match_threshold(float): the threshold of matching
match_strategy(str): matching method, can choose 'bertscore' or 'permutation'
bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2".
Refer to https://www.sbert.net/docs/pretrained_models.html for more models.
"""
def __init__(
self,
dataset_path: str,
name_weight = 0.75,
args_weight = 0.25,
match_threshold = 0.7,
match_strategy: str = 'bertscore', # ["bertscore", "permutation"]
bert_score_model: str = "all-mpnet-base-v2", # ['thenlper/gte-large-zh', 'all-mpnet-base-v2']
default_prompt_type: str = 'json', # ["json", "ReWOO"]
**kwargs,
) -> None:
self.bert_score_model = bert_score_model
print(bert_score_model)
self.dataset_path = dataset_path
self.name_weight = name_weight
self.args_weight = args_weight
self.match_threshold = match_threshold
self.default_prompt_type = default_prompt_type # ["json", "ReWOO"]
assert match_strategy in ["bertscore", "permutation"], f"match strategy must in [\"bertscore\", \"permutation\"], but get {match_strategy}"
self.match_strategy = match_strategy
self.valid_data_count = None
self.sentence_model = SentenceTransformer(self.bert_score_model)
def _load_dataset(self):
self.dataset = []
dataset = load(self.dataset_path)
total_error = 0
total_count = 0
for key in dataset.keys():
datum = dataset[key]
data_sample, error = self._process_response(datum)
total_error += error
total_count += 1
self.dataset.append(
dict(response_data_sample=data_sample))
self.num_samples = len(self.dataset)
print("total_data_count:", total_count, "valid_data_count:", total_count - total_error)
self.valid_data_count = total_count - total_error
def format_load(self, data):
r'''
ensure evaluator can work correctly under any data input
'''
try:
json_format = format_load(data, start_character='[', end_character=']')
except Exception as e:
return []
if type(json_format) != list:
return []
for i in range(len(json_format)):
try:
json_format[i] = {
'name': str(json_format[i]['name']),
'id': int(json_format[i]['id']),
'args': str(json_format[i]['args'])
}
except Exception as e:
return []
return json_format
def _process_response(
self,
datum,
) -> ResponseDataSample:
"""Process the response to needed format.
Args:
datum(dict): inputs.
Returns:
dict: Processed response data sample.
"""
# Generated response, which can be a string or list
pred_data = datum['prediction']
# Response of ground truth, which can be a string or list
gt_data = datum['ground_truth']
# prompt_type: The type of planning prompt, supporting "json" and "ReWOO"
if "meta" in datum:
prompt_type = datum["meta"].get("prompt_type", self.default_prompt_type)
else:
prompt_type = self.default_prompt_type
error = 0
pred = dict()
gt = dict()
gt['planning'] = self.format_load(gt_data)
if prompt_type == 'json':
pred['planning'] = self.format_load(pred_data)
if pred['planning'] == [] or gt['planning'] == []:
error = 1
elif prompt_type == 'ReWOO':
"""
This type is deprecated
The planning prediction data should in this format:
Plan 1: <str> description about the first action
Dependency 1: <list[number]> the first action depends on which previous actions
Action 1: #E1 = api_name1(args1)
...
Which will be passed only if "number of plan lines == number of dependency lines == number of action lines"
The passed data's format is:
[
dict(
id = i,
name = curr_name,
args = args_str
)
...
]
The golden answer prediction is a json that is the same as the json format.
"""
thoughts = re.findall(r'(Plan [0-9]+: .+)', pred_data)
dependencies = re.findall(r'(Dependency [0-9]+: .+)', pred_data)
action_units = re.findall(r'Action [0-9]+: (.+)', pred_data)
if not (len(thoughts) == len(dependencies) and len(thoughts) == len(action_units)):
pred['planning'] = []
gt['planning'] = []
return ResponseDataSample(template = '', pred=pred, gt=gt), 1
plan_action = []
for i in range(len(action_units)):
dependency_list = re.findall(r'Dependency [0-9]+: (.+)', dependencies[i])
if action_units[i][0] == '#':
# The action has a return #E
args_str_list = re.findall(r'#E[0-9]+ = .+\((.+)\)', action_units[i])
name_list = re.findall(r'#E[0-9]+ = (.+)\(', action_units[i])
else:
# The action does not have a return
args_str_list = re.findall(r'.+\((.+)\)', action_units[i])
name_list = re.findall(r'(.+)\(', action_units[i])
if (len(name_list) > 0):
curr_name = name_list[0]
else:
curr_name = ""
if (len(args_str_list) > 0):
args_str = "{" + args_str_list[0] + "}"
else:
args_str = "{}"
if (len(dependency_list) > 0):
dependency_str = dependency_list[0]
else:
dependency_str = ""
dependency = re.findall('([0-9]+)', dependency_str)
dependency = list(set([int(x) - 1 for x in dependency]))
plan_action.append(
dict(
id = i,
name = curr_name,
prev = dependency,
args = args_str
))
pred['planning'] = plan_action
#Turn dict into args str
for i in range(len(gt['planning'])):
args_str = ""
if type(gt['planning'][i]['args']) == str:
args_dict = eval(gt['planning'][i]['args'])
else:
assert type(gt['planning'][i]['args']) == dict
args_dict = gt['planning'][i]['args']
for it in args_dict:
if args_str == "": args_str += f"{it}=\"{args_dict[it]}\""
else: args_str += f", {it}=\"{args_dict[it]}\""
gt['planning'][i]['args'] = '{' + args_str + '}'
elif prompt_type == 'str':
pred_data_format = pred_data.replace('. ', '\n').split('\n')
pred_actions = []
for pred_step in pred_data_format:
first_occur_time = 1e9
pred_action = ""
for api_name in datum['meta']['API_list']:
occur_time = pred_step.find(api_name)
if occur_time != -1 and occur_time < first_occur_time:
first_occur_time = occur_time
pred_action = api_name
if pred_action != "":
pred_actions.append({
'id': len(pred_actions),
'name': pred_action,
'args': pred_step
})
pred['planning'] = pred_actions
if len(pred['planning']) == 0:
error = 1
else:
raise NotImplementedError(f"Currently, we only support json and ReWOO format, but get {prompt_type}")
return ResponseDataSample(template = '', pred=pred, gt=gt), error
def _evaluate(self, data_sample) -> dict:
if self.match_strategy == 'bertscore':
metrics_result = self.bertscore_match(
data_sample.pred['planning'], data_sample.gt['planning'])
elif self.match_strategy == 'permutation':
metrics_result = self.permutation_match(
data_sample.pred['planning'], data_sample.gt['planning'])
else:
raise NotImplementedError
if len(data_sample.pred['planning']) == 0 or len(data_sample.gt['planning']) == 0:
metrics_result['parse_rate'] = 0
else:
metrics_result['parse_rate'] = 1
return metrics_result
def evaluate(self):
self._load_dataset()
results_list = []
for data_sample in tqdm(self.dataset):
metrics_result = self._evaluate(
data_sample['response_data_sample'])
results_list.append(metrics_result)
return self._post_process(results_list)
def permutation_match(self, pred_plan, gt_plan) -> dict:
'''
The function calculates all the permutation matches' score and selects the max f1_score;
Since permutation is time consuming, we truncate the length of plans to 9
'''
if pred_plan[-1]['name'] != 'FinishAction':
pred_plan.append(
{'id': len(pred_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'}
)
if gt_plan[-1]['name'] != 'FinishAction':
gt_plan.append(
{'id': len(gt_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'}
)
# truncate plans to 9 since it is too long for permutation.
if len(pred_plan) > 9: pred_plan = pred_plan[:9]
if len(gt_plan) > 9: gt_plan = gt_plan[:9]
pred_plan = sorted(pred_plan, key=lambda x: x['id'])
gt_plan = sorted(gt_plan, key=lambda x: x['id'])
len_pred = len(pred_plan)
len_gt = len(gt_plan)
map_id_max = max(len_pred, len_gt)
numbers = [i for i in range(map_id_max)]
perms = itertools.permutations(numbers, len_pred)
gt_prev_count, pred_prev_count = 0, 0
for i in range(len_gt):
gt_plan[i]['prev'].append(i)
gt_prev_count += len(gt_plan[i]['prev'])
for i in range(len_pred):
pred_plan[i]['prev'].append(i)
pred_prev_count += len(pred_plan[i]['prev'])
if gt_prev_count == 0 or pred_prev_count == 0:
return {
'precision': 0,
'recall': 0,
'f1_score': 0
}
max_recall, max_precision, max_f1 = 0, 0, 0
for perm in perms:
correct_count = 0
for i in range(len_pred):
if perm[i] >= len_gt:
continue
for j in pred_plan[i]['prev']:
if perm[j] in gt_plan[perm[i]]['prev']:
correct_count += 1
now_recall, now_precision = correct_count / gt_prev_count, correct_count / pred_prev_count
if now_recall + now_precision == 0:
continue
now_f1 = 2 * now_recall * now_precision / (now_recall + now_precision)
if now_f1 > max_f1:
max_f1, max_recall, max_precision = now_f1, now_recall, now_precision
return {
'precision': max_precision,
'recall': max_recall,
'f1_score': max_f1
}
def bertscore_match(self, pred_plan, gt_plan) -> dict:
"""
Calculate the similarity between predicted plan and golden answer,
A plan can be regarded a sequence of actions, and each action has a name and args.
Firstly, use bertscore to calculate pointwise similarity by:
similarity(u, v) = bertscore(u.name, v.name) * name_weight + bertscore(u.args, v.args) * args_weight;
Secondly, use Hungarian matching to match the points;
Finally, use LIS to calculate the number of matched nodes.
"""
if len(pred_plan) == 0 or len(gt_plan) == 0:
return {
'precision': 0,
'recall': 0,
'f1_score': 0
}
pred_plan = copy.deepcopy(sorted(pred_plan, key=lambda x: x['id']))
gt_plan = copy.deepcopy(sorted(gt_plan, key=lambda x: x['id']))
#Add end action
#Currently it is hard-code
if pred_plan[-1]['name'] == 'FinishAction':
pred_plan = pred_plan[:-1]
if gt_plan[-1]['name'] == 'FinishAction':
gt_plan = gt_plan[:-1]
#The total counts of nodes and edges.
len_pred = len(pred_plan)
len_gt = len(gt_plan)
bert_score_matrix = np.zeros((len_pred, len_gt))
name_pred, args_pred = [], []
name_gt, args_gt = [], []
for i in range(len_pred):
name_pred.append(pred_plan[i]['name'])
args_pred.append(str(pred_plan[i]['args']))
for i in range(len_gt):
name_gt.append(gt_plan[i]['name'])
args_gt.append(str(gt_plan[i]['args']))
name_pred_emb = self.sentence_model.encode(name_pred, convert_to_tensor=True)
name_gt_emb = self.sentence_model.encode(name_gt, convert_to_tensor=True)
args_pred_emb = self.sentence_model.encode(args_pred, convert_to_tensor=True)
args_gt_emb = self.sentence_model.encode(args_gt, convert_to_tensor=True)
name_cosine_scores = np.maximum(util.cos_sim(name_pred_emb, name_gt_emb).cpu().numpy(), 0)
args_cosine_scores = np.maximum(util.cos_sim(args_pred_emb, args_gt_emb).cpu().numpy(), 0)
for i in range(len_pred):
for j in range(len_gt):
bert_score_matrix[i][j] = \
name_cosine_scores[i][j] * self.name_weight \
+ args_cosine_scores[i][j] * self.args_weight
G = nx.Graph()
for i in range(len_pred):
for j in range(len_gt):
if bert_score_matrix[i][j] > self.match_threshold:
G.add_edge(i, str(j), weight=bert_score_matrix[i][j])
max_weight_matching = nx.max_weight_matching(G)
pred_to_gt_mapping = dict()
for key in max_weight_matching:
if type(key[0]) == int:
pred_to_gt_mapping[int(key[0])] = int(key[1])
else:
pred_to_gt_mapping[int(key[1])] = int(key[0])
#If a prediction node does not match any golden answer node, we mark the node as -1.
for i in range(len_pred):
if i not in pred_to_gt_mapping:
pred_to_gt_mapping[i] = -1
#Calculate how many nodes are matched by Longest Increasing Subsequence (LIS)
dp = np.ones(len_pred)
for i in range(len_pred):
for j in range(i):
if pred_to_gt_mapping[i] == -1 or pred_to_gt_mapping[j] == -1:
continue
if pred_to_gt_mapping[i] > pred_to_gt_mapping[j]:
dp[i] = max(dp[i], dp[j] + 1)
correct_count = int(max(dp))
recall, precision = correct_count / len(gt_plan), correct_count / len(pred_plan)
f1_score = 2 * recall * precision / (recall + precision)
result = {
'precision': precision,
'recall': recall,
'f1_score': f1_score
}
return result
def _post_process(self, results_list):
# list of dict to dict of list
results = dict()
planning_metric_keys = ["precision", "recall", "f1_score", 'parse_rate']
for key in planning_metric_keys:
results[key] = mean([result[key] for result in results_list])
return results
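The last step of `bertscore_match` above is easy to miss: after the graph matching, `pred_to_gt_mapping` sends each predicted action to a gold index (or -1 if unmatched), and a longest-increasing-subsequence DP counts how many matched actions also appear in the right order. A standalone sketch of just that DP, with a made-up mapping:

```python
# Standalone sketch of the LIS step at the end of bertscore_match; the mapping is made up.
import numpy as np

pred_to_gt_mapping = {0: 0, 1: 2, 2: 1, 3: -1}   # pred index -> matched gt index (-1 = unmatched)
len_pred = len(pred_to_gt_mapping)

dp = np.ones(len_pred)
for i in range(len_pred):
    for j in range(i):
        if pred_to_gt_mapping[i] == -1 or pred_to_gt_mapping[j] == -1:
            continue
        if pred_to_gt_mapping[i] > pred_to_gt_mapping[j]:
            dp[i] = max(dp[i], dp[j] + 1)
print(int(max(dp)))  # 2 actions can be kept in a consistent order
```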
import json
from numpy import mean
from mmengine import load
import numpy as np
import json
import re
from tqdm import tqdm
from ..schema import ResponseDataSample
from ..utils.format_load import format_load
from sentence_transformers import SentenceTransformer, util
def input_postprocess(text: str) -> str:
if isinstance(text, str):
text = text.split('<|')[0]
text = text.split('<eoa>\n')[0]
text = text.split('<TOKENS_UNUSED_1>\n')[0]
text = text.split('<|im_end|>')[0]
if len(text) > 1 and text[:2] == '{{' and text[-2:] == '}}':
text = text[1:-1]
while len(text) > 0 and text[-1] == '\n':
text = text[:-1]
return str(text)
class ReasonRetrieveUnderstandEvaluator:
"""Planning Evaluation
Args:
dataset_path(str): File path of evaluation dataset
bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2".
Refer to https://www.sbert.net/docs/pretrained_models.html for more models.
"""
def __init__(
self,
dataset_path: str,
bert_score_model: str = "all-mpnet-base-v2", # ['thenlper/gte-large-zh', 'all-mpnet-base-v2']
default_prompt_type: str = 'json',
eval_type: str = 'reason',
**kwargs,
) -> None:
self.bert_score_model = bert_score_model
print(bert_score_model)
self.dataset_path = dataset_path
# self.bertscore = evaluate.load('bertscore')
self.default_prompt_type = default_prompt_type # ["json", "str"]
self.eval_type = eval_type
self.valid_data_count = None
self.sentence_model = SentenceTransformer(self.bert_score_model)
def _load_dataset(self):
self.dataset = []
dataset = load(self.dataset_path)
total_error = 0
total_count = 0
for key in dataset.keys():
datum = dataset[key]
data_sample, error = self._process_response(datum)
total_error += error
total_count += 1
self.dataset.append(
dict(response_data_sample=data_sample))
self.num_samples = len(self.dataset)
# print("total_data_count:", total_count, "valid_data_count:", total_count - total_error)
self.valid_data_count = total_count - total_error
def format_load(self, data):
r'''
ensure evaluator can work correctly under any data input
'''
try:
json_format = format_load(data, start_character='{', end_character='}')
except Exception as e:
return {}
if type(json_format) != dict:
return {}
prepared_json_format = dict()
try:
prepared_json_format['thought'] = str(json_format['thought'])
except Exception as e:
prepared_json_format['thought'] = ''
try:
prepared_json_format['name'] = str(json_format['name'])
except Exception as e:
prepared_json_format['name'] = ''
if self.default_prompt_type == 'json':
try:
if isinstance(json_format['args'], dict):
prepared_json_format['args'] = json_format['args']
else:
prepared_json_format['args'] = dict()
except:
prepared_json_format['args'] = dict()
else:
try:
prepared_json_format['args'] = str(json_format['args'])
except Exception as e:
prepared_json_format['args'] = ""
return prepared_json_format
def _process_response(
self,
datum,
) -> ResponseDataSample:
"""Process the response to needed format.
Args:
datum(dict): inputs.
Returns:
dict: Processed response data sample.
"""
# Generated response, which can be a string or list
pred_data = datum['prediction']
# Response of ground truth, which can be a string or list
gt_data = datum['ground_truth']
# prompt_type: The type of planning prompt, supporting "json" and "ReWOO"
if "meta_data" in datum:
prompt_type = datum["meta_data"].get("response_format", self.default_prompt_type)
else:
prompt_type = self.default_prompt_type
error = 0
gt = self.format_load(gt_data)
# pred_data = input_postprocess(pred_data)
if prompt_type == 'json':
pred = self.format_load(pred_data)
if pred == {} or gt == {}:
error = 1
elif prompt_type == 'str':
# choose the first line
pred = dict()
if self.eval_type == 'reason':
pred['thought'] = pred_data
if self.eval_type == 'retrieve':
pred['name'] = pred_data
if self.eval_type == 'understand':
pred['args'] = pred_data
else:
raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}")
if error == 1:
pred = dict()
return ResponseDataSample(template = '', pred=pred, gt=gt), error
def _evaluate(self, data_sample):
"""Evaluate the response data sample.
"""
# To enable batch evaluation, the evaluator is written at post_process.
return data_sample
def evaluate(self):
self._load_dataset()
results_list = []
for data_sample in tqdm(self.dataset):
metrics_result = self._evaluate(
data_sample['response_data_sample'])
results_list.append(metrics_result)
return self._post_process(results_list)
def find_a_dot_b_structure(self, text):
# find a.b structure
pattern = r'\w+\.\w+'
return re.findall(pattern, text)
def find_FinishAction(self, text):
# find FinishAction
pattern = r'FinishAction'
return re.findall(pattern, text)
def _post_process(self, results_list):
# list of dict to dict of list
if self.default_prompt_type == 'json':
metric_keys = ['thought', 'name', 'args', 'parse_rate']
if self.default_prompt_type == 'str':
if self.eval_type == 'reason':
metric_keys = ['thought', 'parse_rate']
if self.eval_type == 'retrieve':
metric_keys = ['name', 'parse_rate']
if self.eval_type == 'understand':
metric_keys = ['args', 'parse_rate']
metrics_results = []
batch_data = []; batch_arg_data = []
batch_id = []; batch_arg_id = []
BATCH_LIMIT = 32
for id, data in enumerate(results_list):
metrics_results.append(
{metric_keys[x]: 0 for x in range(len(metric_keys))}
)
if len(data.pred.keys()) != 0:
metrics_results[id]['parse_rate'] = 1
if 'thought' in data.pred and 'thought' in data.gt:
batch_data.extend([data.pred['thought'], data.gt['thought']])
batch_id.extend([id])
if len(batch_data) >= BATCH_LIMIT:
pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True)
for i in range(0, len(batch_data), 2):
cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0)
metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0]
batch_data = []
batch_id = []
if 'name' in data.pred and 'name' in data.gt:
if self.default_prompt_type == 'json':
if data.pred['name'] == data.gt['name']:
metrics_results[id]['name'] = 1
else:
metrics_results[id]['name'] = 0
else:
if data.gt['name'] not in data.pred['name']:
metrics_results[id]['name'] = 0
else:
metrics_results[id]['name'] = 1
find_all_name = self.find_a_dot_b_structure(data.pred['name']) + self.find_FinishAction(data.pred['name'])
for name in find_all_name:
if name != data.gt['name']:
metrics_results[id]['name'] = 0
if 'args' in data.pred and 'args' in data.gt:
batch_arg_data.extend([str(data.pred['args']), str(data.gt['args'])])
batch_arg_id.extend([id])
if len(batch_arg_data) >= BATCH_LIMIT:
pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True)
for i in range(0, len(batch_arg_data), 2):
cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0)
metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0]
batch_arg_data = []
batch_arg_id = []
if len(batch_data) > 0:
pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True)
for i in range(0, len(batch_data), 2):
cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0)
metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0]
batch_data = []
batch_id = []
if len(batch_arg_data) > 0:
pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True)
for i in range(0, len(batch_arg_data), 2):
cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i+1]).cpu().numpy(), 0)
metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0]
batch_arg_data = []
batch_arg_id = []
results = dict()
for key in metric_keys:
results[key] = mean([metrics_result[key] for metrics_result in metrics_results])
return results
class ReasonRetrieveUnderstandEvaluatorNoBatch:
"""Planning Evaluation
Args:
dataset_path(str): File path of evaluation dataset
bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2".
Refer to https://www.sbert.net/docs/pretrained_models.html for more models.
"""
def __init__(
self,
dataset_path: str,
bert_score_model: str = "all-mpnet-base-v2",
default_prompt_type: str = 'json',
eval_type: str = 'reason',
) -> None:
self.bert_score_model = bert_score_model
self.dataset_path = dataset_path
# self.bertscore = evaluate.load('bertscore')
self.default_prompt_type = default_prompt_type # ["json", "str"]
self.eval_type = eval_type
self.valid_data_count = None
self.sentence_model = SentenceTransformer(self.bert_score_model)
def _load_dataset(self):
self.dataset = []
dataset = load(self.dataset_path)
total_error = 0
total_count = 0
for key in dataset.keys():
datum = dataset[key]
data_sample, error = self._process_response(datum)
total_error += error
total_count += 1
self.dataset.append(
dict(response_data_sample=data_sample))
self.num_samples = len(self.dataset)
# print("total_data_count:", total_count, "valid_data_count:", total_count - total_error)
self.valid_data_count = total_count - total_error
def format_load(self, data):
r'''
ensure evaluator can work correctly under any data input
'''
if type(data) == dict:
json_format = data
else:
try:
json_format = json.loads(data) #json.loads(pred_data)
except Exception as e:
return {}
if type(json_format) != dict:
return {}
prepared_json_format = dict()
try:
prepared_json_format['thought'] = str(json_format['thought'])
except Exception as e:
prepared_json_format['thought'] = ''
try:
prepared_json_format['name'] = str(json_format['name'])
except Exception as e:
prepared_json_format['name'] = ''
try:
if prepared_json_format["name"] != "FinishAction":
arg_inputs = json_format["args"]
if type(arg_inputs) == str:
arg_inputs = json.loads(arg_inputs)
if type(arg_inputs) == dict:
prepared_json_format['args'] = arg_inputs
else:
prepared_json_format["args"] = {}
else:
prepared_json_format["args"] = {}
except Exception as e:
prepared_json_format['args'] = {}
return prepared_json_format
def _process_response(
self,
datum,
) -> ResponseDataSample:
"""Process the response to needed format.
Args:
datum(dict): inputs.
Returns:
dict: Processed response data sample.
"""
# Generated response, which can be a string or list
pred_data = datum['prediction']
# Response of ground truth, which can be a string or list
gt_data = datum['ground_truth']
# prompt_type: The type of planning prompt, supporting "json" and "ReWOO"
if "meta" in datum:
prompt_type = datum["meta"].get("response_format", self.default_prompt_type)
else:
prompt_type = self.default_prompt_type
error = 0
gt = self.format_load(gt_data)
# pred_data = input_postprocess(pred_data)
if prompt_type == 'json':
# pred_data = pred_data.replace('\'', '\"')
pred = self.format_load(pred_data)
if pred == {} or gt == {}:
error = 1
elif prompt_type == 'str':
# choose the first line
pred = dict()
if self.eval_type == 'reason':
pred['thought'] = pred_data
if self.eval_type == 'retrieve':
pred['name'] = pred_data
if self.eval_type == 'understand':
# pred_data = pred_data.replace('\'', '\"')
# try:
# pred['args'] = json.loads(pred_data)
# if type(pred['args']) != dict:
# pred['args'] = {}
# except Exception as e:
# error = 1
pred['args'] = pred_data
else:
raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}")
if error == 1:
pred = dict()
return ResponseDataSample(template = '', pred=pred, gt=gt), error
def _evaluate(self, data_sample) -> dict:
"""Evaluate the response data sample.
"""
metrics_result = {
'thought': 0,
'name': 0,
'args_precision': 0,
'args_recall': 0,
'args_f1_score': 0,
'parse_rate': 0,
}
if 'thought' in data_sample.pred and 'thought' in data_sample.gt:
pred_emb = self.sentence_model.encode(data_sample.pred['thought'], convert_to_tensor=True)
gt_emb = self.sentence_model.encode(data_sample.gt['thought'], convert_to_tensor=True)
cosine_scores = np.maximum(util.cos_sim(pred_emb, gt_emb).cpu().numpy(), 0)
metrics_result['thought'] = cosine_scores[0, 0]
if 'name' in data_sample.pred and 'name' in data_sample.gt:
if data_sample.pred['name'] == data_sample.gt['name']:
metrics_result['name'] = 1
else:
metrics_result['name'] = 0
if 'args' in data_sample.pred and 'args' in data_sample.gt:
gt_num_keys = len(data_sample.gt['args'].keys())
pred_num_keys = len(data_sample.pred['args'].keys())
if pred_num_keys == 0 and gt_num_keys == 0:
metrics_result['args_precision'] = 1
metrics_result['args_recall'] = 1
metrics_result['args_f1_score'] = 1
elif pred_num_keys == 0 or gt_num_keys == 0:
metrics_result['args_precision'] = 0
metrics_result['args_recall'] = 0
metrics_result['args_f1_score'] = 0
else:
correct_count = 0
for key in data_sample.gt['args'].keys():
if key in data_sample.pred['args'] and str(data_sample.pred['args'][key]) == str(data_sample.gt['args'][key]):
correct_count += 1
metrics_result['args_precision'] = correct_count / pred_num_keys
metrics_result['args_recall'] = correct_count / gt_num_keys
if metrics_result['args_precision'] + metrics_result['args_recall'] == 0:
metrics_result['args_f1_score'] = 0
else:
metrics_result['args_f1_score'] = 2 * metrics_result['args_precision'] * metrics_result['args_recall'] / \
(metrics_result['args_precision'] + metrics_result['args_recall'])
if len(data_sample.pred.keys()) == 0:
metrics_result['parse_rate'] = 0
else:
metrics_result['parse_rate'] = 1
return metrics_result
def evaluate(self):
self._load_dataset()
results_list = []
for data_sample in tqdm(self.dataset):
metrics_result = self._evaluate(
data_sample['response_data_sample'])
results_list.append(metrics_result)
return self._post_process(results_list)
def _post_process(self, results_list):
# list of dict to dict of list
results = dict()
if self.default_prompt_type == 'json':
metric_keys = ['thought', 'name', 'args_precision', 'args_recall', 'args_f1_score', 'parse_rate']
if self.default_prompt_type == 'str':
if self.eval_type == 'reason':
metric_keys = ['thought', 'parse_rate']
if self.eval_type == 'retrieve':
metric_keys = ['name', 'parse_rate']
if self.eval_type == 'understand':
metric_keys = ['args_precision', 'args_recall', 'args_f1_score', 'parse_rate']
for key in metric_keys:
results[key] = mean([result[key] for result in results_list])
return results
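A worked example of the args precision/recall/F1 computed in `_evaluate` above: a ground-truth key counts as correct only if it is present in the prediction and its value matches exactly after `str()`. The argument dictionaries are made up:

```python
# Worked example of the args precision / recall / F1 from _evaluate; args are made up.
gt_args = {'city': 'Berlin', 'unit': 'celsius', 'days': 3}
pred_args = {'city': 'Berlin', 'days': '3', 'lang': 'en'}

correct = sum(1 for key, value in gt_args.items()
              if key in pred_args and str(pred_args[key]) == str(value))
precision = correct / len(pred_args)               # 2 / 3
recall = correct / len(gt_args)                    # 2 / 3
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
print(correct, round(precision, 4), round(recall, 4), round(f1, 4))
```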
from collections import defaultdict
from mmengine import load
from ..schema import ResponseDataSample
import numpy as np
from ..utils.format_load import format_load
class ReviewEvaluator:
"""Review Capability Evaluation
Args:
dataset_path(str): File path of evaluation dataset.
"""
def __init__(
self,
dataset_path: str,
# bert_score_model: str = "all-mpnet-base-v2",
**kwargs,
) -> None:
self.dataset_path = dataset_path
# self.bert_score_model = bert_score_model
# self.sentence_model = SentenceTransformer(self.bert_score_model)
def _load_dataset(self):
self.dataset = []
dataset = load(self.dataset_path)
for key in dataset.keys():
datum = dataset[key]
data_sample = self._process_response(datum)
self.dataset.append(
dict(
origin_prompt=datum['origin_prompt'],
response_data_sample=data_sample))
self.num_samples = len(self.dataset)
def _process_response(
self,
datum: dict,
) -> ResponseDataSample:
"""Process the response to needed format.
Args:
datum(dict): inputs.
Returns:
dict: Processed response data sample.
"""
template = datum['template']
pred_data = datum['prediction']
gt_data = datum['ground_truth']['answer']
meta_data = datum['meta_data']
if meta_data['response_format'] == 'json':
pred_data = self.json_format_parse(pred_data)
else:
pred_data = pred_data[pred_data.find(":") + 1:]
pred_data = pred_data.strip()
if len(pred_data) > 0 and pred_data[0] in ['A', 'B', 'C', 'D', 'E']:
pred_data = pred_data[0]
else:
pred_data = None
return ResponseDataSample(
template=template, pred=pred_data, gt=gt_data, meta_data=meta_data)
def _evaluate(self, data_sample) -> dict:
metrics_result = dict(
parse_rate=0,
review_quality=0,
)
pred_data = data_sample.pred
if pred_data is not None:
# import pdb; pdb.set_trace()
metrics_result['review_quality'] = 1.0 if pred_data == \
data_sample.gt else 0.0
metrics_result['parse_rate'] = 1.0
return metrics_result
# def compute_sen_similarity(self, gt, pred):
# gt_embed = self.sentence_model.encode(gt, convert_to_tensor=True)
# pred_embed = self.sentence_model.encode(pred, convert_to_tensor=True)
# sen_sim = max(0, util.cos_sim(gt_embed, pred_embed).item())
# return sen_sim
def json_format_parse(self, pred_data):
try:
data = format_load(pred_data)
except Exception as e:
return None
try:
new_data = dict()
new_data['review'] = data['is_finished']
assert new_data['review'] in [True, False]
except Exception as e:
return None
return new_data
def evaluate(self):
self._load_dataset()
results_list = []
for data_sample in self.dataset:
metrics_result = self._evaluate(
data_sample['response_data_sample'])
results_list.append(metrics_result)
return self._post_process(results_list)
def _post_process(self, results_list):
# list of dict to dict of list
results_dict = defaultdict(list)
for sub in results_list:
    for key in sub:
        results_dict[key].append(sub[key])
metric_list = ['parse_rate', 'review_quality']
for metric in metric_list:
results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4)
return results_dict
from dataclasses import asdict, dataclass, field
from typing import Any, Dict
@dataclass
class ResponseDataSample:
"""
Args:
template(str): Format string with keyword-only arguments. For
example '{who} like {what}'
pred(Any): Parsed data from LLM generating response.
gt(Any): Ground truth data
meta_data(dict, optional): Meta information will be used to evaluate
LLM's response
"""
template: str
pred: Any
gt: Any
meta_data: dict = None
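Minimal usage example for the dataclass above, reusing the `asdict` import at the top of the file; the field values are made up:

```python
# Minimal usage example for ResponseDataSample; the values are made up.
sample = ResponseDataSample(
    template='',
    pred={'name': 'weather_api.query', 'args': {'city': 'Berlin'}},
    gt={'name': 'weather_api.query', 'args': {'city': 'Berlin'}},
    meta_data={'response_format': 'json'},
)
print(asdict(sample)['meta_data'])  # {'response_format': 'json'}
```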