OpenDAS / opencompass — Commit d34ba111 (unverified)
Authored Feb 05, 2024 by Fengzhe Zhou; committed by GitHub on Feb 05, 2024

[Sync] Merge branch 'dev' into zfz/update-keyset-demo (#876)

parent 32b5948f
Changes: 97 · Showing 20 changed files with 1526 additions and 44 deletions (+1526 -44)
configs/summarizers/agent_bench.py  (+56 -24)
configs/summarizers/compass_knowledge.py  (+0 -3)
configs/summarizers/compassbench_v1_language.py  (+4 -4)
configs/summarizers/compassbench_v1_reason.py  (+34 -11)
configs/summarizers/groups/plugineval.py  (+37 -0)
configs/summarizers/groups/teval.py  (+74 -0)
configs/summarizers/leaderboard.py  (+14 -0)
configs/summarizers/mathbench_v1.py  (+23 -0)
configs/summarizers/plugineval.py  (+36 -0)
configs/summarizers/teval.py  (+36 -0)
opencompass/datasets/__init__.py  (+1 -0)
opencompass/datasets/gsm8k.py  (+1 -1)
opencompass/datasets/mbpp.py  (+4 -1)
opencompass/datasets/teval/__init__.py  (+58 -0)
opencompass/datasets/teval/evaluators/__init__.py  (+5 -0)
opencompass/datasets/teval/evaluators/instruct_evaluator.py  (+152 -0)
opencompass/datasets/teval/evaluators/planning_evaluator.py  (+394 -0)
opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py  (+455 -0)
opencompass/datasets/teval/evaluators/review_evaluator.py  (+123 -0)
opencompass/datasets/teval/schema.py  (+19 -0)
configs/summarizers/agent_bench.py

@@ -18,30 +18,30 @@ agent_summary_groups = [
 summarizer = dict(
     dataset_abbrs=[
-        'agent',
-        'math_acc_1_and_fill_in_blank-native',
-        'math_perf_4_and_fill_in_blank-native',
-        # '######## MathBench-Agent Accuracy ########', # category
-        'math_acc_1_and_fill_in_blank-agent',
-        'math_perf_4_and_fill_in_blank-agent',
-        # '######## CIBench Template ########', # category
-        'cibench_template:executable',
-        'cibench_template:numeric_correct',
-        'cibench_template:text_score',
-        'cibench_template:vis_sim',
-        # '######## CIBench Template Chinese ########', # category
-        'cibench_template_cn:executable',
-        'cibench_template_cn:numeric_correct',
-        'cibench_template_cn:text_score',
-        'cibench_template_cn:vis_sim',
-        # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
-        'cibench_template_wo_nltk:executable',
-        'cibench_template_wo_nltk:numeric_correct',
-        'cibench_template_wo_nltk:vis_sim',
-        # '######## CIBench Template Chinese w/o NLTK ########', # category
-        'cibench_template_cn_wo_nltk:executable',
-        'cibench_template_cn_wo_nltk:numeric_correct',
-        'cibench_template_cn_wo_nltk:vis_sim',
+        # 'agent',
+        # 'math_acc_1_and_fill_in_blank-native',
+        # 'math_perf_4_and_fill_in_blank-native',
+        # # '######## MathBench-Agent Accuracy ########', # category
+        # 'math_acc_1_and_fill_in_blank-agent',
+        # 'math_perf_4_and_fill_in_blank-agent',
+        # # '######## CIBench Template ########', # category
+        # 'cibench_template:executable',
+        # 'cibench_template:numeric_correct',
+        # 'cibench_template:text_score',
+        # 'cibench_template:vis_sim',
+        # # '######## CIBench Template Chinese ########', # category
+        # 'cibench_template_cn:executable',
+        # 'cibench_template_cn:numeric_correct',
+        # 'cibench_template_cn:text_score',
+        # 'cibench_template_cn:vis_sim',
+        # # '######## CIBench Template w/o NLTK ########', # category no text score becase it is only for nltk
+        # 'cibench_template_wo_nltk:executable',
+        # 'cibench_template_wo_nltk:numeric_correct',
+        # 'cibench_template_wo_nltk:vis_sim',
+        # # '######## CIBench Template Chinese w/o NLTK ########', # category
+        # 'cibench_template_cn_wo_nltk:executable',
+        # 'cibench_template_cn_wo_nltk:numeric_correct',
+        # 'cibench_template_cn_wo_nltk:vis_sim',
         # '######## T-Eval ########', # category
         ['plugin_eval-p10', 'naive_average'],
         ['plugin_eval-p10-instruct_v1', 'format_metric'],

@@ -68,6 +68,38 @@ summarizer = dict(
         ['plugin_eval-p10-understand_str_v1_zh', 'args'],
         ['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'],
         ['plugin_eval-p10-review_str_v1_zh', 'review_quality'],
+        # '######## MUS-T-Eval ########', # category
+        ['plugin_eval-mus-p10', 'naive_average'],
+        ['plugin_eval-mus-p10-instruct_v1', 'format_metric'],
+        ['plugin_eval-mus-p10-instruct_v1', 'args_em_metric'],
+        ['plugin_eval-mus-p10-plan_str_v1', 'f1_score'],
+        ['plugin_eval-mus-p10-plan_json_v1', 'f1_score'],
+        ['plugin_eval-mus-p10-reason_str_v1', 'thought'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'thought'],
+        ['plugin_eval-mus-p10-retrieve_str_v1', 'name'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'name'],
+        ['plugin_eval-mus-p10-understand_str_v1', 'args'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1', 'args'],
+        ['plugin_eval-mus-p10-review_str_v1', 'review_quality'],
+        ['plugin_eval-mus-p10_zh', 'naive_average'],
+        ['plugin_eval-mus-p10-instruct_v1_zh', 'format_metric'],
+        ['plugin_eval-mus-p10-instruct_v1_zh', 'args_em_metric'],
+        ['plugin_eval-mus-p10-plan_str_v1_zh', 'f1_score'],
+        ['plugin_eval-mus-p10-plan_json_v1_zh', 'f1_score'],
+        ['plugin_eval-mus-p10-reason_str_v1_zh', 'thought'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
+        ['plugin_eval-mus-p10-retrieve_str_v1_zh', 'name'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'name'],
+        ['plugin_eval-mus-p10-understand_str_v1_zh', 'args'],
+        ['plugin_eval-mus-p10-reason_retrieve_understand_json_v1_zh', 'args'],
+        ['plugin_eval-mus-p10-review_str_v1_zh', 'review_quality'],
+        # ['plugin_eval-p10', 'naive_average'],
+        # ['plugin_eval-mus-p10', 'naive_average'],
+        # ['plugin_eval-p10_zh', 'naive_average'],
+        # ['plugin_eval-mus-p10_zh', 'naive_average'],
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
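Each of these summarizer configs closes with the same locals()-scanning idiom for summary_groups. A minimal, self-contained sketch of what that line does (the two toy *_summary_groups lists below are invented for illustration, not taken from the commit):

    # Toy stand-ins for group lists that a real config would import or define.
    agent_summary_groups = [{'name': 'agent', 'subsets': ['plugin_eval-p10']}]
    teval_summary_groups = [{'name': 'teval', 'subsets': ['teval-instruct_v1']}]

    # The outermost iterable of the comprehension is evaluated in module scope,
    # so locals() here is the module namespace; every variable whose name ends
    # with "_summary_groups" is collected and flattened into one list.
    summary_groups = sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], [])

    print(len(summary_groups))  # 2 -- both toy lists concatenated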
configs/summarizers/compass_knowledge.py

 # This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
 compassbench_v1_knowledge_names = [
     'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
-    'compassbench_v1_knowledge-engineering-single_choice_cn_circular',
     'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
     'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
     'compassbench_v1_knowledge-social_science-single_choice_cn_circular',

@@ -19,7 +18,6 @@ summarizer = dict(
         'knowledge_acc_1_and_cloze',
         ['knowledge_cn', 'acc_1'],
         ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
-        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'acc_1'],
         ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
         ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
         ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],

@@ -28,7 +26,6 @@ summarizer = dict(
         'knowledge_perf_4_and_cloze',
         ['knowledge_cn', 'perf_4'],
         ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
-        ['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'perf_4'],
         ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
         ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
         ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
configs/summarizers/compassbench_v1_language.py

@@ -37,8 +37,8 @@ summarizer = dict(
         'language_acc_1_and_non_mcq',
         'language_en_acc_1_and_non_mcq',
         'language_zh_acc_1_and_non_mcq',
-        ['information_retrieval_en', 'score'],
-        ['information_retrieval_zh', 'score'],
+        # ['information_retrieval_en', 'score'],
+        # ['information_retrieval_zh', 'score'],
         ['intention_recognition_en_circular', 'acc_origin'],
         ['intention_recognition_zh_circular', 'acc_origin'],
         ['sentiment_analysis_en_circular', 'acc_origin'],

@@ -54,8 +54,8 @@ summarizer = dict(
         'language_perf_4_and_non_mcq',
         'language_en_perf_4_and_non_mcq',
         'language_zh_perf_4_and_non_mcq',
-        ['information_retrieval_en', 'score'],
-        ['information_retrieval_zh', 'score'],
+        # ['information_retrieval_en', 'score'],
+        # ['information_retrieval_zh', 'score'],
         ['intention_recognition_en_circular', 'perf_circular'],
         ['intention_recognition_zh_circular', 'perf_circular'],
         ['sentiment_analysis_en_circular', 'perf_circular'],
configs/summarizers/compassbench_v1_reason.py

 compassbench_v1_reason_groups = [
-    {'name': 'reasonbench_cn_logic_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular', 'reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular', 'reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
-    {'name': 'reasonbench_en_logic_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular', 'reasonbench_en_deductive_bbh7obj_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_deductive_ocnli_translated_circular', 'reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
-    {'name': 'reasonbench', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_logic_circular', 'reasonbench_en_commonsense_circular', 'reasonbench_en_logic_circular']},
+    {'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']},
+    {'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']},
+    {'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']},
+    {'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
+    {'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
+    {'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']},
+    {'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']},
+    {'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']},
 ]

 summarizer = dict(
     dataset_abbrs=[
         ['reasonbench', 'acc_origin'],
+        ['reasonbench_cn_circular', 'acc_origin'],
+        ['reasonbench_en_circular', 'acc_origin'],
         ['reasonbench_cn_commonsense_circular', 'acc_origin'],
+        ['reasonbench_cn_abductive_circular', 'acc_origin'],
+        ['reasonbench_cn_deductive_circular', 'acc_origin'],
+        ['reasonbench_cn_inductive_circular', 'acc_origin'],
         ['reasonbench_en_commonsense_circular', 'acc_origin'],
-        ['reasonbench_cn_logic_circular', 'acc_origin'],
-        ['reasonbench_en_logic_circular', 'acc_origin'],
+        ['reasonbench_en_abductive_circular', 'acc_origin'],
+        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
+        ['reasonbench_en_inductive_circular', 'acc_origin'],
+        ['reasonbench_cn_commonsense_circular', 'acc_origin'],
         ['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
         ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
         ['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
         ['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
         ['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
+        ['reasonbench_en_commonsense_circular', 'acc_origin'],
         ['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
-        ['reasonbench_en_deductive_bbh7obj_circular', 'acc_origin'],
         ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
-        ['reasonbench_en_deductive_ocnli_translated_circular', 'acc_origin'],
         ['reasonbench_en_inductive_deer_circular', 'acc_origin'],
         ['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],

         ['reasonbench', 'perf_circular'],
+        ['reasonbench_cn_circular', 'perf_circular'],
+        ['reasonbench_en_circular', 'perf_circular'],
         ['reasonbench_cn_commonsense_circular', 'perf_circular'],
+        ['reasonbench_cn_abductive_circular', 'perf_circular'],
+        ['reasonbench_cn_deductive_circular', 'perf_circular'],
+        ['reasonbench_cn_inductive_circular', 'perf_circular'],
         ['reasonbench_en_commonsense_circular', 'perf_circular'],
-        ['reasonbench_cn_logic_circular', 'perf_circular'],
-        ['reasonbench_en_logic_circular', 'perf_circular'],
+        ['reasonbench_en_abductive_circular', 'perf_circular'],
+        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
+        ['reasonbench_en_inductive_circular', 'perf_circular'],
+        ['reasonbench_cn_commonsense_circular', 'perf_circular'],
         ['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
         ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
         ['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
         ['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
         ['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
+        ['reasonbench_en_commonsense_circular', 'perf_circular'],
         ['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
-        ['reasonbench_en_deductive_bbh7obj_circular', 'perf_circular'],
         ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
-        ['reasonbench_en_deductive_ocnli_translated_circular', 'perf_circular'],
         ['reasonbench_en_inductive_deer_circular', 'perf_circular'],
         ['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
     ],
configs/summarizers/groups/plugineval.py

@@ -17,6 +17,28 @@ _base_summary_groups = [
             ['plugin_eval-instruct_v1', 'json_args_em_metric'],
         ]
     },
+    {
+        'name': 'plugin_eval-instruct_v1',
+        'metric': 'string_metric',
+        'subsets': [
+            ['plugin_eval-instruct_v1', 'string_format_metric'],
+            ['plugin_eval-instruct_v1', 'string_args_em_metric'],
+        ]
+    },
+    {
+        'name': 'plugin_eval-instruct_v1',
+        'metric': 'json_metric',
+        'subsets': [
+            ['plugin_eval-instruct_v1', 'json_format_metric'],
+            ['plugin_eval-instruct_v1', 'json_args_em_metric'],
+        ]
+    },
+    {
+        'name': 'copy_plugin_eval-review_str_v1',
+        'subsets': [
+            ['plugin_eval-review_str_v1', 'review_quality'],
+        ],
+    },
     {
         'name': 'plugin_eval',
         'subsets': [

@@ -31,6 +53,7 @@ _base_summary_groups = [
             ['plugin_eval-understand_str_v1', 'args'],
             ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
             ['plugin_eval-review_str_v1', 'review_quality'],
+            ['copy_plugin_eval-review_str_v1', 'naive_average'],  # a hack for review * 2
         ]
     },
 ]

@@ -62,3 +85,17 @@ for group in _base_summary_groups:
     group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-p10') + '_zh'
     group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-p10') + '_zh', subset[1]] for subset in group['subsets']]
     plugineval_summary_groups.append(group)
+
+# base -mus-p10-
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-mus-p10')
+    group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-mus-p10'), subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
+
+# base -mus-p10- _zh
+for group in _base_summary_groups:
+    group = deepcopy(group)
+    group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh'
+    group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh', subset[1]] for subset in group['subsets']]
+    plugineval_summary_groups.append(group)
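The loops added above all follow one pattern: deep-copy each base group, then rewrite the plugin_eval prefix (and optionally append _zh) in both the group name and its subsets. A runnable sketch of a single pass over a toy base group:

    from copy import deepcopy

    # One invented base group, shaped like the entries in _base_summary_groups.
    _base_summary_groups = [
        {'name': 'plugin_eval', 'subsets': [['plugin_eval-plan_json_v1', 'f1_score']]},
    ]

    plugineval_summary_groups = []
    for group in _base_summary_groups:
        group = deepcopy(group)  # keep the base definition untouched
        group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh'
        group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-mus-p10') + '_zh', subset[1]]
                            for subset in group['subsets']]
        plugineval_summary_groups.append(group)

    print(plugineval_summary_groups[0]['name'])        # plugin_eval-mus-p10_zh
    print(plugineval_summary_groups[0]['subsets'][0])  # ['plugin_eval-mus-p10-plan_json_v1_zh', 'f1_score']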
configs/summarizers/groups/teval.py (new file)

from copy import deepcopy

_base_summary_groups = [
    {
        'name': 'teval-instruct_v1',
        'metric': 'format_metric',
        'subsets': [
            ['teval-instruct_v1', 'string_format_metric'],
            ['teval-instruct_v1', 'json_format_metric'],
        ]
    },
    {
        'name': 'teval-instruct_v1',
        'metric': 'args_em_metric',
        'subsets': [
            ['teval-instruct_v1', 'string_args_em_metric'],
            ['teval-instruct_v1', 'json_args_em_metric'],
        ]
    },
    {
        'name': 'teval-instruct_v1',
        'metric': 'string_metric',
        'subsets': [
            ['teval-instruct_v1', 'string_format_metric'],
            ['teval-instruct_v1', 'string_args_em_metric'],
        ]
    },
    {
        'name': 'teval-instruct_v1',
        'metric': 'json_metric',
        'subsets': [
            ['teval-instruct_v1', 'json_format_metric'],
            ['teval-instruct_v1', 'json_args_em_metric'],
        ]
    },
    {
        'name': 'copy_teval-review_str_v1',
        'subsets': [
            ['teval-review_str_v1', 'review_quality'],
        ],
    },
    {
        'name': 'teval',
        'subsets': [
            ['teval-instruct_v1', 'format_metric'],
            ['teval-instruct_v1', 'args_em_metric'],
            ['teval-plan_str_v1', 'f1_score'],
            ['teval-plan_json_v1', 'f1_score'],
            ['teval-reason_str_v1', 'thought'],
            ['teval-reason_retrieve_understand_json_v1', 'thought'],
            ['teval-retrieve_str_v1', 'name'],
            ['teval-reason_retrieve_understand_json_v1', 'name'],
            ['teval-understand_str_v1', 'args'],
            ['teval-reason_retrieve_understand_json_v1', 'args'],
            ['teval-review_str_v1', 'review_quality'],
            ['copy_teval-review_str_v1', 'naive_average'],  # a hack for review * 2
        ]
    },
]

teval_summary_groups = []

# base
for group in _base_summary_groups:
    group = deepcopy(group)
    teval_summary_groups.append(group)

# base _zh
for group in _base_summary_groups:
    group = deepcopy(group)
    group['name'] = group['name'] + '_zh'
    group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
    teval_summary_groups.append(group)
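The 'copy_teval-review_str_v1' group above exists only to realise the "a hack for review * 2" comment: surfacing the same review score under a second name makes it count twice in a naive average over the subsets. A toy illustration with invented numbers:

    # Invented per-subset scores for one model; not real benchmark results.
    scores = {
        'teval-plan_json_v1/f1_score': 80.0,
        'teval-review_str_v1/review_quality': 60.0,
        'copy_teval-review_str_v1/naive_average': 60.0,  # same value, second name
    }
    print(sum(scores.values()) / len(scores))  # 66.67 -- review is weighted twice vs. plan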
configs/summarizers/leaderboard.py

@@ -12,8 +12,22 @@ with read_base():
     from .groups.xiezhi import xiezhi_summary_groups

+other_summary_groups = []
+other_summary_groups.append({'name': 'Exam', 'subsets': ["ceval", 'agieval', 'mmlu', 'cmmlu', "GaokaoBench", 'ARC-c', 'ARC-e']})
+other_summary_groups.append({'name': 'Language', 'subsets': ['WiC', 'chid-dev', 'afqmc-dev', 'WSC', 'tydiqa-goldp', 'flores_100']})
+other_summary_groups.append({'name': 'Knowledge', 'subsets': ['BoolQ', 'commonsense_qa', 'triviaqa', 'nq']})
+other_summary_groups.append({'name': 'Understanding', 'subsets': ['C3', 'race-middle', 'race-high', 'openbookqa_fact', 'csl_dev', 'lcsts', 'Xsum', 'eprstmt-dev', 'lambada']})
+other_summary_groups.append({'name': 'Reasoning', 'subsets': ['cmnli', 'ocnli', 'AX_b', 'AX_g', 'RTE', 'COPA', 'ReCoRD', 'hellaswag', 'piqa', 'siqa', 'math', 'gsm8k', 'drop', 'openai_humaneval', 'mbpp', "bbh"]})
+other_summary_groups.append({'name': 'Overall', 'subsets': ['Exam', 'Language', 'Knowledge', 'Understanding', 'Reasoning']})
+
 summarizer = dict(
     dataset_abbrs=[
+        'Overall',
+        'Exam',
+        'Language',
+        'Knowledge',
+        'Understanding',
+        'Reasoning',
         '--------- 考试 Exam ---------',  # category
         # 'Mixed',  # subcategory
         "ceval",
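The added 'Overall' group averages over the category groups, which are themselves averages over datasets, so the leaderboard score is a two-level mean. A sketch of the arithmetic with invented scores (this is not OpenCompass's summarizer code, just the idea):

    # Toy per-dataset scores.
    per_dataset = {'ceval': 50.0, 'mmlu': 70.0, 'BoolQ': 80.0, 'nq': 40.0}
    groups = {'Exam': ['ceval', 'mmlu'], 'Knowledge': ['BoolQ', 'nq']}

    # First average inside each category, then average the categories.
    category = {name: sum(per_dataset[d] for d in subsets) / len(subsets)
                for name, subsets in groups.items()}
    overall = sum(category.values()) / len(category)
    print(category)  # {'Exam': 60.0, 'Knowledge': 60.0}
    print(overall)   # 60.0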
configs/summarizers/mathbench_v1.py (new file)

summarizer = dict(
    dataset_abbrs=[
        '######## MathBench Accuracy ########',  # category
        ['mathbench-college-single_choice_cn', 'acc_1'],
        ['mathbench-college-single_choice_en', 'acc_1'],
        ['mathbench-high-single_choice_cn', 'acc_1'],
        ['mathbench-high-single_choice_en', 'acc_1'],
        ['mathbench-middle-single_choice_cn', 'acc_1'],
        ['mathbench-middle-single_choice_en', 'acc_1'],
        ['mathbench-primary-cloze_cn', 'accuracy'],
        ['mathbench-primary-cloze_en', 'accuracy'],
        ['mathbench-calculate-cloze_en', 'accuracy'],
        '######## MathBench CircularEval ########',  # category
        ['mathbench-college-single_choice_cn', 'perf_4'],
        ['mathbench-college-single_choice_en', 'perf_4'],
        ['mathbench-high-single_choice_cn', 'perf_4'],
        ['mathbench-high-single_choice_en', 'perf_4'],
        ['mathbench-middle-single_choice_cn', 'perf_4'],
        ['mathbench-middle-single_choice_en', 'perf_4'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
configs/summarizers/plugineval.py (new file)

from mmengine.config import read_base

with read_base():
    from .groups.plugineval import plugineval_summary_groups

summarizer = dict(
    dataset_abbrs=[
        ['plugin_eval', 'naive_average'],
        ['plugin_eval-instruct_v1', 'string_metric'],  # instruction following - string format
        ['plugin_eval-instruct_v1', 'json_metric'],  # instruction following - json format
        ['plugin_eval-plan_str_v1', 'f1_score'],  # planning - string format
        ['plugin_eval-plan_json_v1', 'f1_score'],  # planning - json format
        ['plugin_eval-reason_str_v1', 'thought'],  # reasoning - string format
        ['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],  # reasoning - json format
        ['plugin_eval-retrieve_str_v1', 'name'],  # retrieval - string format
        ['plugin_eval-reason_retrieve_understand_json_v1', 'name'],  # retrieval - json format
        ['plugin_eval-understand_str_v1', 'args'],  # understanding - string format
        ['plugin_eval-reason_retrieve_understand_json_v1', 'args'],  # understanding - json format
        ['plugin_eval-review_str_v1', 'review_quality'],  # review - string format
        ['plugin_eval_zh', 'naive_average'],
        ['plugin_eval-instruct_v1_zh', 'string_metric'],
        ['plugin_eval-instruct_v1_zh', 'json_metric'],
        ['plugin_eval-plan_str_v1_zh', 'f1_score'],
        ['plugin_eval-plan_json_v1_zh', 'f1_score'],
        ['plugin_eval-reason_str_v1_zh', 'thought'],
        ['plugin_eval-reason_retrieve_understand_json_v1_zh', 'thought'],
        ['plugin_eval-retrieve_str_v1_zh', 'name'],
        ['plugin_eval-reason_retrieve_understand_json_v1_zh', 'name'],
        ['plugin_eval-understand_str_v1_zh', 'args'],
        ['plugin_eval-reason_retrieve_understand_json_v1_zh', 'args'],
        ['plugin_eval-review_str_v1_zh', 'review_quality'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
configs/summarizers/teval.py (new file)

from mmengine.config import read_base

with read_base():
    from .groups.teval import teval_summary_groups

summarizer = dict(
    dataset_abbrs=[
        ['teval', 'naive_average'],
        ['teval-instruct_v1', 'string_metric'],  # instruction following - string format
        ['teval-instruct_v1', 'json_metric'],  # instruction following - json format
        ['teval-plan_str_v1', 'f1_score'],  # planning - string format
        ['teval-plan_json_v1', 'f1_score'],  # planning - json format
        ['teval-reason_str_v1', 'thought'],  # reasoning - string format
        ['teval-reason_retrieve_understand_json_v1', 'thought'],  # reasoning - json format
        ['teval-retrieve_str_v1', 'name'],  # retrieval - string format
        ['teval-reason_retrieve_understand_json_v1', 'name'],  # retrieval - json format
        ['teval-understand_str_v1', 'args'],  # understanding - string format
        ['teval-reason_retrieve_understand_json_v1', 'args'],  # understanding - json format
        ['teval-review_str_v1', 'review_quality'],  # review - string format
        ['teval_zh', 'naive_average'],
        ['teval-instruct_v1_zh', 'string_metric'],
        ['teval-instruct_v1_zh', 'json_metric'],
        ['teval-plan_str_v1_zh', 'f1_score'],
        ['teval-plan_json_v1_zh', 'f1_score'],
        ['teval-reason_str_v1_zh', 'thought'],
        ['teval-reason_retrieve_understand_json_v1_zh', 'thought'],
        ['teval-retrieve_str_v1_zh', 'name'],
        ['teval-reason_retrieve_understand_json_v1_zh', 'name'],
        ['teval-understand_str_v1_zh', 'args'],
        ['teval-reason_retrieve_understand_json_v1_zh', 'args'],
        ['teval-review_str_v1_zh', 'review_quality'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
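A hypothetical sketch of how a summarizer module like this one is typically pulled into a top-level evaluation config with mmengine's read_base; the dataset and model imports below are placeholders and not part of this commit:

    # Hypothetical configs/eval_teval.py (illustrative only).
    from mmengine.config import read_base

    with read_base():
        from .summarizers.teval import summarizer  # the dict defined above
        # from .datasets.<teval dataset config> import teval_datasets   # placeholder
        # from .models.<model config> import models                     # placeholder

    # datasets = [*teval_datasets]  # assembled once the placeholder imports exist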
opencompass/datasets/__init__.py

@@ -91,6 +91,7 @@ from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
+from .teval import *  # noqa: F401, F403
 from .TheoremQA import *  # noqa: F401, F403
 from .tnews import *  # noqa: F401, F403
 from .triviaqa import *  # noqa: F401, F403
opencompass/datasets/gsm8k.py

@@ -33,7 +33,7 @@ def gsm8k_dataset_postprocess(text: str) -> str:
 @TEXT_POSTPROCESSORS.register_module('gsm8k')
 def gsm8k_postprocess(text: str) -> str:
-    text = text.split('\n\n')[0]
+    text = text.split('Question:')[0]
     text = text.split(' ')[::-1]
     flag = False
     ret = ''
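One plausible motivation for the change above, inferred from the code itself: splitting on 'Question:' also truncates completions that echo the next few-shot question without an intervening blank line, which the old split on '\n\n' did not catch. A quick before/after sketch with an invented completion:

    completion = 'So the answer is 18\nQuestion: John has 3 apples and buys 2 more...'
    print(repr(completion.split('\n\n')[0]))       # old: whole string kept (no blank line present)
    print(repr(completion.split('Question:')[0]))  # new: 'So the answer is 18\n'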
opencompass/datasets/mbpp.py

@@ -263,9 +263,12 @@ class MBPPEvaluator(BaseEvaluator):
     def _process_answer(self, text):
         try:
             # for chatGLM related text
-            text = eval(text)
+            eval_text = eval(text)
         except Exception:
             pass
+        else:
+            if isinstance(eval_text, str):
+                text = eval_text
         # deal with code block
         if '```' in text:
             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
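A sketch of what the guard added above protects against: eval() on a prediction can return a non-string object (a dict, a number), and only a string result should replace the raw text. The helper below is a simplified stand-in for the method, not the evaluator itself:

    def _process_answer_sketch(text):
        # Mirrors the upstream use of eval() on model output; unsafe on untrusted
        # input in general, shown here only to illustrate the isinstance guard.
        try:
            eval_text = eval(text)
        except Exception:
            pass
        else:
            if isinstance(eval_text, str):
                text = eval_text  # only adopt the result when it is still a string
        return text

    print(_process_answer_sketch("'def add(a, b): return a + b'"))  # quoted string -> unwrapped
    print(_process_answer_sketch("{'a': 1}"))                       # dict literal -> raw text kept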
opencompass/datasets/teval/__init__.py (new file)

import json
import os.path as osp
from typing import Dict, Optional

import mmengine
from datasets import Dataset, DatasetDict

from opencompass.registry import TEXT_POSTPROCESSORS

from ..base import BaseDataset


class TEvalDataset(BaseDataset):

    def __init__(self, reader_cfg: Optional[Dict] = {}, **kwargs):
        super().__init__(reader_cfg=reader_cfg, **kwargs)

    def load(self, path: str, name: str):
        dataset = DatasetDict()
        data = mmengine.load(osp.join(path, f'{name}.json'))
        raw_data = []
        for i in data.keys():
            origin_prompt = data[i]['origin_prompt']
            if isinstance(origin_prompt, str):
                origin_prompt = json.loads(origin_prompt)
            # Aligning the default roles of opencompass
            prompt = origin_prompt + [
                dict(role='assistant',
                     content=str(data[i].get('ground_truth')))
            ]
            raw_data.append({
                'prompt': prompt,
                'ground_truth': json.dumps(data[i])
            })
        dataset['test'] = Dataset.from_list(raw_data)
        dataset['train'] = Dataset.from_list(raw_data)
        return dataset


@TEXT_POSTPROCESSORS.register_module('teval')
def teval_postprocess(text: str) -> str:
    if isinstance(text, str):
        text = text.split('<eoa>')[0]
        text = text.split('<TOKENS_UNUSED_1>')[0]
        text = text.split('<|im_end|>')[0]
        text = text.split('\nuser')[0]
        text = text.split('\nUSER')[0]
        text = text.split('[INST]')[0]
        text = text.strip()
        if text.startswith('```json'):
            text = text[len('```json'):]
        text = text.strip('`').strip()
        if text[:2] == '{{' and text[-2:] == '}}':
            text = text[1:-1]
    return str(text)
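A usage sketch for the post-processor defined above; the import path follows this file's location in the commit, and the sample completion is invented:

    from opencompass.datasets.teval import teval_postprocess

    raw = '```json\n{"name": "FinishAction", "args": {}}\n```<|im_end|>\nuser: next turn'
    print(teval_postprocess(raw))
    # The chat-stop markers and the ```json fence are stripped, leaving:
    # {"name": "FinishAction", "args": {}}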
opencompass/datasets/teval/evaluators/__init__.py (new file)

from .instruct_evaluator import InstructEvaluator
from .planning_evaluator import PlanningEvaluator
from .review_evaluator import ReviewEvaluator
from .reason_retrieve_understand_evaluator import ReasonRetrieveUnderstandEvaluator

__all__ = ['InstructEvaluator', 'PlanningEvaluator', 'ReviewEvaluator', 'ReasonRetrieveUnderstandEvaluator']
opencompass/datasets/teval/evaluators/instruct_evaluator.py (new file)

from collections import defaultdict
from mmengine import load

from ..utils.template import parse_string
from ..utils.format_load import format_load
from ..schema import ResponseDataSample

import ast
import numpy as np


class InstructEvaluator:
    """Instruct Following Evaluation

    Args:
        dataset_path(str): File path of evaluation dataset.
    """

    def __init__(
        self,
        dataset_path: str,
        **kwargs,
    ) -> None:
        self.dataset_path = dataset_path

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)

        for key in dataset.keys():
            datum = dataset[key]
            data_sample = self._process_response(datum)

            self.dataset.append(
                dict(origin_prompt=datum["origin_prompt"],
                     response_data_sample=data_sample))
        self.num_samples = len(self.dataset)

    def _process_response(
        self,
        datum: dict,
    ) -> ResponseDataSample:
        """Process the response to needed format.

        Args:
            datum(dict): inputs.

        Returns:
            dict: Processed response data sample.
        """

        # Dict with keyword-only arguments.
        template = datum['template']
        # Generated response.
        pred_data = datum['prediction']
        # Response of ground truth.
        gt_data = datum['ground_truth']
        meta_data = datum['meta_data']

        return ResponseDataSample(
            template=template, pred=pred_data, gt=gt_data, meta_data=meta_data)

    def _evaluate(self, data_sample: dict) -> dict:
        metrics_result = dict()
        response_format = data_sample.meta_data['response_format']
        if response_format == 'json':
            pred_data = self.json_format_parse(data_sample)
        else:
            pred_data = self.string_format_parse(data_sample)

        if pred_data is None:
            # directly set to 0 for all metrics
            metrics_result[f'{response_format}_format_metric'] = 0
            metrics_result[f'{response_format}_args_em_metric'] = 0
            return metrics_result

        # Exact matching
        metrics_result[f'{response_format}_format_metric'] = 1
        metrics_result[f'{response_format}_args_em_metric'] = self.compute_args_em_metric(
            gt_action=data_sample.gt['action'], pred_action=pred_data['action'],
            gt_args=data_sample.gt['args'], pred_args=pred_data['args']
        )
        return metrics_result

    def compute_args_em_metric(self, gt_action, pred_action, gt_args, pred_args):
        cnt = 0.
        if gt_action == pred_action:
            cnt += 1.
        num_args = len(gt_args) + 1  # 1 means action name match
        for gt_key in gt_args:
            pred_val = pred_args.get(gt_key, "")
            if pred_val == gt_args[gt_key]:
                cnt += 1.
        return cnt / num_args

    def string_format_parse(self, data_sample):
        pred_data = data_sample.pred
        template = data_sample.template
        thought_start = template['thought_start']
        thought_end = template['thought_end']
        action_start = template['action_start']
        action_end = template['action_end']
        args_start = template['args_start']
        args_end = template['args_end']

        parse_template = thought_start + "{thought}" + thought_end \
            + action_start + "{action}" + action_end \
            + args_start + "{args}" + args_end
        res = parse_string(parse_template, pred_data, allow_newline=True)
        try:
            if res is not None:
                args = ast.literal_eval(res['args'].strip())
                res['args'] = args if isinstance(args, dict) else {}
                res['action'] = res['action'].strip()
            return res
        except:
            return dict(thought=res['thought'], action=res['action'].strip(), args=dict())

    def json_format_parse(self, data_sample):
        try:
            pred_data = format_load(data_sample.pred)
            template = data_sample.template
            new_data = dict()
            new_data['thought'] = pred_data[template['thought']]
            new_data['action'] = pred_data[template['action']]
            args = pred_data[template['args']]
            new_data['args'] = args if isinstance(args, dict) else {}
        except Exception as e:
            return None

        return new_data

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in self.dataset:
            metrics_result = self._evaluate(data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def _post_process(self, results_list):
        # list of dict to dict of list
        results_dict = defaultdict(list)
        {results_dict[key].append(sub[key]) for sub in results_list for key in sub}
        metric_list = ['json_format_metric', 'json_args_em_metric',
                       'string_format_metric', 'string_args_em_metric']
        for metric in metric_list:
            results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4)
        return results_dict
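A worked example of the exact-match score computed by compute_args_em_metric above, restated as a standalone function for illustration (the action and argument names are made up): one point for the action name plus one per ground-truth argument reproduced exactly, normalised by len(gt_args) + 1.

    def args_em(gt_action, pred_action, gt_args, pred_args):
        cnt = float(gt_action == pred_action)          # 1 point for the action name
        for k, v in gt_args.items():
            cnt += float(pred_args.get(k, "") == v)    # 1 point per exactly matched argument
        return cnt / (len(gt_args) + 1)

    print(args_em('search_hotels', 'search_hotels',
                  {'place': 'Berlin', 'max_price': 120},
                  {'place': 'Berlin', 'max_price': 150}))
    # (1 + 1 + 0) / 3 ≈ 0.667 -- name and 'place' match, 'max_price' does not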
opencompass/datasets/teval/evaluators/planning_evaluator.py (new file)

from numpy import mean
from mmengine import load
from ..utils.format_load import format_load
import itertools
import networkx as nx
import numpy as np
import copy
import re
from tqdm import tqdm

from ..schema import ResponseDataSample
from sentence_transformers import SentenceTransformer, util


class PlanningEvaluator:
    """Planning Evaluation
    Args:
        dataset_path(str): File path of evaluation dataset
        name_weight(float): the weight of action_name in bert_score match, default = 0.9
        args_weight(float): the weight of action_args in bert_score match, default = 0.1
        match_threshold(float): the threshold of matching
        match_strategy(str): matching method, can choose 'bertscore' or 'permutation'
        bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2".
            Refer to https://www.sbert.net/docs/pretrained_models.html for more models.
    """
    def __init__(
        self,
        dataset_path: str,
        name_weight=0.75,
        args_weight=0.25,
        match_threshold=0.7,
        match_strategy: str = 'bertscore',  # ["bertscore", "permutation"]
        bert_score_model: str = "all-mpnet-base-v2",  # ['thenlper/gte-large-zh', 'all-mpnet-base-v2']
        default_prompt_type: str = 'json',  # ["json", "ReWOO"]
        **kwargs,
    ) -> None:
        self.bert_score_model = bert_score_model
        print(bert_score_model)
        self.dataset_path = dataset_path
        self.name_weight = name_weight
        self.args_weight = args_weight
        self.match_threshold = match_threshold
        self.default_prompt_type = default_prompt_type  # ["json", "ReWOO"]
        assert match_strategy in ["bertscore", "permutation"], f"match strategy must in [\"bertscore\", \"permutation\"], but get {match_strategy}"
        self.match_strategy = match_strategy
        self.valid_data_count = None
        self.sentence_model = SentenceTransformer(self.bert_score_model)

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)
        total_error = 0
        total_count = 0
        for key in dataset.keys():
            datum = dataset[key]
            data_sample, error = self._process_response(datum)
            total_error += error
            total_count += 1
            self.dataset.append(
                dict(response_data_sample=data_sample))

        self.num_samples = len(self.dataset)
        print("total_data_count:", total_count, "valid_data_count:", total_count - total_error)
        self.valid_data_count = total_count - total_error

    def format_load(self, data):
        r'''
            ensure evaluator can work correctly under any data input
        '''
        try:
            json_format = format_load(data, start_character='[', end_character=']')
        except Exception as e:
            return []
        if type(json_format) != list:
            return []
        for i in range(len(json_format)):
            try:
                json_format[i] = {
                    'name': str(json_format[i]['name']),
                    'id': int(json_format[i]['id']),
                    'args': str(json_format[i]['args'])
                }
            except Exception as e:
                return []
        return json_format

    def _process_response(
        self,
        datum,
    ) -> ResponseDataSample:
        """Process the response to needed format.
        Args:
            datum(dict): inputs.
        Returns:
            dict: Processed response data sample.
        """

        # Generated response, which can be a string or list
        pred_data = datum['prediction']
        # Response of ground truth, which can be a string or list
        gt_data = datum['ground_truth']
        # prompt_type: The type of planning prompt, supporting "json" and "ReWOO"
        if "meta" in datum:
            prompt_type = datum["meta"].get("prompt_type", self.default_prompt_type)
        else:
            prompt_type = self.default_prompt_type

        error = 0
        pred = dict()
        gt = dict()
        gt['planning'] = self.format_load(gt_data)
        if prompt_type == 'json':
            pred['planning'] = self.format_load(pred_data)
            if pred['planning'] == [] or gt['planning'] == []:
                error = 1

        elif prompt_type == 'ReWOO':
            """
                This type is deprecated
                The planning prediction data should in this format:
                    Plan 1: <str> description about the first action
                    Dependency 1: <list[number]> the first action depends on which previous actions
                    Action 1: #E1 = api_name1(args1)
                    ...
                Which will be passed only if "number of plan lines == number of dependency lines == number of action lines"
                The passed data's format is:
                    [
                        dict(
                            id = i,
                            name = curr_name,
                            args = args_str
                        )
                        ...
                    ]

                The golden answer prediction is a json that is the same as the json format.
            """
            thoughts = re.findall(r'(Plan [0-9]+: .+)', pred_data)
            dependencies = re.findall(r'(Dependency [0-9]+: .+)', pred_data)
            action_units = re.findall(r'Action [0-9]+: (.+)', pred_data)

            if not (len(thoughts) == len(dependencies) and len(thoughts) == len(action_units)):
                pred['planning'] = []
                gt['planning'] = []
                return ResponseDataSample(template='', pred=pred, gt=gt), 1

            plan_action = []
            for i in range(len(action_units)):
                dependency_list = re.findall(r'Dependency [0-9]+: (.+)', dependencies[i])
                if action_units[i][0] == '#':
                    # The action has a return #E
                    args_str_list = re.findall(r'#E[0-9]+ = .+\((.+)\)', action_units[i])
                    name_list = re.findall(r'#E[0-9]+ = (.+)\(', action_units[i])
                else:
                    # The action does not have a return
                    args_str_list = re.findall(r'.+\((.+)\)', action_units[i])
                    name_list = re.findall(r'(.+)\(', action_units[i])
                if (len(name_list) > 0):
                    curr_name = name_list[0]
                else:
                    curr_name = ""
                if (len(args_str_list) > 0):
                    args_str = "{" + args_str_list[0] + "}"
                else:
                    args_str = "{}"
                if (len(dependency_list) > 0):
                    dependency_str = dependency_list[0]
                else:
                    dependency_str = ""
                dependency = re.findall('([0-9]+)', dependency_str)
                dependency = list(set([int(x) - 1 for x in dependency]))
                plan_action.append(
                    dict(
                        id=i,
                        name=curr_name,
                        prev=dependency,
                        args=args_str
                    ))
            pred['planning'] = plan_action
            # Turn dict into args str
            for i in range(len(gt['planning'])):
                args_str = ""
                if type(gt['planning'][i]['args']) == str:
                    args_dict = eval(gt['planning'][i]['args'])
                else:
                    assert type(gt['planning'][i]['args']) == dict
                    args_dict = gt['planning'][i]['args']
                for it in args_dict:
                    if args_str == "": args_str += f"{it}=\"{args_dict[it]}\""
                    else: args_str += f", {it}=\"{args_dict[it]}\""
                gt['planning'][i]['args'] = '{' + args_str + '}'

        elif prompt_type == 'str':
            pred_data_format = pred_data.replace('. ', '\n').split('\n')
            pred_actions = []
            for pred_step in pred_data_format:
                first_occur_time = 1e9
                pred_action = ""
                for api_name in datum['meta']['API_list']:
                    occur_time = pred_step.find(api_name)
                    if occur_time != -1 and occur_time < first_occur_time:
                        first_occur_time = occur_time
                        pred_action = api_name
                if pred_action != "":
                    pred_actions.append({
                        'id': len(pred_actions),
                        'name': pred_action,
                        'args': pred_step
                    })
            pred['planning'] = pred_actions
            if len(pred['planning']) == 0:
                error = 1
        else:
            raise NotImplementedError(f"Currently, we only support json and ReWOO format, but get {prompt_type}")

        return ResponseDataSample(template='', pred=pred, gt=gt), error

    def _evaluate(self, data_sample) -> dict:
        if self.match_strategy == 'bertscore':
            metrics_result = self.bertscore_match(
                data_sample.pred['planning'], data_sample.gt['planning'])
        elif self.match_strategy == 'permutation':
            metrics_result = self.permutation_match(
                data_sample.pred['planning'], data_sample.gt['planning'])
        else:
            raise NotImplementedError
        if len(data_sample.pred['planning']) == 0 or len(data_sample.gt['planning']) == 0:
            metrics_result['parse_rate'] = 0
        else:
            metrics_result['parse_rate'] = 1
        return metrics_result

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in tqdm(self.dataset):
            metrics_result = self._evaluate(data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def permutation_match(self, pred_plan, gt_plan) -> dict:
        '''
            The function calculates all the permutation matches' score and selects the max f1_score;
            Since permutation is time consuming, we truncate the length of plans to 9
        '''
        if pred_plan[-1]['name'] != 'FinishAction':
            pred_plan.append(
                {'id': len(pred_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'}
            )

        if gt_plan[-1]['name'] != 'FinishAction':
            gt_plan.append(
                {'id': len(gt_plan), 'prev': [], 'name': 'FinishAction', 'args': r'\{\}'}
            )

        # truncate plans to 9 since it is too long for permutation.
        if len(pred_plan) > 9: pred_plan = pred_plan[:9]
        if len(gt_plan) > 9: gt_plan = pred_plan[:9]

        pred_plan = sorted(pred_plan, key=lambda x: x['id'])
        gt_plan = sorted(gt_plan, key=lambda x: x['id'])
        len_pred = len(pred_plan)
        len_gt = len(gt_plan)
        map_id_max = max(len_pred, len_gt)
        numbers = [i for i in range(map_id_max)]
        perms = itertools.permutations(numbers, len_pred)
        gt_prev_count, pred_prev_count = 0, 0
        for i in range(len_gt):
            gt_plan[i]['prev'].append(i)
            gt_prev_count += len(gt_plan[i]['prev'])
        for i in range(len_pred):
            pred_plan[i]['prev'].append(i)
            pred_prev_count += len(pred_plan[i]['prev'])
        if gt_prev_count == 0 or pred_prev_count == 0:
            return {
                'precision': 0,
                'recall': 0,
                'f1_score': 0
            }
        max_recall, max_precision, max_f1 = 0, 0, 0
        for perm in perms:
            correct_count = 0
            for i in range(len_pred):
                if perm[i] >= len_gt:
                    continue
                for j in pred_plan[i]['prev']:
                    if perm[j] in gt_plan[perm[i]]['prev']:
                        correct_count += 1
            now_recall, now_precision = correct_count / gt_prev_count, correct_count / pred_prev_count
            if now_recall + now_precision == 0:
                continue
            now_f1 = 2 * now_recall * now_precision / (now_recall + now_precision)
            if now_f1 > max_f1:
                max_f1, max_recall, max_precision = now_f1, now_recall, now_precision
        return {
            'precision': max_precision,
            'recall': max_recall,
            'f1_score': max_f1
        }

    def bertscore_match(self, pred_plan, gt_plan) -> dict:
        """
            Calculate the similarity between predicted plan and golden answer,
            A plan can be regarded a sequence of actions, and each action has a name and args.
            Firstly, use bertscore to calculate pointwise similarity by:
                similarity(u, v) = bertscore(u.name, v.name) * name_weight + bertscore(u.args, v.args) * args_weight;
            Secondly, use Hungarian matching to match the points;
            Finally, use LIS to calculate the number of matched nodes.
        """
        if len(pred_plan) == 0 or len(gt_plan) == 0:
            return {
                'precision': 0,
                'recall': 0,
                'f1_score': 0
            }

        pred_plan = copy.deepcopy(sorted(pred_plan, key=lambda x: x['id']))
        gt_plan = copy.deepcopy(sorted(gt_plan, key=lambda x: x['id']))

        # Add end action
        # Currently it is hard-code
        if pred_plan[-1]['name'] == 'FinishAction':
            pred_plan = pred_plan[:-1]
        if gt_plan[-1]['name'] == 'FinishAction':
            gt_plan = gt_plan[:-1]
        # The total counts of nodes and edges.
        len_pred = len(pred_plan)
        len_gt = len(gt_plan)

        bert_score_matrix = np.zeros((len_pred, len_gt))
        name_pred, args_pred = [], []
        name_gt, args_gt = [], []
        for i in range(len_pred):
            name_pred.append(pred_plan[i]['name'])
            args_pred.append(str(pred_plan[i]['args']))
        for i in range(len_gt):
            name_gt.append(gt_plan[i]['name'])
            args_gt.append(str(gt_plan[i]['args']))
        name_pred_emb = self.sentence_model.encode(name_pred, convert_to_tensor=True)
        name_gt_emb = self.sentence_model.encode(name_gt, convert_to_tensor=True)
        args_pred_emb = self.sentence_model.encode(args_pred, convert_to_tensor=True)
        args_gt_emb = self.sentence_model.encode(args_gt, convert_to_tensor=True)
        name_cosine_scores = np.maximum(util.cos_sim(name_pred_emb, name_gt_emb).cpu().numpy(), 0)
        args_cosine_scores = np.maximum(util.cos_sim(args_pred_emb, args_gt_emb).cpu().numpy(), 0)
        for i in range(len_pred):
            for j in range(len_gt):
                bert_score_matrix[i][j] = \
                    name_cosine_scores[i][j] * self.name_weight \
                    + args_cosine_scores[i][j] * self.args_weight
        G = nx.Graph()
        for i in range(len_pred):
            for j in range(len_gt):
                if bert_score_matrix[i][j] > self.match_threshold:
                    G.add_edge(i, str(j), weight=bert_score_matrix[i][j])
        max_weight_matching = nx.max_weight_matching(G)

        pred_to_gt_mapping = dict()
        for key in max_weight_matching:
            if type(key[0]) == int:
                pred_to_gt_mapping[int(key[0])] = int(key[1])
            else:
                pred_to_gt_mapping[int(key[1])] = int(key[0])

        # If a prediction node does not match any golden answer node, we mark the node as -1.
        for i in range(len_pred):
            if i not in pred_to_gt_mapping:
                pred_to_gt_mapping[i] = -1
        # Calculate how many nodes are matched by Longest Increasing Subsequence (LIS)
        dp = np.ones(len_pred)
        for i in range(len_pred):
            for j in range(i):
                if pred_to_gt_mapping[i] == -1 or pred_to_gt_mapping[j] == -1:
                    continue
                if pred_to_gt_mapping[i] > pred_to_gt_mapping[j]:
                    dp[i] = max(dp[i], dp[j] + 1)
        correct_count = int(max(dp))

        recall, precision = correct_count / len(gt_plan), correct_count / len(pred_plan)
        f1_score = 2 * recall * precision / (recall + precision)
        result = {'precision': precision, 'recall': recall, 'f1_score': f1_score}
        return result

    def _post_process(self, results_list):
        # list of dict to dict of list
        results = dict()
        planning_metric_keys = ["precision", "recall", "f1_score", 'parse_rate']
        for key in planning_metric_keys:
            results[key] = mean([result[key] for result in results_list])
        return results
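A standalone sketch of the last step of bertscore_match above: after Hungarian matching assigns each predicted action an index in the gold plan (-1 when unmatched), the matched count is the length of the longest increasing subsequence over that mapping, so only actions recovered in the right order are credited. The mapping below is a toy example:

    def lis_matched(pred_to_gt):
        # Mirrors the dp loop in bertscore_match: dp[i] is the length of the
        # longest in-order chain of matched actions ending at prediction i.
        dp = [1] * len(pred_to_gt)
        for i in range(len(pred_to_gt)):
            for j in range(i):
                if pred_to_gt[i] == -1 or pred_to_gt[j] == -1:
                    continue
                if pred_to_gt[i] > pred_to_gt[j]:
                    dp[i] = max(dp[i], dp[j] + 1)
        return max(dp) if dp else 0

    mapping = [0, 2, 1, -1, 3]   # toy output of the matching step
    print(lis_matched(mapping))  # 3 -> e.g. gold actions 0, 2, 3 recovered in order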
opencompass/datasets/teval/evaluators/reason_retrieve_understand_evaluator.py (new file)

import json
from numpy import mean
from mmengine import load
import numpy as np
import json
import re
from tqdm import tqdm

from ..schema import ResponseDataSample
from ..utils.format_load import format_load
from sentence_transformers import SentenceTransformer, util


def input_postprocess(text: str) -> str:
    if isinstance(text, str):
        text = text.split('<|')[0]
        text = text.split('<eoa>\n')[0]
        text = text.split('<TOKENS_UNUSED_1>\n')[0]
        text = text.split('<|im_end|>')[0]
        if len(text) > 1 and text[:2] == '{{' and text[-2:] == '}}':
            text = text[1:-1]
        while len(text) > 0 and text[-1] == '\n':
            text = text[:-1]
    return str(text)


class ReasonRetrieveUnderstandEvaluator:
    """Planning Evaluation
    Args:
        dataset_path(str): File path of evaluation dataset
        bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2".
            Refer to https://www.sbert.net/docs/pretrained_models.html for more models.
    """
    def __init__(
        self,
        dataset_path: str,
        bert_score_model: str = "all-mpnet-base-v2",  # ['thenlper/gte-large-zh', 'all-mpnet-base-v2']
        default_prompt_type: str = 'json',
        eval_type: str = 'reason',
        **kwargs,
    ) -> None:
        self.bert_score_model = bert_score_model
        print(bert_score_model)
        self.dataset_path = dataset_path
        # self.bertscore = evaluate.load('bertscore')
        self.default_prompt_type = default_prompt_type  # ["json", "str"]
        self.eval_type = eval_type
        self.valid_data_count = None
        self.sentence_model = SentenceTransformer(self.bert_score_model)

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)
        total_error = 0
        total_count = 0
        for key in dataset.keys():
            datum = dataset[key]
            data_sample, error = self._process_response(datum)
            total_error += error
            total_count += 1
            self.dataset.append(
                dict(response_data_sample=data_sample))

        self.num_samples = len(self.dataset)
        # print("total_data_count:", total_count, "valid_data_count:", total_count - total_error)
        self.valid_data_count = total_count - total_error

    def format_load(self, data):
        r'''
            ensure evaluator can work correctly under any data input
        '''
        try:
            json_format = format_load(data, start_character='{', end_character='}')
        except Exception as e:
            return {}
        if type(json_format) != dict:
            return {}
        prepared_json_format = dict()
        try:
            prepared_json_format['thought'] = str(json_format['thought'])
        except Exception as e:
            prepared_json_format['thought'] = ''
        try:
            prepared_json_format['name'] = str(json_format['name'])
        except Exception as e:
            prepared_json_format['name'] = ''

        if self.default_prompt_type == 'json':
            try:
                if isinstance(json_format['args'], dict):
                    prepared_json_format['args'] = json_format['args']
                else:
                    prepared_json_format['args'] = dict()
            except:
                prepared_json_format['args'] = dict()
        else:
            try:
                prepared_json_format['args'] = str(json_format['args'])
            except Exception as e:
                prepared_json_format['args'] = ""

        return prepared_json_format

    def _process_response(
        self,
        datum,
    ) -> ResponseDataSample:
        """Process the response to needed format.
        Args:
            datum(dict): inputs.
        Returns:
            dict: Processed response data sample.
        """

        # Generated response, which can be a string or list
        pred_data = datum['prediction']
        # Response of ground truth, which can be a string or list
        gt_data = datum['ground_truth']
        # prompt_type: The type of planning prompt, supporting "json" and "ReWOO"
        if "meta_data" in datum:
            prompt_type = datum["meta_data"].get("response_format", self.default_prompt_type)
        else:
            prompt_type = self.default_prompt_type

        error = 0
        gt = self.format_load(gt_data)
        # pred_data = input_postprocess(pred_data)

        if prompt_type == 'json':
            pred = self.format_load(pred_data)
            if pred == {} or gt == {}:
                error = 1
        elif prompt_type == 'str':
            # choose the first line
            pred = dict()
            if self.eval_type == 'reason':
                pred['thought'] = pred_data
            if self.eval_type == 'retrieve':
                pred['name'] = pred_data
            if self.eval_type == 'understand':
                pred['args'] = pred_data
        else:
            raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}")

        if error == 1:
            pred = dict()
        return ResponseDataSample(template='', pred=pred, gt=gt), error

    def _evaluate(self, data_sample):
        """Evaluate the response data sample.
        """
        # To enable batch evaluation, the evaluator is written at post_process.
        return data_sample

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in tqdm(self.dataset):
            metrics_result = self._evaluate(data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def find_a_dot_b_structure(self, text):
        # find a.b structure
        pattern = r'\w+\.\w+'
        return re.findall(pattern, text)

    def find_FinishAction(self, text):
        # find FinishAction
        pattern = r'FinishAction'
        return re.findall(pattern, text)

    def _post_process(self, results_list):
        # list of dict to dict of list
        if self.default_prompt_type == 'json':
            metric_keys = ['thought', 'name', 'args', 'parse_rate']
        if self.default_prompt_type == 'str':
            if self.eval_type == 'reason':
                metric_keys = ['thought', 'parse_rate']
            if self.eval_type == 'retrieve':
                metric_keys = ['name', 'parse_rate']
            if self.eval_type == 'understand':
                metric_keys = ['args', 'parse_rate']
        metrics_results = []
        batch_data = []; batch_arg_data = []
        batch_id = []; batch_arg_id = []
        BATCH_LIMIT = 32
        for id, data in enumerate(results_list):
            metrics_results.append(
                {metric_keys[x]: 0 for x in range(len(metric_keys))}
            )
            if len(data.pred.keys()) != 0:
                metrics_results[id]['parse_rate'] = 1
            if 'thought' in data.pred and 'thought' in data.gt:
                batch_data.extend([data.pred['thought'], data.gt['thought']])
                batch_id.extend([id])
                if len(batch_data) >= BATCH_LIMIT:
                    pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True)
                    for i in range(0, len(batch_data), 2):
                        cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i + 1]).cpu().numpy(), 0)
                        metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0]
                    batch_data = []
                    batch_id = []
            if 'name' in data.pred and 'name' in data.gt:
                if self.default_prompt_type == 'json':
                    if data.pred['name'] == data.gt['name']:
                        metrics_results[id]['name'] = 1
                    else:
                        metrics_results[id]['name'] = 0
                else:
                    if data.gt['name'] not in data.pred['name']:
                        metrics_results[id]['name'] = 0
                    else:
                        metrics_results[id]['name'] = 1
                        find_all_name = self.find_a_dot_b_structure(data.pred['name']) + self.find_FinishAction(data.pred['name'])
                        for name in find_all_name:
                            if name != data.gt['name']:
                                metrics_results[id]['name'] = 0
            if 'args' in data.pred and 'args' in data.gt:
                batch_arg_data.extend([str(data.pred['args']), str(data.gt['args'])])
                batch_arg_id.extend([id])
                if len(batch_arg_data) >= BATCH_LIMIT:
                    pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True)
                    for i in range(0, len(batch_arg_data), 2):
                        cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i + 1]).cpu().numpy(), 0)
                        metrics_results[batch_arg_id[i // 2]]['args'] = cosine_score[0, 0]
                    batch_arg_data = []
                    batch_arg_id = []

        if len(batch_data) > 0:
            pred_emb = self.sentence_model.encode(batch_data, convert_to_tensor=True)
            for i in range(0, len(batch_data), 2):
                cosine_score = np.maximum(util.cos_sim(pred_emb[i], pred_emb[i + 1]).cpu().numpy(), 0)
                metrics_results[batch_id[i // 2]]['thought'] = cosine_score[0, 0]
            batch_data = []
            batch_id = []

        if len(batch_arg_data) > 0:
            pred_emb = self.sentence_model.encode(batch_arg_data, convert_to_tensor=True)
            for i in range(0, len(batch_arg_data
),
2
):
cosine_score
=
np
.
maximum
(
util
.
cos_sim
(
pred_emb
[
i
],
pred_emb
[
i
+
1
]).
cpu
().
numpy
(),
0
)
metrics_results
[
batch_arg_id
[
i
//
2
]][
'args'
]
=
cosine_score
[
0
,
0
]
batch_arg_data
=
[]
batch_arg_id
=
[]
results
=
dict
()
for
key
in
metric_keys
:
results
[
key
]
=
mean
([
metrics_results
[
key
]
for
metrics_results
in
metrics_results
])
return
results
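The batched _post_process above interleaves prediction/ground-truth pairs, encodes them in chunks of BATCH_LIMIT, and scores each adjacent pair with cosine similarity clipped at zero. A minimal standalone sketch of that scoring pattern, assuming sentence-transformers is installed and the model weights can be downloaded (the model name and sentences are illustrative, not taken from the dataset):

import numpy as np
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-mpnet-base-v2')
# Interleaved as [pred_0, gt_0, pred_1, gt_1, ...], mirroring batch_data above.
batch = ['I should query the weather API first.',
         'Query the weather API for the city.',
         'Call FinishAction with the final summary.',
         'Book a flight to Beijing.']
emb = model.encode(batch, convert_to_tensor=True)
for i in range(0, len(batch), 2):
    # Negative similarities are clipped to 0, as np.maximum(..., 0) does above.
    score = np.maximum(util.cos_sim(emb[i], emb[i + 1]).cpu().numpy(), 0)
    print(i // 2, float(score[0, 0]))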
class ReasonRetrieveUnderstandEvaluatorNoBatch:
    """Planning Evaluation

    Args:
        dataset_path(str): File path of evaluation dataset
        bert_score_model(str): the bert_score model for sentence similarity, default = "all-mpnet-base-v2".
            Refer to https://www.sbert.net/docs/pretrained_models.html for more models.
    """
    def __init__(
        self,
        dataset_path: str,
        bert_score_model: str = "all-mpnet-base-v2",
        default_prompt_type: str = 'json',
        eval_type: str = 'reason',
    ) -> None:
        self.bert_score_model = bert_score_model
        self.dataset_path = dataset_path
        # self.bertscore = evaluate.load('bertscore')
        self.default_prompt_type = default_prompt_type  # ["json", "str"]
        self.eval_type = eval_type
        self.valid_data_count = None
        self.sentence_model = SentenceTransformer(self.bert_score_model)

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)
        total_error = 0
        total_count = 0
        for key in dataset.keys():
            datum = dataset[key]
            data_sample, error = self._process_response(datum)
            total_error += error
            total_count += 1
            self.dataset.append(dict(response_data_sample=data_sample))
        self.num_samples = len(self.dataset)
        # print("total_data_count:", total_count, "valid_data_count:", total_count - total_error)
        self.valid_data_count = total_count - total_error

    def format_load(self, data):
        r'''
            ensure evaluator can work correctly under any data input
        '''
        if type(data) == dict:
            json_format = data
        else:
            try:
                json_format = json.loads(data)  # json.loads(pred_data)
            except Exception as e:
                return {}
        if type(json_format) != dict:
            return {}
        prepared_json_format = dict()
        try:
            prepared_json_format['thought'] = str(json_format['thought'])
        except Exception as e:
            prepared_json_format['thought'] = ''
        try:
            prepared_json_format['name'] = str(json_format['name'])
        except Exception as e:
            prepared_json_format['name'] = ''
        try:
            if prepared_json_format["name"] != "FinishAction":
                arg_inputs = json_format["args"]
                if type(arg_inputs) == str:
                    arg_inputs = json.loads(arg_inputs)
                if type(arg_inputs) == dict:
                    prepared_json_format['args'] = arg_inputs
                else:
                    prepared_json_format["args"] = {}
            else:
                prepared_json_format["args"] = {}
        except Exception as e:
            prepared_json_format['args'] = {}
        return prepared_json_format

    def _process_response(
        self,
        datum,
    ) -> ResponseDataSample:
        """Process the response to needed format.

        Args:
            datum(dict): inputs.

        Returns:
            dict: Processed response data sample.
        """
        # Generated response, which can be a string or list
        pred_data = datum['prediction']
        # Response of ground truth, which can be a string or list
        gt_data = datum['ground_truth']
        # prompt_type: The type of planning prompt, supporting "json" and "ReWOO"
        if "meta" in datum:
            prompt_type = datum["meta"].get("response_format", self.default_prompt_type)
        else:
            prompt_type = self.default_prompt_type

        error = 0
        gt = self.format_load(gt_data)
        # pred_data = input_postprocess(pred_data)
        if prompt_type == 'json':
            # pred_data = pred_data.replace('\'', '\"')
            pred = self.format_load(pred_data)
            if pred == {} or gt == {}:
                error = 1
        elif prompt_type == 'str':
            # choose the first line
            pred = dict()
            if self.eval_type == 'reason':
                pred['thought'] = pred_data
            if self.eval_type == 'retrieve':
                pred['name'] = pred_data
            if self.eval_type == 'understand':
                # pred_data = pred_data.replace('\'', '\"')
                # try:
                #     pred['args'] = json.loads(pred_data)
                #     if type(pred['args']) != dict:
                #         pred['args'] = {}
                # except Exception as e:
                #     error = 1
                pred['args'] = pred_data
        else:
            raise NotImplementedError(f"Currently, we only support json and str format, but get {prompt_type}")

        if error == 1:
            pred = dict()
        return ResponseDataSample(template='', pred=pred, gt=gt), error

    def _evaluate(self, data_sample) -> dict:
        """Evaluate the response data sample."""
        metrics_result = {
            'thought': 0,
            'name': 0,
            'args_precision': 0,
            'args_recall': 0,
            'args_f1_score': 0,
            'parse_rate': 0,
        }
        if 'thought' in data_sample.pred and 'thought' in data_sample.gt:
            pred_emb = self.sentence_model.encode(data_sample.pred['thought'], convert_to_tensor=True)
            gt_emb = self.sentence_model.encode(data_sample.gt['thought'], convert_to_tensor=True)
            cosine_scores = np.maximum(util.cos_sim(pred_emb, gt_emb).cpu().numpy(), 0)
            metrics_result['thought'] = cosine_scores[0, 0]

        if 'name' in data_sample.pred and 'name' in data_sample.gt:
            if data_sample.pred['name'] == data_sample.gt['name']:
                metrics_result['name'] = 1
            else:
                metrics_result['name'] = 0
        if 'args' in data_sample.pred and 'args' in data_sample.gt:
            gt_num_keys = len(data_sample.gt['args'].keys())
            pred_num_keys = len(data_sample.pred['args'].keys())
            if pred_num_keys == 0 and gt_num_keys == 0:
                metrics_result['args_precision'] = 1
                metrics_result['args_recall'] = 1
                metrics_result['args_f1_score'] = 1
            elif pred_num_keys == 0 or gt_num_keys == 0:
                metrics_result['args_precision'] = 0
                metrics_result['args_recall'] = 0
                metrics_result['args_f1_score'] = 0
            else:
                correct_count = 0
                for key in data_sample.gt['args'].keys():
                    if key in data_sample.pred['args'] and str(data_sample.pred['args'][key]) == str(data_sample.gt['args'][key]):
                        correct_count += 1
                metrics_result['args_precision'] = correct_count / pred_num_keys
                metrics_result['args_recall'] = correct_count / gt_num_keys
                if metrics_result['args_precision'] + metrics_result['args_recall'] == 0:
                    metrics_result['args_f1_score'] = 0
                else:
                    metrics_result['args_f1_score'] = 2 * metrics_result['args_precision'] * metrics_result['args_recall'] / \
                        (metrics_result['args_precision'] + metrics_result['args_recall'])

        if len(data_sample.pred.keys()) == 0:
            metrics_result['parse_rate'] = 0
        else:
            metrics_result['parse_rate'] = 1
        return metrics_result

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in tqdm(self.dataset):
            metrics_result = self._evaluate(data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def _post_process(self, results_list):
        # list of dict to dict of list
        results = dict()
        if self.default_prompt_type == 'json':
            metric_keys = ['thought', 'name', 'args_precision', 'args_recall', 'args_f1_score', 'parse_rate']
        if self.default_prompt_type == 'str':
            if self.eval_type == 'reason':
                metric_keys = ['thought', 'parse_rate']
            if self.eval_type == 'retrieve':
                metric_keys = ['name', 'parse_rate']
            if self.eval_type == 'understand':
                metric_keys = ['args_precision', 'args_recall', 'args_f1_score', 'parse_rate']
        for key in metric_keys:
            results[key] = mean([result[key] for result in results_list])
        return results
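Both evaluators load a JSON file keyed by sample id, where each entry carries a prediction, a ground_truth and, optionally, a meta / meta_data dict with the response_format. A minimal usage sketch under those assumptions (the file name, tool name and texts are illustrative; real cached files come from the t-eval pipeline, and sentence-transformers plus mmengine must be installed):

import json

demo = {
    '0': {
        'prediction': json.dumps({
            'thought': 'I should query the weather API for Beijing.',
            'name': 'WeatherAPI.query',
            'args': {'city': 'Beijing'},
        }),
        'ground_truth': {
            'thought': 'Query the weather API for the requested city.',
            'name': 'WeatherAPI.query',
            'args': {'city': 'Beijing'},
        },
        'meta': {'response_format': 'json'},
    }
}
with open('demo_rru.json', 'w') as f:
    json.dump(demo, f)

evaluator = ReasonRetrieveUnderstandEvaluatorNoBatch('demo_rru.json',
                                                     default_prompt_type='json')
print(evaluator.evaluate())

With a matching prediction as above, name, args_precision, args_recall, args_f1_score and parse_rate should all come out at 1.0, while thought is the clipped cosine similarity of the two thought strings.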
opencompass/datasets/teval/evaluators/review_evaluator.py
0 → 100644
View file @ d34ba111
from collections import defaultdict
from mmengine import load

from ..schema import ResponseDataSample
import numpy as np
from ..utils.format_load import format_load


class ReviewEvaluator:
    """Review Capability Evaluation

    Args:
        dataset_path(str): File path of evaluation dataset.
    """

    def __init__(
        self,
        dataset_path: str,
        # bert_score_model: str = "all-mpnet-base-v2",
        **kwargs,
    ) -> None:
        self.dataset_path = dataset_path
        # self.bert_score_model = bert_score_model
        # self.sentence_model = SentenceTransformer(self.bert_score_model)

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)

        for key in dataset.keys():
            datum = dataset[key]
            data_sample = self._process_response(datum)

            self.dataset.append(
                dict(
                    origin_prompt=datum['origin_prompt'],
                    response_data_sample=data_sample))
        self.num_samples = len(self.dataset)

    def _process_response(
        self,
        datum: dict,
    ) -> ResponseDataSample:
        """Process the response to needed format.

        Args:
            datum(dict): inputs.

        Returns:
            dict: Processed response data sample.
        """
        template = datum['template']
        pred_data = datum['prediction']
        gt_data = datum['ground_truth']['answer']
        meta_data = datum['meta_data']

        if meta_data['response_format'] == 'json':
            pred_data = self.json_format_parse(pred_data)
        else:
            pred_data = pred_data[pred_data.find(":") + 1:]
            pred_data = pred_data.strip()
            if len(pred_data) > 0 and pred_data[0] in ['A', 'B', 'C', 'D', 'E']:
                pred_data = pred_data[0]
            else:
                pred_data = None

        return ResponseDataSample(
            template=template, pred=pred_data, gt=gt_data, meta_data=meta_data)

    def _evaluate(self, data_sample) -> dict:
        metrics_result = dict(
            parse_rate=0,
            review_quality=0,
        )

        pred_data = data_sample.pred
        if pred_data is not None:
            # import pdb; pdb.set_trace()
            metrics_result['review_quality'] = 1.0 if pred_data == \
                data_sample.gt else 0.0
            metrics_result['parse_rate'] = 1.0
        return metrics_result

    # def compute_sen_similarity(self, gt, pred):
    #     gt_embed = self.sentence_model.encode(gt, convert_to_tensor=True)
    #     pred_embed = self.sentence_model.encode(pred, convert_to_tensor=True)
    #     sen_sim = max(0, util.cos_sim(gt_embed, pred_embed).item())
    #     return sen_sim

    def json_format_parse(self, pred_data):
        try:
            data = format_load(pred_data)
        except Exception as e:
            return None
        try:
            new_data = dict()
            new_data['review'] = data['is_finished']
            assert new_data['review'] in [True, False]
        except Exception as e:
            return None

        return new_data

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in self.dataset:
            metrics_result = self._evaluate(data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def _post_process(self, results_list):
        # list of dict to dict of list
        results_dict = defaultdict(list)
        {
            results_dict[key].append(sub[key])
            for sub in results_list for key in sub
        }
        metric_list = ['parse_rate', 'review_quality']
        for metric in metric_list:
            results_dict[metric] = np.round(np.mean(results_dict[metric]), decimals=4)
        return results_dict
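ReviewEvaluator expects each entry to provide origin_prompt, template, prediction, a ground_truth dict holding the answer, and meta_data with the response_format. A minimal sketch for the 'str' format, with illustrative values (real files are produced by the t-eval pipeline):

import json

demo = {
    '0': {
        'origin_prompt': '...',
        'template': '',
        'prediction': 'Answer: A',
        'ground_truth': {'answer': 'A'},
        'meta_data': {'response_format': 'str'},
    }
}
with open('demo_review.json', 'w') as f:
    json.dump(demo, f)

evaluator = ReviewEvaluator('demo_review.json')
print(evaluator.evaluate())  # parse_rate and review_quality should both be 1.0

For the 'json' format, the prediction is instead parsed with format_load and must contain an is_finished boolean, which becomes the 'review' field compared against the ground truth.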
opencompass/datasets/teval/schema.py
0 → 100644
View file @ d34ba111
from dataclasses import asdict, dataclass, field
from typing import Any, Dict


@dataclass
class ResponseDataSample:
    """
    Args:
        template(str): Format string with keyword-only arguments. For
            example '{who} like {what}'
        pred(Any): Parsed data from LLM generating response.
        gt(Any): Ground truth data
        meta_data(dict, optional): Meta information will be used to evaluate
            LLM's response
    """

    template: str
    pred: Any
    gt: Any
    meta_data: dict = None
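A small sketch of how the evaluators above fill this dataclass (field values are illustrative):

sample = ResponseDataSample(
    template='',
    pred={'thought': 'Query the weather API.', 'name': 'WeatherAPI.query',
          'args': {'city': 'Beijing'}},
    gt={'thought': 'Call the weather API for the city.', 'name': 'WeatherAPI.query',
        'args': {'city': 'Beijing'}},
    meta_data={'response_format': 'json'},
)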