OpenDAS / opencompass · Commit b39f5015

[Sync] update taco (#1030)

Unverified commit b39f5015, authored Apr 09, 2024 by Fengzhe Zhou, committed by GitHub on Apr 09, 2024. Parent: 16f29b25.

Changes: 87 files in the full commit; this page shows 20 changed files with 755 additions and 39 deletions (+755 −39).
Changed files shown on this page:

configs/summarizers/compass_knowledge.py (+0 −1)
configs/summarizers/compassbench_v1_objective.py (+227 −0)
configs/summarizers/groups/lcbench.py (+3 −0)
configs/summarizers/groups/mathbench_v1.py (+13 −0)
configs/summarizers/groups/plugineval.py (+34 −0)
configs/summarizers/mathbench_v1.py (+13 −4)
opencompass/cli/main.py (+364 −0)
opencompass/datasets/NPHardEval/cmp_GCP_D.py (+5 −1)
opencompass/datasets/NPHardEval/cmp_TSP_D.py (+5 −1)
opencompass/datasets/NPHardEval/p_SPP.py (+5 −1)
opencompass/datasets/__init__.py (+2 −0)
opencompass/datasets/apps.py (+25 −12)
opencompass/datasets/custom.py (+10 −0)
opencompass/datasets/humanevalx.py (+4 −1)
opencompass/datasets/lawbench/evaluation_functions/ljp_article.py (+1 −1)
opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py (+3 −1)
opencompass/datasets/math.py (+19 −6)
opencompass/datasets/mathbench.py (+1 −1)
opencompass/datasets/mbpp.py (+1 −0)
opencompass/datasets/taco.py (+20 −9)
configs/summarizers/compass_knowledge.py  (+0 −1; view file @ b39f5015)

@@ -12,7 +12,6 @@ compassbench_v1_knowledge_groups = [
     {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
 ]
-'compassbench_v1_knowledge-mixed-cloze_en'
 summarizer = dict(
     dataset_abbrs=[
         'knowledge_perf_4_and_cloze',
configs/summarizers/compassbench_v1_objective.py  (new file, mode 100644, +227 −0; view file @ b39f5015)

from mmengine.config import read_base

with read_base():
    from .groups.cibench import cibench_summary_groups
    from .groups.plugineval import plugineval_summary_groups

compassbench_v1_language_names = [
    # ['information_retrieval_en', 'score'],
    # ['information_retrieval_zh', 'score'],
    ['intention_recognition_en_circular', 'acc_origin'],
    ['intention_recognition_en_circular', 'perf_circular'],
    ['intention_recognition_zh_circular', 'acc_origin'],
    ['intention_recognition_zh_circular', 'perf_circular'],
    ['sentiment_analysis_en_circular', 'acc_origin'],
    ['sentiment_analysis_en_circular', 'perf_circular'],
    ['sentiment_analysis_zh_circular', 'acc_origin'],
    ['sentiment_analysis_zh_circular', 'perf_circular'],
    ['translation', 'score'],
    ['content_critic_en_circular', 'acc_origin'],
    ['content_critic_en_circular', 'perf_circular'],
    ['content_critic_zh_circular', 'acc_origin'],
    ['content_critic_zh_circular', 'perf_circular'],
    ['content_summarization_en', 'rouge1'],
    ['content_summarization_zh', 'rouge1'],
    ['traditional_cultural_understanding_zh_circular', 'acc_origin'],
    ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
    ['chinese_semantic_understanding_zh_circular', 'acc_origin'],
    ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]

compassbench_v1_language_summary_groups = [
    {'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']},
    {'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']},
    {'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
    {'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']},
    {'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']},
    {'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]

# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
compassbench_v1_knowledge_names = [
    'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
    'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
    'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
    'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]

compassbench_v1_knowledge_summary_groups = [
    {'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
    {'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
    {'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]

compassbench_v1_reason_summary_groups = [
    {'name': 'reasonbench_cn_abductive_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular']},
    {'name': 'reasonbench_en_abductive_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular']},
    {'name': 'reasonbench_cn_deductive_circular', 'subsets': ['reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular']},
    {'name': 'reasonbench_cn_inductive_circular', 'subsets': ['reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
    {'name': 'reasonbench_en_inductive_circular', 'subsets': ['reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
    {'name': 'reasonbench_cn_circular', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_abductive_circular', 'reasonbench_cn_deductive_circular', 'reasonbench_cn_inductive_circular']},
    {'name': 'reasonbench_en_circular', 'subsets': ['reasonbench_en_commonsense_circular', 'reasonbench_en_abductive_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_inductive_circular']},
    {'name': 'reasonbench', 'subsets': ['reasonbench_cn_circular', 'reasonbench_en_circular']},
]

compassbench_v1_math_summary_groups = [
    {'name': 'math_acc_1_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-high-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
        ['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_perf_4_and_fill_in_blank', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
    {'name': 'math_perf_4_and_fill_in_blank_cn', 'subsets': [
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
    ]},
    {'name': 'math_perf_4_and_fill_in_blank_en', 'subsets': [
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
    ]},
]

code_passk_summary_groups = [
    # rename
    {'name': 'humaneval_pass@1(greedy)', 'subsets': [['openai_humaneval', 'humaneval_pass@1']]},
    {'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_passk', 'humaneval_pass@10']]},
    {'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_repeat10', 'humaneval_pass@10']]},
    {'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]},
    {'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_passk', 'humaneval_pass@10']]},
    {'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_repeat10', 'humaneval_pass@10']]},
    {'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]},
    {'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_passk', 'humaneval_plus_pass@10']]},
    {'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_repeat10', 'humaneval_plus_pass@10']]},
    {'name': 'mbpp_pass@1(greedy)', 'subsets': [['mbpp', 'score']]},
    {'name': 'mbpp_pass@10', 'subsets': [['mbpp_passk', 'pass@10']]},
    {'name': 'mbpp_pass@10', 'subsets': [['mbpp_repeat10', 'pass@10']]},
    {'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]},
    {'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_passk', 'pass@10']]},
    {'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_repeat10', 'pass@10']]},
    {'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]},
    {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_passk', 'pass@10']]},
    {'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
    # real add
    {'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
    {'name': 'code_cn', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)']},
    {'name': 'code_en', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
    {'name': 'code', 'subsets': ['humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humanevalx']},
]

agent_summary_groups = [
    # dict(name='cibench_template', subsets=['cibench_template:executable', 'cibench_template:numeric_correct', 'cibench_template:text_score', 'cibench_template:vis_sim']),
    # dict(name='cibench_template_cn', subsets=['cibench_template_cn:executable', 'cibench_template_cn:numeric_correct', 'cibench_template_cn:text_score', 'cibench_template_cn:vis_sim']),
    dict(name='cibench_template', subsets=['cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim']),
    dict(name='cibench_template_cn', subsets=['cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim']),
    dict(name='agent_cn', subsets=['cibench_template_cn', 'plugin_eval-mus-p10_one_review_zh']),
    dict(name='agent_en', subsets=['cibench_template', 'plugin_eval-mus-p10_one_review']),
    dict(name='agent', subsets=['agent_cn', 'agent_en']),
]

other_summary_groups = [
    {
        "name": "average_cn",
        "subsets": [
            ["language_zh_perf_4_and_non_mcq", "naive_average"],
            ["knowledge_cn", "perf_4"],
            ["reasonbench_cn_circular", "perf_circular"],
            ["math_perf_4_and_fill_in_blank_cn", "naive_average"],
            ["code_cn", "naive_average"],
            ["agent_cn", "naive_average"],
        ],
    },
    {
        "name": "average_en",
        "subsets": [
            ["language_en_perf_4_and_non_mcq", "naive_average"],
            ["compassbench_v1_knowledge-mixed-cloze_en", "score"],
            ["reasonbench_en_circular", "perf_circular"],
            ["math_perf_4_and_fill_in_blank_en", "naive_average"],
            ["code_en", "naive_average"],
            ["agent_en", "naive_average"],
        ],
    },
    {
        "name": "average",
        "subsets": [
            ["language_perf_4_and_non_mcq", "naive_average"],
            ["knowledge_perf_4_and_cloze", "naive_average"],
            ["reasonbench", "perf_circular"],
            ["math_perf_4_and_fill_in_blank", "naive_average"],
            ["code", "naive_average"],
            ["agent", "naive_average"],
        ],
    },
]

summarizer = dict(
    dataset_abbrs=[
        ['average', 'naive_average'],
        ['average_cn', 'naive_average'],
        ['average_en', 'naive_average'],
        '',
        '',
        '',
        ['language_perf_4_and_non_mcq', 'naive_average'],
        ['language_zh_perf_4_and_non_mcq', 'naive_average'],
        ['language_en_perf_4_and_non_mcq', 'naive_average'],
        ['intention_recognition_zh_circular', 'perf_circular'],
        ['intention_recognition_en_circular', 'perf_circular'],
        ['sentiment_analysis_zh_circular', 'perf_circular'],
        ['sentiment_analysis_en_circular', 'perf_circular'],
        ['translation', 'score'],
        ['content_critic_zh_circular', 'perf_circular'],
        ['content_critic_en_circular', 'perf_circular'],
        ['content_summarization_zh', 'rouge1'],
        ['content_summarization_en', 'rouge1'],
        ['traditional_cultural_understanding_zh_circular', 'perf_circular'],
        ['chinese_semantic_understanding_zh_circular', 'perf_circular'],
        ['knowledge_perf_4_and_cloze', 'naive_average'],
        ['knowledge_cn', 'perf_4'],
        ['compassbench_v1_knowledge-mixed-cloze_en', 'score'],
        ['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
        ['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
        ['reasonbench', 'perf_circular'],
        ['reasonbench_cn_circular', 'perf_circular'],
        ['reasonbench_en_circular', 'perf_circular'],
        ['reasonbench_cn_commonsense_circular', 'perf_circular'],
        ['reasonbench_cn_abductive_circular', 'perf_circular'],
        ['reasonbench_cn_deductive_circular', 'perf_circular'],
        ['reasonbench_cn_inductive_circular', 'perf_circular'],
        ['reasonbench_en_commonsense_circular', 'perf_circular'],
        ['reasonbench_en_abductive_circular', 'perf_circular'],
        ['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
        ['reasonbench_en_inductive_circular', 'perf_circular'],
        ['math_perf_4_and_fill_in_blank', 'naive_average'],
        ['math_perf_4_and_fill_in_blank_cn', 'naive_average'],
        ['math_perf_4_and_fill_in_blank_en', 'naive_average'],
        ['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-high-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
        ['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
        ['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
        ['compassbench_v1_math-primary-cloze_en', 'accuracy'],
        ['code', 'naive_average'],
        ['code_cn', 'naive_average'],
        ['code_en', 'naive_average'],
        ['humaneval_cn_pass@1(greedy)', 'naive_average'],
        ['humaneval_plus_pass@1(greedy)', 'naive_average'],
        ['mbpp_cn_pass@1(greedy)', 'naive_average'],
        ['sanitized_mbpp_pass@1(greedy)', 'naive_average'],
        ['humanevalx', 'naive_average'],
        ['agent', 'naive_average'],
        ['agent_cn', 'naive_average'],
        ['agent_en', 'naive_average'],
        ['cibench_template_cn', 'naive_average'],
        ['cibench_template', 'naive_average'],
        ['plugin_eval-mus-p10_one_review_zh', 'naive_average'],
        ['plugin_eval-mus-p10_one_review', 'naive_average'],
    ],
    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
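The summary_groups line at the end gathers every list defined above whose name ends in _summary_groups (including the imported cibench and plugineval groups) into one flat list by summing them with an empty-list start value. A tiny standalone illustration of the same locals()-scanning idiom; the foo/bar names and sample entries below are made up, not part of the commit:

# Illustrative only: how the `sum(..., [])` idiom flattens every
# *_summary_groups list that happens to be defined in the current namespace.
foo_summary_groups = [{'name': 'foo', 'subsets': ['a', 'b']}]
bar_summary_groups = [{'name': 'bar', 'subsets': ['c']}]
unrelated = ['ignored']  # does not match the suffix, so it is skipped

collected = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print(collected)  # [{'name': 'foo', ...}, {'name': 'bar', ...}] in definition order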
configs/summarizers/groups/lcbench.py  (new file, mode 100644, +3 −0; view file @ b39f5015)

lcbench_summary_groups = [
    {'name': 'lcbench', 'subsets': ['lcbench_en', 'lcbench_cn']},
]
configs/summarizers/groups/mathbench_v1.py  (new file, mode 100644, +13 −0; view file @ b39f5015)

mathbench_v1_summary_groups = [
    {'name': 'mathbench-college_application', 'subsets': ['mathbench-college-single_choice_cn', 'mathbench-college-single_choice_en']},
    {'name': 'mathbench-high_application', 'subsets': ['mathbench-high-single_choice_cn', 'mathbench-high-single_choice_en']},
    {'name': 'mathbench-middle_application', 'subsets': ['mathbench-middle-single_choice_cn', 'mathbench-middle-single_choice_en']},
    {'name': 'mathbench-primary_application', 'subsets': ['mathbench-primary-cloze_cn', 'mathbench-primary-cloze_en', 'mathbench-calculate-cloze_en'], 'weights': {'mathbench-primary-cloze_cn': 1, 'mathbench-primary-cloze_en': 1, 'mathbench-calculate-cloze_en': 2}},
    {'name': 'mathbench-college_knowledge', 'subsets': ['mathbench-college_knowledge-single_choice_cn', 'mathbench-college_knowledge-single_choice_en']},
    {'name': 'mathbench-high_knowledge', 'subsets': ['mathbench-high_knowledge-single_choice_cn', 'mathbench-high_knowledge-single_choice_en']},
    {'name': 'mathbench-middle_knowledge', 'subsets': ['mathbench-middle_knowledge-single_choice_cn', 'mathbench-middle_knowledge-single_choice_en']},
    {'name': 'mathbench-primary_knowledge', 'subsets': ['mathbench-primary_knowledge-single_choice_cn', 'mathbench-primary_knowledge-single_choice_en']},
    {'name': 'mathbench_application', 'subsets': ['mathbench-college_application', 'mathbench-high_application', 'mathbench-middle_application', 'mathbench-primary_application']},
    {'name': 'mathbench_knowledge', 'subsets': ['mathbench-college_knowledge', 'mathbench-high_knowledge', 'mathbench-middle_knowledge', 'mathbench-primary_knowledge']},
    {'name': 'mathbench', 'subsets': ['mathbench_application', 'mathbench_knowledge']},
]
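The 'weights' key on the mathbench-primary_application group asks the summarizer for a weighted rather than plain average over its subsets. The sketch below is standalone arithmetic to show what that weighting amounts to; the per-subset scores are invented and this is not the OpenCompass summarizer code:

# Illustrative only: weighted aggregation as implied by the 'weights' field above.
scores = {
    'mathbench-primary-cloze_cn': 70.0,   # hypothetical per-subset scores
    'mathbench-primary-cloze_en': 80.0,
    'mathbench-calculate-cloze_en': 90.0,
}
weights = {
    'mathbench-primary-cloze_cn': 1,
    'mathbench-primary-cloze_en': 1,
    'mathbench-calculate-cloze_en': 2,
}
weighted = sum(scores[k] * weights[k] for k in scores) / sum(weights.values())
print(weighted)  # (70*1 + 80*1 + 90*2) / 4 = 82.5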
configs/summarizers/groups/plugineval.py  (+34 −0; view file @ b39f5015)

@@ -71,6 +71,40 @@ _base_summary_groups = [
             ['plugin_eval-review_str_v1', 'review_quality'],
         ]
     },
+    # special treatment for first 10% data points
+    {
+        'name': 'plugin_eval-p10-instruct_v1',
+        'metric': 'format_metric',
+        'subsets': [
+            ['plugin_eval-p10-instruct_v1', 'string_format_metric'],
+            ['plugin_eval-p10-instruct_v1', 'json_format_metric'],
+        ]
+    },
+    {
+        'name': 'plugin_eval-p10-instruct_v1',
+        'metric': 'args_em_metric',
+        'subsets': [
+            ['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
+            ['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
+        ]
+    },
+    {
+        'name': 'plugin_eval-p10',
+        'subsets': [
+            ['plugin_eval-p10-instruct_v1', 'format_metric'],
+            ['plugin_eval-p10-instruct_v1', 'args_em_metric'],
+            ['plugin_eval-p10-plan_str_v1', 'f1_score'],
+            ['plugin_eval-p10-plan_json_v1', 'f1_score'],
+            ['plugin_eval-p10-reason_str_v2', 'thought'],
+            ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
+            ['plugin_eval-p10-retrieve_str_v2', 'name'],
+            ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
+            ['plugin_eval-p10-understand_str_v2', 'args'],
+            ['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
+            ['plugin_eval-p10-review_str_v6', 'review_quality'],
+        ]
+    },
 ]

 plugineval_summary_groups = []
configs/summarizers/mathbench_v1.py  (+13 −4; view file @ b39f5015)

 summarizer = dict(
     dataset_abbrs=[
-        '######## MathBench Accuracy ########',  # category
+        '######## MathBench Application Accuracy ########',  # category
         ['mathbench-college-single_choice_cn', 'acc_1'],
         ['mathbench-college-single_choice_en', 'acc_1'],
         ['mathbench-high-single_choice_cn', 'acc_1'],

@@ -9,15 +9,15 @@ summarizer = dict(
         ['mathbench-middle-single_choice_en', 'acc_1'],
         ['mathbench-primary-cloze_cn', 'accuracy'],
         ['mathbench-primary-cloze_en', 'accuracy'],
-        ['mathbench-calculate-cloze_en', 'accuracy'],
-        '######## MathBench CircularEval ########',  # category
+        ['mathbench-arithmetic-cloze_en', 'accuracy'],
+        '######## MathBench Application CircularEval ########',  # category
         ['mathbench-college-single_choice_cn', 'perf_4'],
         ['mathbench-college-single_choice_en', 'perf_4'],
         ['mathbench-high-single_choice_cn', 'perf_4'],
         ['mathbench-high-single_choice_en', 'perf_4'],
         ['mathbench-middle-single_choice_cn', 'perf_4'],
         ['mathbench-middle-single_choice_en', 'perf_4'],
-        '######## MathBench Knowledge ########',  # category
+        '######## MathBench Knowledge CircularEval ########',  # category
         ['mathbench-college_knowledge-single_choice_cn', 'perf_4'],
         ['mathbench-college_knowledge-single_choice_en', 'perf_4'],
         ['mathbench-high_knowledge-single_choice_cn', 'perf_4'],

@@ -26,6 +26,15 @@ summarizer = dict(
         ['mathbench-middle_knowledge-single_choice_en', 'perf_4'],
         ['mathbench-primary_knowledge-single_choice_cn', 'perf_4'],
         ['mathbench-primary_knowledge-single_choice_en', 'perf_4'],
+        '######## MathBench Knowledge Accuracy ########',  # category
+        ['mathbench-college_knowledge-single_choice_cn', 'acc_1'],
+        ['mathbench-college_knowledge-single_choice_en', 'acc_1'],
+        ['mathbench-high_knowledge-single_choice_cn', 'acc_1'],
+        ['mathbench-high_knowledge-single_choice_en', 'acc_1'],
+        ['mathbench-middle_knowledge-single_choice_cn', 'acc_1'],
+        ['mathbench-middle_knowledge-single_choice_en', 'acc_1'],
+        ['mathbench-primary_knowledge-single_choice_cn', 'acc_1'],
+        ['mathbench-primary_knowledge-single_choice_en', 'acc_1'],
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
opencompass/cli/main.py  (new file, mode 100644, +364 −0; view file @ b39f5015)

import argparse
import getpass
import os
import os.path as osp
from datetime import datetime

from mmengine.config import Config, DictAction

from opencompass.partitioners import MultimodalNaivePartitioner
from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg
from opencompass.runners import SlurmRunner
from opencompass.summarizers import DefaultSummarizer
from opencompass.utils import LarkReporter, get_logger
from opencompass.utils.run import (exec_mm_infer_runner, fill_eval_cfg,
                                   fill_infer_cfg, get_config_from_arg)


def parse_args():
    parser = argparse.ArgumentParser(description='Run an evaluation task')
    parser.add_argument('config', nargs='?', help='Train config file path')

    # add mutually exclusive args `--slurm` `--dlc`, defaults to local runner
    # if "infer" or "eval" not specified
    launch_method = parser.add_mutually_exclusive_group()
    launch_method.add_argument('--slurm',
                               action='store_true',
                               default=False,
                               help='Whether to force tasks to run with srun. '
                               'If True, `--partition(-p)` must be set. '
                               'Defaults to False')
    launch_method.add_argument('--dlc',
                               action='store_true',
                               default=False,
                               help='Whether to force tasks to run on dlc. If '
                               'True, `--aliyun-cfg` must be set. Defaults'
                               ' to False')
    # multi-modal support
    parser.add_argument('--mm-eval',
                        help='Whether or not enable multimodal evaluation',
                        action='store_true',
                        default=False)
    # Add shortcut parameters (models, datasets and summarizer)
    parser.add_argument('--models', nargs='+', help='', default=None)
    parser.add_argument('--datasets', nargs='+', help='', default=None)
    parser.add_argument('--summarizer', help='', default=None)
    # add general args
    parser.add_argument('--debug',
                        help='Debug mode, in which scheduler will run tasks '
                        'in the single process, and output will not be '
                        'redirected to files',
                        action='store_true',
                        default=False)
    parser.add_argument('--dry-run',
                        help='Dry run mode, in which the scheduler will not '
                        'actually run the tasks, but only print the commands '
                        'to run',
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--mode',
                        help='Running mode. You can choose "infer" if you '
                        'only want the inference results, or "eval" if you '
                        'already have the results and want to evaluate them, '
                        'or "viz" if you want to visualize the results.',
                        choices=['all', 'infer', 'eval', 'viz'],
                        default='all',
                        type=str)
    parser.add_argument('-r',
                        '--reuse',
                        nargs='?',
                        type=str,
                        const='latest',
                        help='Reuse previous outputs & results, and run any '
                        'missing jobs presented in the config. If its '
                        'argument is not specified, the latest results in '
                        'the work_dir will be reused. The argument should '
                        'also be a specific timestamp, e.g. 20230516_144254')
    parser.add_argument('-w',
                        '--work-dir',
                        help='Work path, all the outputs will be '
                        'saved in this path, including the slurm logs, '
                        'the evaluation results, the summary results, etc.'
                        'If not specified, the work_dir will be set to '
                        './outputs/default.',
                        default=None,
                        type=str)
    parser.add_argument('--config-dir',
                        default='configs',
                        help='Use the custom config directory instead of config/ to '
                        'search the configs for datasets, models and summarizers',
                        type=str)
    parser.add_argument('-l',
                        '--lark',
                        help='Report the running status to lark bot',
                        action='store_true',
                        default=False)
    parser.add_argument('--max-partition-size',
                        help='The maximum size of an infer task. Only '
                        'effective when "infer" is missing from the config.',
                        type=int,
                        default=40000),
    parser.add_argument('--gen-task-coef',
                        help='The dataset cost measurement coefficient for generation tasks, '
                        'Only effective when "infer" is missing from the config.',
                        type=int,
                        default=20)
    parser.add_argument('--max-num-workers',
                        help='Max number of workers to run in parallel. '
                        'Will be overrideen by the "max_num_workers" argument '
                        'in the config.',
                        type=int,
                        default=32)
    parser.add_argument('--max-workers-per-gpu',
                        help='Max task to run in parallel on one GPU. '
                        'It will only be used in the local runner.',
                        type=int,
                        default=1)
    parser.add_argument('--retry',
                        help='Number of retries if the job failed when using slurm or dlc. '
                        'Will be overrideen by the "retry" argument in the config.',
                        type=int,
                        default=2)
    parser.add_argument('--dump-eval-details',
                        help='Whether to dump the evaluation details, including the '
                        'correctness of each sample, bpb, etc.',
                        action='store_true',
                        )
    # set srun args
    slurm_parser = parser.add_argument_group('slurm_args')
    parse_slurm_args(slurm_parser)
    # set dlc args
    dlc_parser = parser.add_argument_group('dlc_args')
    parse_dlc_args(dlc_parser)
    # set hf args
    hf_parser = parser.add_argument_group('hf_args')
    parse_hf_args(hf_parser)
    # set custom dataset args
    custom_dataset_parser = parser.add_argument_group('custom_dataset_args')
    parse_custom_dataset_args(custom_dataset_parser)
    args = parser.parse_args()
    if args.slurm:
        assert args.partition is not None, (
            '--partition(-p) must be set if you want to use slurm')
    if args.dlc:
        assert os.path.exists(args.aliyun_cfg), (
            'When launching tasks using dlc, it needs to be configured '
            'in "~/.aliyun.cfg", or use "--aliyun-cfg $ALiYun-CFG_Path"'
            ' to specify a new path.')
    return args


def parse_slurm_args(slurm_parser):
    """These args are all for slurm launch."""
    slurm_parser.add_argument('-p',
                              '--partition',
                              help='Slurm partition name',
                              default=None,
                              type=str)
    slurm_parser.add_argument('-q',
                              '--quotatype',
                              help='Slurm quota type',
                              default=None,
                              type=str)
    slurm_parser.add_argument('--qos',
                              help='Slurm quality of service',
                              default=None,
                              type=str)


def parse_dlc_args(dlc_parser):
    """These args are all for dlc launch."""
    dlc_parser.add_argument('--aliyun-cfg',
                            help='The config path for aliyun config',
                            default='~/.aliyun.cfg',
                            type=str)


def parse_hf_args(hf_parser):
    """These args are all for the quick construction of HuggingFace models."""
    hf_parser.add_argument('--hf-path', type=str)
    hf_parser.add_argument('--peft-path', type=str)
    hf_parser.add_argument('--tokenizer-path', type=str)
    hf_parser.add_argument('--model-kwargs', nargs='+', action=DictAction, default={})
    hf_parser.add_argument('--tokenizer-kwargs', nargs='+', action=DictAction, default={})
    hf_parser.add_argument('--max-out-len', type=int)
    hf_parser.add_argument('--max-seq-len', type=int)
    hf_parser.add_argument('--no-batch-padding', action='store_true', default=False)
    hf_parser.add_argument('--batch-size', type=int)
    hf_parser.add_argument('--num-gpus', type=int)
    hf_parser.add_argument('--pad-token-id', type=int)


def parse_custom_dataset_args(custom_dataset_parser):
    """These args are all for the quick construction of custom datasets."""
    custom_dataset_parser.add_argument('--custom-dataset-path', type=str)
    custom_dataset_parser.add_argument('--custom-dataset-meta-path', type=str)
    custom_dataset_parser.add_argument('--custom-dataset-data-type',
                                       type=str,
                                       choices=['mcq', 'qa'])
    custom_dataset_parser.add_argument('--custom-dataset-infer-method',
                                       type=str,
                                       choices=['gen', 'ppl'])


def main():
    args = parse_args()
    if args.dry_run:
        args.debug = True
    # initialize logger
    logger = get_logger(log_level='DEBUG' if args.debug else 'INFO')

    cfg = get_config_from_arg(args)
    if args.work_dir is not None:
        cfg['work_dir'] = args.work_dir
    else:
        cfg.setdefault('work_dir', './outputs/default/')

    # cfg_time_str defaults to the current time
    cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    if args.reuse:
        if args.reuse == 'latest':
            if not os.path.exists(cfg.work_dir) or not os.listdir(cfg.work_dir):
                logger.warning('No previous results to reuse!')
            else:
                dirs = os.listdir(cfg.work_dir)
                dir_time_str = sorted(dirs)[-1]
        else:
            dir_time_str = args.reuse
        logger.info(f'Reusing experiements from {dir_time_str}')
    elif args.mode in ['eval', 'viz']:
        raise ValueError('You must specify -r or --reuse when running in eval '
                         'or viz mode!')

    # update "actual" work_dir
    cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
    os.makedirs(osp.join(cfg.work_dir, 'configs'), exist_ok=True)

    # dump config
    output_config_path = osp.join(cfg.work_dir, 'configs', f'{cfg_time_str}.py')
    cfg.dump(output_config_path)
    # Config is intentally reloaded here to avoid initialized
    # types cannot be serialized
    cfg = Config.fromfile(output_config_path, format_python_code=False)

    # report to lark bot if specify --lark
    if not args.lark:
        cfg['lark_bot_url'] = None
    elif cfg.get('lark_bot_url', None):
        content = f'{getpass.getuser()}\'s task has been launched!'
        LarkReporter(cfg['lark_bot_url']).post(content)

    if args.mode in ['all', 'infer']:
        # When user have specified --slurm or --dlc, or have not set
        # "infer" in config, we will provide a default configuration
        # for infer
        if (args.dlc or args.slurm) and cfg.get('infer', None):
            logger.warning('You have set "infer" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "infer" configuration will be overridden by '
                           'your runtime arguments.')
        # Check whether run multimodal evaluation
        if args.mm_eval:
            partitioner = MultimodalNaivePartitioner(
                osp.join(cfg['work_dir'], 'predictions/'))
            tasks = partitioner(cfg)
            exec_mm_infer_runner(tasks, args, cfg)
            return

        if args.dlc or args.slurm or cfg.get('infer', None) is None:
            fill_infer_cfg(cfg, args)

        if args.partition is not None:
            if RUNNERS.get(cfg.infer.runner.type) == SlurmRunner:
                cfg.infer.runner.partition = args.partition
                cfg.infer.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
        if args.debug:
            cfg.infer.runner.debug = True
        if args.lark:
            cfg.infer.runner.lark_bot_url = cfg['lark_bot_url']
        cfg.infer.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'predictions/')
        partitioner = PARTITIONERS.build(cfg.infer.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(cfg.infer.runner)
        # Add extra attack config if exists
        if hasattr(cfg, 'attack'):
            for task in tasks:
                cfg.attack.dataset = task.datasets[0][0].abbr
                task.attack = cfg.attack
        runner(tasks)

    # evaluate
    if args.mode in ['all', 'eval']:
        # When user have specified --slurm or --dlc, or have not set
        # "eval" in config, we will provide a default configuration
        # for eval
        if (args.dlc or args.slurm) and cfg.get('eval', None):
            logger.warning('You have set "eval" in the config, but '
                           'also specified --slurm or --dlc. '
                           'The "eval" configuration will be overridden by '
                           'your runtime arguments.')
        if args.dlc or args.slurm or cfg.get('eval', None) is None:
            fill_eval_cfg(cfg, args)
        if args.dump_eval_details:
            cfg.eval.runner.task.dump_details = True

        if args.partition is not None:
            if RUNNERS.get(cfg.eval.runner.type) == SlurmRunner:
                cfg.eval.runner.partition = args.partition
                cfg.eval.runner.quotatype = args.quotatype
            else:
                logger.warning('SlurmRunner is not used, so the partition '
                               'argument is ignored.')
        if args.debug:
            cfg.eval.runner.debug = True
        if args.lark:
            cfg.eval.runner.lark_bot_url = cfg['lark_bot_url']
        cfg.eval.partitioner['out_dir'] = osp.join(cfg['work_dir'], 'results/')
        partitioner = PARTITIONERS.build(cfg.eval.partitioner)
        tasks = partitioner(cfg)
        if args.dry_run:
            return
        runner = RUNNERS.build(cfg.eval.runner)

        # For meta-review-judge in subjective evaluation
        if isinstance(tasks, list) and len(tasks) != 0 and isinstance(tasks[0], list):
            for task_part in tasks:
                runner(task_part)
        else:
            runner(tasks)

    # visualize
    if args.mode in ['all', 'eval', 'viz']:
        summarizer_cfg = cfg.get('summarizer', {})
        if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
            summarizer_cfg['type'] = DefaultSummarizer
        summarizer_cfg['config'] = cfg
        summarizer = build_from_cfg(summarizer_cfg)
        summarizer.summarize(time_str=cfg_time_str)


if __name__ == '__main__':
    main()
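Two argument patterns in this entry point are easy to misread: --slurm and --dlc live in a mutually exclusive group, and -r/--reuse uses nargs='?' with const='latest' so that a bare -r means "reuse the most recent run". The sketch below reproduces only these two patterns against argparse itself; it is illustrative and not code from the commit:

import argparse

# Minimal reproduction of two argument patterns used in opencompass/cli/main.py.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('--slurm', action='store_true', default=False)
group.add_argument('--dlc', action='store_true', default=False)
parser.add_argument('-r', '--reuse', nargs='?', type=str, const='latest')

print(parser.parse_args(['--slurm', '-r']))           # reuse='latest' (const kicks in)
print(parser.parse_args(['-r', '20230516_144254']))   # reuse='20230516_144254'
# parser.parse_args(['--slurm', '--dlc'])             # would exit: options are mutually exclusive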
opencompass/datasets/NPHardEval/cmp_GCP_D.py  (+5 −1; view file @ b39f5015)

 import ast

-import networkx as nx
+try:
+    import networkx as nx
+except ImportError:
+    nx = None
 from datasets import Dataset

 from opencompass.openicl.icl_evaluator import BaseEvaluator
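The three NPHardEval modules (and apps.py/taco.py further down) switch hard imports to this optional-import pattern, so merely importing opencompass.datasets no longer fails when an extra dependency is missing. A self-contained sketch of the pattern, including the kind of runtime guard a caller would typically add; the helper name and error message are illustrative, not taken from the commit:

try:
    import networkx as nx  # optional dependency; only needed by graph-based evaluators
except ImportError:
    nx = None


def build_graph(edges):
    # Fail at use time with an actionable hint, instead of at import time.
    if nx is None:
        raise ImportError('networkx is required for this evaluator; '
                          'install it with `pip install networkx`.')
    graph = nx.Graph()
    graph.add_edges_from(edges)
    return graph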
opencompass/datasets/NPHardEval/cmp_TSP_D.py  (+5 −1; view file @ b39f5015)

 import ast
 import json

-import networkx as nx
+try:
+    import networkx as nx
+except ImportError:
+    nx = None
 import pandas as pd
 from datasets import Dataset
opencompass/datasets/NPHardEval/p_SPP.py  (+5 −1; view file @ b39f5015)

 import ast
 import json

-import networkx as nx
+try:
+    import networkx as nx
+except ImportError:
+    nx = None
 from datasets import Dataset

 from opencompass.openicl.icl_evaluator import BaseEvaluator
opencompass/datasets/__init__.py  (+2 −0; view file @ b39f5015)

@@ -3,6 +3,7 @@ from .afqmcd import *  # noqa: F401, F403
 from .agieval import *  # noqa: F401, F403
 from .anli import AnliDataset  # noqa: F401, F403
 from .anthropics_evals import *  # noqa: F401, F403
+from .apps import *  # noqa: F401, F403
 from .arc import *  # noqa: F401, F403
 from .ax import *  # noqa: F401, F403
 from .bbh import *  # noqa: F401, F403

@@ -94,6 +95,7 @@ from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
+from .taco import *  # noqa: F401, F403
 from .teval import *  # noqa: F401, F403
 from .TheoremQA import *  # noqa: F401, F403
 from .tnews import *  # noqa: F401, F403
opencompass/datasets/apps.py  (+25 −12; view file @ b39f5015)

@@ -19,13 +19,19 @@ from unittest.mock import mock_open, patch
 import numpy as np
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
-from pyext import RuntimeModule
+
+try:
+    from pyext import RuntimeModule
+except ImportError:
+    RuntimeModule = None

 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils.logging import get_logger

 from .base import BaseDataset

+logger = get_logger()

 TIMEOUT = 10

@@ -67,18 +73,20 @@ class APPSDataset(BaseDataset):
         new_dataset[split] = Dataset.from_dict(new_data)

         # num_repeats duplicate
-        train_repeated = []
+        # train_repeated = []
         test_repeated = []
-        for sample in new_dataset['train']:
-            train_repeated.extend([sample] * num_repeats)
+        # for sample in new_dataset['train']:
+        #     train_repeated.extend([sample] * num_repeats)
         for sample in new_dataset['test']:
             test_repeated.extend([sample] * num_repeats)
-        dataset_train_repeated = new_dataset['train'].from_list(train_repeated)
+        # dataset_train_repeated = new_dataset['train'].from_list(
+        #     train_repeated
+        # )
         dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
         return DatasetDict({
-            'train': dataset_train_repeated,
+            # 'train': dataset_train_repeated,
             'test': dataset_test_repeated
         })

@@ -121,18 +129,20 @@ class APPS_miniDataset(BaseDataset):
         new_dataset[split] = Dataset.from_dict(new_data)

         # num_repeats duplicate
-        train_repeated = []
+        # train_repeated = []
         test_repeated = []
-        for sample in new_dataset['train']:
-            train_repeated.extend([sample] * num_repeats)
+        # for sample in new_dataset['train']:
+        #     train_repeated.extend([sample] * num_repeats)
         for sample in new_dataset['test']:
             test_repeated.extend([sample] * num_repeats)
-        dataset_train_repeated = new_dataset['train'].from_list(train_repeated)
+        # dataset_train_repeated = new_dataset['train'].from_list(
+        #     train_repeated
+        # )
        dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
         return DatasetDict({
-            'train': dataset_train_repeated,
+            # 'train': dataset_train_repeated,
             'test': dataset_test_repeated
         })

@@ -308,7 +318,10 @@ def timeout_handler(signum, frame):
     raise TimeoutException


-signal.signal(signal.SIGALRM, timeout_handler)
+try:
+    signal.signal(signal.SIGALRM, timeout_handler)
+except AttributeError:
+    logger.warning('signal.SIGALRM is not available on this platform')
 timeout = 4  # seconds
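signal.SIGALRM only exists on POSIX systems, so registering the alarm handler is now wrapped in try/except AttributeError rather than crashing at import time on platforms such as Windows. A self-contained sketch of the same guard; the HAS_ALARM flag and the fallback message are illustrative additions, not part of the commit:

import signal


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException


# SIGALRM is POSIX-only; without it we simply skip the alarm-based timeout.
try:
    signal.signal(signal.SIGALRM, timeout_handler)
    HAS_ALARM = True  # hypothetical flag for callers that want to check
except AttributeError:
    HAS_ALARM = False
    print('signal.SIGALRM is not available on this platform; '
          'per-sample timeouts are disabled.')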
opencompass/datasets/custom.py  (+10 −0; view file @ b39f5015)

@@ -210,6 +210,8 @@ def make_mcq_gen_config(meta):
         input_columns=meta['input_columns'],
         output_column=meta['output_column'],
     )
+    if 'test_range' in meta:
+        reader_cfg['test_range'] = meta['test_range']

     infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,

@@ -255,6 +257,8 @@ def make_circular_mcq_gen_config(meta):
         input_columns=meta['input_columns'],
         output_column=meta['output_column'],
     )
+    if 'test_range' in meta:
+        reader_cfg['test_range'] = meta['test_range']

     infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,

@@ -304,6 +308,8 @@ def make_qa_gen_config(meta):
         input_columns=meta['input_columns'],
         output_column=meta['output_column'],
     )
+    if 'test_range' in meta:
+        reader_cfg['test_range'] = meta['test_range']

     infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,

@@ -353,6 +359,8 @@ def make_mcq_ppl_config(meta):
         input_columns=meta['input_columns'],
         output_column=meta['output_column'],
     )
+    if 'test_range' in meta:
+        reader_cfg['test_range'] = meta['test_range']

     infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,

@@ -399,6 +407,8 @@ def make_circular_mcq_ppl_config(meta):
         input_columns=meta['input_columns'],
         output_column=meta['output_column'],
     )
+    if 'test_range' in meta:
+        reader_cfg['test_range'] = meta['test_range']

     infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,
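With this change every custom-dataset config factory forwards an optional test_range field from the dataset meta into reader_cfg. Presumably test_range takes the slice-style string used elsewhere in OpenCompass reader configs; a hypothetical meta dict would then look like the following (all keys and values here are illustrative placeholders, not from the commit):

# Hypothetical custom-dataset meta: the new 'test_range' key limits evaluation
# to a slice of the test split (the string form is an assumption based on how
# reader_cfg slices are commonly written in OpenCompass configs).
meta = {
    'abbr': 'my_custom_mcq',
    'path': 'data/my_custom_mcq.jsonl',
    'input_columns': ['question', 'A', 'B', 'C', 'D'],
    'output_column': 'answer',
    'test_range': '[0:100]',   # only score the first 100 test samples
}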
opencompass/datasets/humanevalx.py  (+4 −1; view file @ b39f5015)

@@ -168,9 +168,12 @@ def _clean_up_code(text: str, language_type: str, reference) -> str:
     """Cleans up the generated code."""
     try:
         # for chatGLM related text
-        text = eval(text)
+        eval_text = eval(text)
     except Exception:
         pass
+    else:
+        if isinstance(eval_text, str):
+            text = eval_text
     # extract code from code block
     text = text.lstrip('\n')
     if '```' in text:
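The point of this change is that eval(text) can succeed but return a non-string object, which would break the string operations that follow; the result is now only adopted when it is still a str. A small standalone illustration of the behaviour difference; the wrapper function and sample inputs are made up for the example:

def clean(text: str) -> str:
    # Mirror of the new guard: only keep eval()'s result if it is a string.
    try:
        eval_text = eval(text)
    except Exception:
        pass
    else:
        if isinstance(eval_text, str):
            text = eval_text
    return text.lstrip('\n')


print(clean("'\\nprint(1)'"))   # quoted string literal -> unwrapped to "print(1)"
print(clean("[1, 2, 3]"))       # evaluates to a list -> original text is kept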
opencompass/datasets/lawbench/evaluation_functions/ljp_article.py  (+1 −1; view file @ b39f5015)

 import re
-import cn2an

 """
 task: law article prediction

@@ -15,6 +14,7 @@ def compute_ljp_article(data_dict):
     A reference contains a list of articles of the Criminal Law of the People's Republic of China.
     We compute the F1-score between the prediction and the reference.
     """
+    import cn2an
     score_list, abstentions = [], 0
opencompass/datasets/lawbench/evaluation_functions/ljp_imprison.py  (+3 −1; view file @ b39f5015)

 import math
-import cn2an
 import re

 #法律判决预测-刑期预测
 def compute_ljp_imprison(data_dict):
+    import cn2an
     score_list, abstentions = [], 0
     for example in data_dict:
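Both lawbench changes move the cn2an import from module level into the function that actually needs it, so importing the evaluation package no longer requires cn2an to be installed. A variation on the same idea, with a clearer error when the dependency really is missing; this wrapper is a sketch and is not part of the commit:

def compute_ljp_imprison(data_dict):
    # Variation on the commit's lazy import (not in the commit itself):
    # import cn2an only when the scorer runs, and fail with an actionable hint.
    try:
        import cn2an  # noqa: F401  (used by the real scoring logic)
    except ImportError as exc:
        raise ImportError('cn2an is required for lawbench imprisonment scoring; '
                          'install it with `pip install cn2an`.') from exc
    score_list, abstentions = [], 0
    # ... original scoring logic continues here ...
    return score_list, abstentions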
opencompass/datasets/math.py  (+19 −6; view file @ b39f5015)

@@ -85,6 +85,7 @@ def normalize_final_answer(final_answer: str) -> str:
     # Extract answer that is in LaTeX math, is bold,
     # is surrounded by a box, etc.
+    final_answer = re.sub(r'(\\text\{)\((.*?)\)(\})', '\\2', final_answer)
     final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
     final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
     final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)

@@ -178,10 +179,7 @@ class MATHEvaluator(BaseEvaluator):
     def score(self, predictions, references):
         if len(predictions) != len(references):
-            return {'error': 'predictions and references have different '
-                    'length'}
+            return {'error': 'preds and refrs have different length'}
         correct = 0
         count = 0
         details = []

@@ -457,8 +455,23 @@
             ss2 = strip_string_func(str2)
             if verbose:
                 print(ss1, ss2)
-            return ss1 == ss2
+            if ss1 == ss2:
+                return True
+            ss1 = normalize_final_answer(ss1)
+            ss2 = normalize_final_answer(ss2)
+            if ss1 == ss2:
+                return True
         except Exception:
             pass
+        try:
+            ss1 = normalize_final_answer(str1)
+            ss2 = normalize_final_answer(str2)
+            if ss1 == ss2:
+                return True
+        except Exception:
+            pass
         return str1 == str2
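The equivalence check now tries progressively weaker comparisons instead of returning the result of the first one: stripped strings, then normalized stripped strings, then normalization applied to the raw inputs, and finally raw string equality. A compact sketch of that fallback chain; strip_string and normalize_final_answer below are trivial stand-ins for the real LaTeX-aware helpers in math.py, and the sample calls are illustrative:

def is_equiv_sketch(str1: str, str2: str) -> bool:
    # Stand-ins for the real helpers; the real ones do LaTeX-aware cleanup.
    strip_string = lambda s: s.strip()
    normalize_final_answer = lambda s: s.replace(' ', '').lower()

    try:
        ss1, ss2 = strip_string(str1), strip_string(str2)
        if ss1 == ss2:
            return True
        ss1, ss2 = normalize_final_answer(ss1), normalize_final_answer(ss2)
        if ss1 == ss2:
            return True
    except Exception:
        pass
    try:
        if normalize_final_answer(str1) == normalize_final_answer(str2):
            return True
    except Exception:
        pass
    return str1 == str2


print(is_equiv_sketch(' 1/2 ', '1/2'))   # True via stripping
print(is_equiv_sketch('A B', 'ab'))      # True via the stand-in normalizer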
opencompass/datasets/mathbench.py  (+1 −1; view file @ b39f5015)

@@ -57,7 +57,7 @@ class MathBenchDataset(BaseDataset):
         """
         data = []
         filename = osp.join(path, f'{name}.jsonl')
-        with open(filename, 'r') as infile:
+        with open(filename, 'r', encoding='utf-8') as infile:
             for id, line in enumerate(infile):
                 entry = json.loads(line)
                 if 'cloze' in name:
opencompass/datasets/mbpp.py  (+1 −0; view file @ b39f5015)

@@ -244,6 +244,7 @@ class MBPPEvaluator(BaseEvaluator):
         if not isinstance(preds, list):
             preds = [preds]
         for pred in preds:
+            pred = self._process_answer(pred)
             mbpp_preds.append({'task_id': refer, 'solution': pred})
         with tempfile.TemporaryDirectory() as tmp_dir:
             out_dir = osp.join(tmp_dir, 'mbpp_eval.jsonl')
opencompass/datasets/taco.py  (+20 −9; view file @ b39f5015)

@@ -18,14 +18,20 @@ from io import StringIO
 from unittest.mock import mock_open, patch

 import numpy as np
-from datasets import Dataset, DatasetDict, load_dataset
-from pyext import RuntimeModule
+from datasets import Dataset, DatasetDict, load_from_disk
+
+try:
+    from pyext import RuntimeModule
+except ImportError:
+    RuntimeModule = None

 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils.logging import get_logger

 from .base import BaseDataset

+logger = get_logger()

 TIMEOUT = 10

@@ -34,7 +40,7 @@ class TACODataset(BaseDataset):
     @staticmethod
     def load(path: str, num_repeats: int = 1):
-        dataset = load_dataset(path)
+        dataset = load_from_disk(path)
         new_dataset = DatasetDict()
         # add new column "starter" in the prompt
         for split in dataset.keys():

@@ -69,18 +75,20 @@ class TACODataset(BaseDataset):
         new_dataset[split] = Dataset.from_dict(new_data)

         # num_repeats duplicate
-        train_repeated = []
+        # train_repeated = []
         test_repeated = []
-        for sample in new_dataset['train']:
-            train_repeated.extend([sample] * num_repeats)
+        # for sample in new_dataset['train']:
+        #     train_repeated.extend([sample] * num_repeats)
         for sample in new_dataset['test']:
             test_repeated.extend([sample] * num_repeats)
-        dataset_train_repeated = new_dataset['train'].from_list(train_repeated)
+        # dataset_train_repeated = new_dataset['train'].from_list(
+        #     train_repeated
+        # )
         dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
         return DatasetDict({
-            'train': dataset_train_repeated,
+            # 'train': dataset_train_repeated,
             'test': dataset_test_repeated
         })

@@ -256,7 +264,10 @@ def timeout_handler(signum, frame):
     raise TimeoutException


-signal.signal(signal.SIGALRM, timeout_handler)
+try:
+    signal.signal(signal.SIGALRM, timeout_handler)
+except AttributeError:
+    logger.warning('signal.SIGALRM is not available on this platform')
 timeout = 4  # seconds
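TACODataset.load now reads a dataset that has been materialised locally with datasets.load_from_disk instead of fetching it with load_dataset, so the path argument must point to a directory previously produced by save_to_disk. A hypothetical one-off preparation step could look like this; the Hugging Face dataset id and the local path are placeholders/assumptions, not part of the commit:

from datasets import load_dataset

# Illustrative preparation: download TACO once, then snapshot it to disk so
# that TACODataset.load(path) can read it back with load_from_disk(path).
taco = load_dataset('BAAI/TACO')          # assumed dataset id; adjust as needed
taco.save_to_disk('./data/BAAI-TACO')     # placeholder local path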