OpenDAS / opencompass · Commit 2b3d4150
"vscode:/vscode.git/clone" did not exist on "e4559f48c188486a1b6371ccdcc387fd47a04369"
Unverified commit 2b3d4150, authored May 21, 2024 by Fengzhe Zhou; committed by GitHub on May 21, 2024.
[Sync] update evaluator (#1175)
Parent: 296ea599
Changes: 24 · Showing 20 changed files with 124 additions and 60 deletions on this page.
configs/datasets/cmmlu/cmmlu_gen_c13365.py  +2 -2
configs/datasets/cmmlu/cmmlu_ppl_041cbf.py  +2 -2
configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py  +2 -2
configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py  +2 -2
configs/datasets/mmlu/mmlu_gen_4d595a.py  +2 -2
configs/datasets/mmlu/mmlu_ppl_ac766d.py  +2 -2
configs/datasets/race/race_gen_69ee4f.py  +2 -2
configs/datasets/race/race_ppl_abed12.py  +2 -2
configs/datasets/winogrande/winogrande_5shot_gen_b36770.py  +2 -2
configs/datasets/winogrande/winogrande_5shot_ll_252f01.py  +2 -2
configs/models/qwen/hf_qwen1_5_110b.py  +1 -1
configs/models/qwen/hf_qwen1_5_110b_chat.py  +1 -1
configs/models/qwen/hf_qwen1_5_14b.py  +1 -1
configs/models/qwen/hf_qwen1_5_14b_chat.py  +1 -1
configs/models/qwen/hf_qwen1_5_72b.py  +1 -1
configs/models/qwen/hf_qwen1_5_72b_chat.py  +1 -1
configs/models/qwen/lmdeploy_qwen1_5_series.py  +1 -0
configs/summarizers/chat_OC15_multi_faceted.py  +19 -0
opencompass/datasets/GaokaoBench.py  +23 -6
opencompass/datasets/IFEval/ifeval.py  +55 -28
configs/datasets/cmmlu/cmmlu_gen_c13365.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import CMMLUDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 ...
@@ -101,7 +101,7 @@ for _name in cmmlu_all_sets:
     )
     cmmlu_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=AccwithDetailsEvaluator),
         pred_postprocessor=dict(type=first_capital_postprocess))
     cmmlu_datasets.append(
 ...
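The same two-line change is repeated in each dataset config in this commit: the evaluator import and the evaluator= entry switch from AccEvaluator to AccwithDetailsEvaluator. A minimal sketch of the resulting pattern, assuming the config style shown in this hunk (simplified, not copied verbatim from the repository):

# Sketch of the evaluator swap above; field values mirror this hunk.
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess

cmmlu_eval_cfg = dict(
    # AccwithDetailsEvaluator keeps the accuracy score and, as its name and
    # the rest of this commit suggest, also records per-sample details.
    evaluator=dict(type=AccwithDetailsEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess),
)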
configs/datasets/cmmlu/cmmlu_ppl_041cbf.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import CMMLUDataset
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 ...
@@ -97,7 +97,7 @@ for _name in cmmlu_all_sets:
         inferencer=dict(type=PPLInferencer),
     )
-    cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+    cmmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
     cmmlu_datasets.append(
         dict(
 ...
configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import hellaswagDatasetwithICE
 from opencompass.utils.text_postprocessors import first_option_postprocess
 ...
@@ -41,7 +41,7 @@ hellaswag_infer_cfg = dict(
 )
 hellaswag_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=AccwithDetailsEvaluator),
     pred_role='BOT',
     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
 )
 ...
configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import hellaswagDatasetwithICE
 from opencompass.utils.text_postprocessors import first_capital_postprocess
 ...
@@ -29,7 +29,7 @@ hellaswag_infer_cfg = dict(
 )
 hellaswag_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=AccwithDetailsEvaluator),
     pred_postprocessor=dict(type=first_capital_postprocess),
 )
 ...
configs/datasets/mmlu/mmlu_gen_4d595a.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import MMLUDataset
 from opencompass.utils.text_postprocessors import first_option_postprocess
 ...
@@ -106,7 +106,7 @@ for _name in mmlu_all_sets:
     )
     mmlu_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
+        evaluator=dict(type=AccwithDetailsEvaluator),
         pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
     mmlu_datasets.append(
 ...
configs/datasets/mmlu/mmlu_ppl_ac766d.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import MMLUDataset

 # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
 ...
@@ -90,7 +90,7 @@ for _name in mmlu_all_sets:
         inferencer=dict(type=PPLInferencer),
     )
-    mmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+    mmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator), )
     mmlu_datasets.append(
         dict(
 ...
configs/datasets/race/race_gen_69ee4f.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import RaceDataset
 from opencompass.utils.text_postprocessors import first_option_postprocess
 ...
@@ -26,7 +26,7 @@ race_infer_cfg = dict(
     inferencer=dict(type=GenInferencer))
 race_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=AccwithDetailsEvaluator),
     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
     pred_role='BOT')
 ...
configs/datasets/race/race_ppl_abed12.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import PPLInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import RaceDataset

 race_reader_cfg = dict(
 ...
@@ -20,7 +20,7 @@ race_infer_cfg = dict(
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=PPLInferencer))
-race_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+race_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
 race_datasets = [
     dict(
 ...
configs/datasets/winogrande/winogrande_5shot_gen_b36770.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import winograndeDataset_V3
 from opencompass.utils.text_postprocessors import first_option_postprocess
 ...
@@ -29,7 +29,7 @@ winogrande_infer_cfg = dict(
 )
 winogrande_eval_cfg = dict(
-    evaluator=dict(type=AccEvaluator),
+    evaluator=dict(type=AccwithDetailsEvaluator),
     pred_role='BOT',
     pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
 )
 ...
configs/datasets/winogrande/winogrande_5shot_ll_252f01.py

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import FixKRetriever
 from opencompass.openicl.icl_inferencer import LLInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
 from opencompass.datasets import winograndeDataset_V3

 winogrande_reader_cfg = dict(
 ...
@@ -25,7 +25,7 @@ winogrande_infer_cfg = dict(
     retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
     inferencer=dict(type=LLInferencer),
 )
-winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+winogrande_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
 winogrande_datasets = [
     dict(
 ...
configs/models/qwen/hf_qwen1_5_110b.py

 ...
@@ -7,6 +7,6 @@ models = [
         path='Qwen/Qwen1.5-110B',
         max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=4),
+        run_cfg=dict(num_gpus=8),
     )
 ]
configs/models/qwen/hf_qwen1_5_110b_chat.py

 ...
@@ -7,6 +7,6 @@ models = [
         path='Qwen/Qwen1.5-110B-Chat',
         max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=4),
+        run_cfg=dict(num_gpus=8),
     )
 ]
configs/models/qwen/hf_qwen1_5_14b.py

 ...
@@ -7,6 +7,6 @@ models = [
         path='Qwen/Qwen1.5-14B',
         max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1),
+        run_cfg=dict(num_gpus=2),
     )
 ]
configs/models/qwen/hf_qwen1_5_14b_chat.py

 ...
@@ -7,6 +7,6 @@ models = [
         path='Qwen/Qwen1.5-14B-Chat',
         max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=1),
+        run_cfg=dict(num_gpus=2),
     )
 ]
configs/models/qwen/hf_qwen1_5_72b.py

 ...
@@ -7,6 +7,6 @@ models = [
         path='Qwen/Qwen1.5-72B',
         max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=4),
+        run_cfg=dict(num_gpus=8),
     )
 ]
configs/models/qwen/hf_qwen1_5_72b_chat.py

 ...
@@ -7,6 +7,6 @@ models = [
         path='Qwen/Qwen1.5-72B-Chat',
         max_out_len=1024,
         batch_size=8,
-        run_cfg=dict(num_gpus=4),
+        run_cfg=dict(num_gpus=8),
     )
 ]
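The six HF Qwen1.5 configs above share one structure; a condensed sketch of a single entry, assuming the fields visible in these hunks (other fields of the real configs are omitted):

# Condensed model entry; only the fields shown in the hunks above appear here.
models = [
    dict(
        path='Qwen/Qwen1.5-72B',
        max_out_len=1024,
        batch_size=8,
        # Raised in this commit (4 -> 8 for the 72B/110B variants, 1 -> 2 for
        # 14B), presumably to give the checkpoints more GPU memory headroom.
        run_cfg=dict(num_gpus=8),
    ),
]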
configs/models/qwen/lmdeploy_qwen1_5_series.py

 ...
@@ -8,6 +8,7 @@ settings = [
     ('qwen1.5-14b-pytorch', 'Qwen/Qwen1.5-14B', 1),
     ('qwen1.5-32b-pytorch', 'Qwen/Qwen1.5-32B', 2),
     ('qwen1.5-72b-pytorch', 'Qwen/Qwen1.5-72B', 4),
+    ('qwen1.5-110b-pytorch', 'Qwen/Qwen1.5-110B', 4),
     ('qwen1.5-moe-a2.7b-pytorch', 'Qwen/Qwen1.5-MoE-A2.7B', 1),
 ]
 ...
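Each settings tuple pairs a model abbreviation with its HuggingFace path and a GPU count; the added line registers Qwen1.5-110B for the PyTorch backend on 4 GPUs. A hypothetical loop consuming such tuples (illustrative only; the rest of this config file is not shown in the hunk):

# Hypothetical consumer of the (abbr, path, num_gpus) tuples; not taken
# from lmdeploy_qwen1_5_series.py itself.
settings = [
    ('qwen1.5-110b-pytorch', 'Qwen/Qwen1.5-110B', 4),
]

models = []
for abbr, path, num_gpus in settings:
    models.append(
        dict(
            abbr=abbr,
            path=path,
            run_cfg=dict(num_gpus=num_gpus),
        ))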
configs/summarizers/chat_OC15_multi_faceted.py

 ...
@@ -115,6 +115,14 @@ sanitized_mbpp_dataset_abbrs = [
     ['sanitized_mbpp', 'timeout'],
 ]
+
+IFEval_dataset_abbrs = [
+    ['IFEval', 'Prompt-level-strict-accuracy'],
+    ['IFEval', 'Inst-level-strict-accuracy'],
+    ['IFEval', 'Prompt-level-loose-accuracy'],
+    ['IFEval', 'Inst-level-loose-accuracy'],
+]
+
 summarizer = dict(
     type=MultiFacetedSummarizer,
     dataset_abbrs_list=[
 ...
@@ -124,6 +132,17 @@ summarizer = dict(
         {'name': 'bbh', 'dataset_abbrs': bbh_dataset_abbrs},
         {'name': 'GaokaoBench', 'dataset_abbrs': GaokaoBench_dataset_abbrs},
         {'name': 'sanitized_mbpp', 'dataset_abbrs': sanitized_mbpp_dataset_abbrs},
+        {'name': 'triviaqa', 'dataset_abbrs': [['triviaqa_wiki_1shot', 'score']]},
+        {'name': 'nq', 'dataset_abbrs': [['nq_open_1shot', 'score']]},
+        {'name': 'race', 'dataset_abbrs': [['race-high', 'accuracy']]},
+        {'name': 'winogrande', 'dataset_abbrs': [['winogrande', 'accuracy']]},
+        {'name': 'hellaswag', 'dataset_abbrs': [['hellaswag', 'accuracy']]},
+        {'name': 'gsm8k', 'dataset_abbrs': [['gsm8k', 'accuracy']]},
+        {'name': 'math', 'dataset_abbrs': [['math', 'accuracy']]},
+        {'name': 'TheoremQA', 'dataset_abbrs': [['TheoremQA', 'score']]},
+        {'name': 'humaneval', 'dataset_abbrs': [['openai_humaneval', 'humaneval_pass@1']]},
+        {'name': 'GPQA', 'dataset_abbrs': [['GPQA_diamond', 'accuracy']]},
+        {'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs},
         {'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
     ],
     summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
 ...
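Every dataset_abbrs entry is a [dataset_abbr, metric_name] pair, so the new IFEval facet surfaces the four metrics that the rewritten IFEval evaluator (see opencompass/datasets/IFEval/ifeval.py below) reports. A condensed sketch of just that facet, assuming the structure shown in these hunks:

# Condensed from the hunks above; the other facets and the
# MultiFacetedSummarizer import (defined elsewhere in the config) are omitted.
IFEval_dataset_abbrs = [
    ['IFEval', 'Prompt-level-strict-accuracy'],
    ['IFEval', 'Inst-level-strict-accuracy'],
    ['IFEval', 'Prompt-level-loose-accuracy'],
    ['IFEval', 'Inst-level-loose-accuracy'],
]

summarizer = dict(
    type=MultiFacetedSummarizer,
    dataset_abbrs_list=[
        {'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs},
    ],
)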
opencompass/datasets/GaokaoBench.py

 ...
@@ -91,34 +91,51 @@ class GaokaoBenchEvaluator(BaseEvaluator):
         ]:
             return {'score': 0}
         elif self.question_type == 'multi_choice':
+            details = {}
             correct_score, total_score = 0, 0
-            for pred, refr in zip(predictions, references):
+            for index, (pred, refr) in enumerate(zip(predictions, references)):
                 pred = self.do_predictions_postprocess(pred)
                 pred = self.ensure_same_length(pred, refr)
+                is_corrects = []
                 for p, r in zip(pred, refr):
                     if p == r:
                         correct_score += 2
+                        is_corrects.append(True)
                     else:
                         for i in p:
                             if i not in r:
                                 break
                         else:
                             correct_score += 1
+                        is_corrects.append(False)
                     total_score += 2
-            return {'score': correct_score / total_score * 100}
+                details[str(index)] = {
+                    'pred': pred,
+                    'refr': refr,
+                    'is_correct': all(is_corrects),
+                }
         else:
+            details = {}
             correct_score, total_score = 0, 0
-            for pred, refr in zip(predictions, references):
+            for index, (pred, refr) in enumerate(zip(predictions, references)):
                 if self.question_type == 'multi_question_choice':
                     pred = self.do_predictions_postprocess(pred, len(refr))
                 else:
                     pred = self.do_predictions_postprocess(pred)
                 pred = self.ensure_same_length(pred, refr)
+                is_corrects = []
                 for p, r in zip(pred, refr):
-                    if p == r:
-                        correct_score += 1
+                    is_correct = p == r
+                    correct_score += is_correct
                     total_score += 1
-            return {'score': correct_score / total_score * 100}
+                    is_corrects.append(is_correct)
+                details[str(index)] = {
+                    'pred': pred,
+                    'refr': refr,
+                    'is_correct': all(is_corrects),
+                }
+        return {
+            'score': correct_score / total_score * 100,
+            'details': details
+        }

 for question_type in valid_gaokao_bench_question_types:
 ...
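To make the multi_choice scoring rule concrete: each answer group is worth 2 points, an exact match earns both, and a prediction whose options are all contained in the reference earns 1; the new details dict records one entry per example. A standalone sketch of that rule with hypothetical inputs (the real evaluator also post-processes predictions first):

# Standalone illustration of the multi_choice scoring above; inputs are
# hypothetical and prediction post-processing is skipped.
def multi_choice_score(pred_groups, refr_groups):
    correct_score, total_score, details = 0, 0, {}
    for index, (pred, refr) in enumerate(zip(pred_groups, refr_groups)):
        is_corrects = []
        for p, r in zip(pred, refr):
            if p == r:
                correct_score += 2              # exact match: full marks
                is_corrects.append(True)
            else:
                if all(option in r for option in p):
                    correct_score += 1          # subset of the answer: partial credit
                is_corrects.append(False)
            total_score += 2
        details[str(index)] = {'pred': pred, 'refr': refr, 'is_correct': all(is_corrects)}
    return {'score': correct_score / total_score * 100, 'details': details}

# 'AB' against reference 'ABD' earns 1 of 2 points; 'C' against 'C' earns 2 of 2.
print(multi_choice_score([['AB', 'C']], [['ABD', 'C']]))  # score: 75.0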
opencompass/datasets/IFEval/ifeval.py

 ...
@@ -26,11 +26,13 @@ class IFEvalDataset(BaseDataset):
 class IFEvaluator(BaseEvaluator):

-    def score(self, predictions, references):
-        results = dict()
-        for metric in ('strict', 'loose'):
-            results[metric] = []
-        for pred, refer in zip(predictions, references):
+    def score(self, predictions, references, origin_prompt):
+        prompt_strict_correct, prompt_strict_total = 0, 0
+        inst_strict_correct, inst_strict_total = 0, 0
+        prompt_loose_correct, prompt_loose_total = 0, 0
+        inst_loose_correct, inst_loose_total = 0, 0
+        details = {}
+        for index, (pred, refer) in enumerate(zip(predictions, references)):
             input = InputExample(
                 key=refer['key'],
                 instruction_id_list=refer['instruction_id_list'],
 ...
@@ -40,29 +42,54 @@ class IFEvaluator(BaseEvaluator):
             for k in list(kwarg.keys()):
                 if kwarg[k] is None:
                     kwarg.pop(k, None)
-            results['strict'].append(
-                test_instruction_following_strict(input, pred))
-            results['loose'].append(
-                test_instruction_following_loose(input, pred))
-        final_scores = dict()
-        for metric in ('strict', 'loose'):
-            prompt_total = 0
-            prompt_correct = 0
-            inst_total = 0
-            inst_correct = 0
-            for example in results[metric]:
-                follow_instruction_list = example.follow_instruction_list
-                instruction_id_list = example.instruction_id_list
+
+            # strict
+            example = test_instruction_following_strict(input, pred)
+            follow_instruction_list = example.follow_instruction_list
+            instruction_id_list = example.instruction_id_list
+            prompt_strict_total += 1
+            is_strict_correct = all(follow_instruction_list)
+            prompt_strict_correct += is_strict_correct
+            inst_strict_total += len(instruction_id_list)
+            inst_strict_correct += sum(follow_instruction_list)
-                prompt_total += 1
-                if all(follow_instruction_list):
-                    prompt_correct += 1
+
+            # loose
+            example = test_instruction_following_loose(input, pred)
+            follow_instruction_list = example.follow_instruction_list
+            instruction_id_list = example.instruction_id_list
+            prompt_loose_total += 1
+            is_loose_correct = all(follow_instruction_list)
+            prompt_loose_correct += is_loose_correct
+            inst_loose_total += len(instruction_id_list)
+            inst_loose_correct += sum(follow_instruction_list)
-                inst_total += len(instruction_id_list)
-                inst_correct += sum(follow_instruction_list)
-            prompt_score = f'Prompt-level-{metric}-accuracy'
-            inst_score = f'Inst-level-{metric}-accuracy'
-            final_scores[prompt_score] = prompt_correct / prompt_total * 100
-            final_scores[inst_score] = inst_correct / inst_total * 100
-        return final_scores
+
+            if is_strict_correct:
+                grade = 'strict'
+            elif is_loose_correct:
+                grade = 'loose'
+            else:
+                grade = 'none'
+            details[str(index)] = {
+                'prompt': origin_prompt[index],
+                'pred': pred,
+                'refer': refer,
+                'is_strict_correct': is_strict_correct,
+                'is_loose_correct': is_loose_correct,
+                'is_correct': is_strict_correct,
+                'grade': grade
+            }
+
+        results = {
+            'Prompt-level-strict-accuracy':
+            prompt_strict_correct / prompt_strict_total * 100,
+            'Inst-level-strict-accuracy':
+            inst_strict_correct / inst_strict_total * 100,
+            'Prompt-level-loose-accuracy':
+            prompt_loose_correct / prompt_loose_total * 100,
+            'Inst-level-loose-accuracy':
+            inst_loose_correct / inst_loose_total * 100,
+            'details': details
+        }
+        return results
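The rewritten score method now computes all four IFEval accuracies in a single pass and grades every example as 'strict', 'loose', or 'none'. A toy illustration of the prompt-level vs. instruction-level distinction, using hypothetical follow/instruction flags instead of the real test_instruction_following_strict/loose calls:

# Toy aggregation matching the counters above; the flag lists are hypothetical.
examples = [
    [True, True],    # all instructions followed -> prompt-level correct
    [True, False],   # one of two followed -> prompt-level incorrect
]

prompt_correct = sum(all(flags) for flags in examples)
prompt_total = len(examples)
inst_correct = sum(sum(flags) for flags in examples)
inst_total = sum(len(flags) for flags in examples)

print(prompt_correct / prompt_total * 100)  # 50.0 -> 'Prompt-level-...-accuracy'
print(inst_correct / inst_total * 100)      # 75.0 -> 'Inst-level-...-accuracy'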