OpenDAS / opencompass · Commits · 5c6dc908

fix compass arena (#854)

Commit 5c6dc908 (unverified), authored Jan 30, 2024 by bittersweet1999; committed by GitHub on Jan 30, 2024.
Parent: 4f78388c

Showing 4 changed files with 81 additions and 64 deletions (whitespace-only changes hidden):

configs/datasets/subjective/compassarena/compassarena_compare.py   +2   -17
configs/eval_subjective_compassarena.py                            +7   -6
opencompass/summarizers/subjective/compass_arena.py                +68  -38
opencompass/summarizers/subjective/utils.py                        +4   -3
configs/datasets/subjective/compassarena/compassarena_compare.py (view file @ 5c6dc908)
@@ -88,19 +88,6 @@ math_prompt = """
 reason_prompt = math_prompt
 
-qa_prompt = """
-请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
-评分要求(重要性依次递减):
-1. 好的回答必须首先具有事实正确性,即除了想象的内容外,所引用或阐述的各种信息都是真实正确的
-2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答,且前后连贯,逻辑没有问题
-3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误
-
-[用户问题]
-{question}
-""" + base_prompt
-
 creation_prompt = """
 请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
 评分要求(重要性依次递减):

@@ -112,11 +99,9 @@ creation_prompt = """
 {question}
 """ + base_prompt
 
-subjective_all_sets = ["knowledge", "language", "math", "reason", "qa", "creationv2_zh"]
-prompt_all_sets = [knowledge_prompt, language_prompt, math_prompt, reason_prompt, qa_prompt, creation_prompt]
-for _name, _prompt in zip(subjective_all_sets, prompt_all_sets):
+sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt}
+for _name, _prompt in sub_map.items():
     subjective_infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,
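The first hunk drops qa_prompt, a Chinese judging prompt that asks the judge to pick the better of two answers, weighting factual correctness first, then logical coherence, then accurate extra detail. The second hunk replaces two position-matched lists walked with zip() by a single sub_map dict keyed on dataset name. Below is a minimal sketch (not OpenCompass code, with shortened stand-in prompts) of why the dict is the safer shape:

# Stand-in prompts; the real ones are the long Chinese judge prompts above.
knowledge_prompt = 'judge prompt for knowledge questions...'
math_prompt = 'judge prompt for math questions...'

# Old shape: the name->prompt pairing is purely positional, so removing
# one entry from either list (as this commit removes qa) silently
# re-pairs every later name with the wrong prompt.
names = ['knowledge', 'math']
prompts = [knowledge_prompt, math_prompt]
for name, prompt in zip(names, prompts):
    print(name, '->', prompt)

# New shape: each dataset name explicitly carries its own prompt.
sub_map = {'knowledge': knowledge_prompt, 'math_v2': math_prompt}
for name, prompt in sub_map.items():
    print(name, '->', prompt)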
configs/eval_subjective_compassarena.py (view file @ 5c6dc908)
@@ -3,7 +3,6 @@ from opencompass.models import HuggingFaceCausalLM
 from mmengine.config import read_base
 
 with read_base():
     from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
-    from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
     from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
 
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI

@@ -19,7 +18,7 @@ from opencompass.summarizers import CompassArenaSummarizer
 infer = dict(
     #partitioner=dict(type=NaivePartitioner),
-    partitioner=dict(type=SizePartitioner, max_task_size=10000),
+    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
     runner=dict(
         type=SlurmSequentialRunner,
         partition='llm_dev2',

@@ -47,12 +46,12 @@ gpt4 = dict(
     retry=20,
     temperature=1
 )
 
-models = [*chatglm3_6b_32k_model, *yi_6b_chat_model]
+models = [*chatglm3_6b_32k_model]
 
 datasets = [*subjective_datasets]
 
-work_dir = 'outputs/compass_arena/'
+work_dir = 'outputs/compass_arena_debug/'
 
 # -------------Inferen Stage ----------------------------------------

@@ -68,6 +67,7 @@ judge_model = dict(
     retry=20,
     temperature=0
 )
+
 ## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(

@@ -76,7 +76,7 @@ eval = dict(
         max_task_size=10000,
         mode='m2n',
         base_models=[gpt4],
-        compare_models=[*chatglm3_6b_32k_model, *yi_6b_chat_model,]
+        compare_models=[*chatglm3_6b_32k_model]
     ),
     runner=dict(
         type=SlurmSequentialRunner,

@@ -91,5 +91,6 @@ eval = dict(
 summarizer = dict(
-    type=CompassArenaSummarizer
+    type=CompassArenaSummarizer,
+    summary_type='half_add'
 )
\ No newline at end of file
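The last hunk passes summary_type='half_add' through the summarizer config entry. A hedged sketch of the general dict(type=..., kwargs...) pattern these configs follow; the real instantiation goes through OpenCompass's own registry machinery, and the stand-in class below is illustrative only:

# Illustrative stand-in, not the real class.
class CompassArenaSummarizer:
    def __init__(self, config=None, judge_type='general',
                 check_pos_bias=True, summary_type='single'):
        self.summary_type = summary_type

def build_from_cfg(cfg):
    cfg = dict(cfg)        # copy so the original config stays intact
    cls = cfg.pop('type')  # the class named under `type=`
    return cls(**cfg)      # remaining keys become keyword arguments

summarizer_cfg = dict(type=CompassArenaSummarizer, summary_type='half_add')
summarizer = build_from_cfg(summarizer_cfg)
print(summarizer.summary_type)  # half_add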
opencompass/summarizers/subjective/compass_arena.py (view file @ 5c6dc908)
@@ -35,7 +35,7 @@ def check_position_bias(judged_answers, references, banned_choice=['C']):
     position_bias_flag = 0
     position_bias_dict = {}
     for judge, ref in zip(judged_answers, references):
-        question = ref['others']['question']
+        question = ref['question']
         question_hash = hash(question)
         if question_hash not in position_bias_dict:
             position_bias_dict[question_hash] = {
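The fix here is the reference key: the question now lives at ref['question'] rather than ref['others']['question']. For context, a hedged sketch of what check_position_bias appears to be doing with that key; the grouping mimics the shown lines, but the flagging rule is an assumption: each question is judged twice with the answer order swapped, verdicts are grouped by hash(question), and identical verdicts across the swap suggest the judge followed position rather than content.

def count_position_bias(judged_answers, references, banned_choice=('C',)):
    # Group the verdicts for the two answer orders of the same question.
    groups = {}
    for judge, ref in zip(judged_answers, references):
        question_hash = hash(ref['question'])  # the key this commit fixes
        groups.setdefault(question_hash, []).append(judge)
    bias = 0
    for verdicts in groups.values():
        if len(verdicts) == 2 and all(v not in banned_choice for v in verdicts):
            # A consistent judge flips 'A' to 'B' when the answers are
            # swapped; the same letter twice tracks position, not content.
            if verdicts[0] == verdicts[1]:
                bias += 1
    return bias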
@@ -58,7 +58,11 @@ class CompassArenaSummarizer:
     It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict, judge_type='general') -> None:
+    def __init__(self,
+                 config: ConfigDict,
+                 judge_type='general',
+                 check_pos_bias=True,
+                 summary_type='single') -> None:
         self.tasks = []
         self.cfg = config
         self.base_models = self.cfg['eval']['partitioner']['base_models']

@@ -70,10 +74,13 @@ class CompassArenaSummarizer:
             'general': post_process_compass_arena,
         }
         self.judge_function = self.judge_map[self.judge_type]
+        self.check_pos_bias = check_pos_bias
+        self.summary_type = summary_type
 
-    def summarize(self,
-                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
-                  check_pos_bias=True):
+    def summarize(
+            self,
+            time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
+    ):
         """Summarize the subjectivity analysis based on evaluation results.
 
         Args:
@@ -88,25 +95,25 @@ class CompassArenaSummarizer:
             product(self.base_models, self.compare_models))
         unique_combinations = remove_duplicate_pairs(
             [combo for combo in model_combinations if combo[0] != combo[1]])
+        judge_model = self.judge_abbr
         fout_list = []
-        for model_pair in unique_combinations:
-            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
-                'abbr'], self.judge_abbr
-            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
-            subdir_path = os.path.join(results_folder, subdir)
-            if os.path.isdir(subdir_path):
-                for dataset in dataset_cfgs:
-                    dataset_abbr = dataset_abbr_from_cfg(dataset)
+        for dataset in dataset_cfgs:
+            dataset_abbr = dataset_abbr_from_cfg(dataset)
+            for model_pair in unique_combinations:
+                model1, model2, = model_pair[0]['abbr'], model_pair[1]['abbr'],
+                subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
+                subdir_path = os.path.join(results_folder, subdir)
+                if os.path.isdir(subdir_path):
                     fout = osp.join(
                         output_dir, 'judged-by--' + judge_model + '-' +
                         dataset_abbr + '-report.csv')
                     fout_list.append(fout)
                     judged_answers, references = get_judgeanswer_and_reference(
                         dataset,
                         subdir_path,
                         self.judge_function,
                     )
-                    if check_pos_bias:
+                    if self.check_pos_bias:
                         bias_num = check_position_bias(judged_answers,
                                                        references)
                     else:
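This hunk hoists judge_model out of the loop and swaps the nesting so datasets are iterated outermost. The pairing logic above it is unchanged; as a standalone sketch, with remove_duplicate_pairs reimplemented here from its apparent behavior (an assumption on my part):

from itertools import product

def remove_duplicate_pairs(combos):
    # Assumed behavior: keep (a, b) and drop a later (b, a).
    seen, unique = set(), []
    for a, b in combos:
        key = (a['abbr'], b['abbr'])
        if (key[1], key[0]) not in seen:
            seen.add(key)
            unique.append((a, b))
    return unique

base_models = [{'abbr': 'gpt4'}]
compare_models = [{'abbr': 'chatglm3-6b-32k'}, {'abbr': 'gpt4'}]
model_combinations = list(product(base_models, compare_models))
unique_combinations = remove_duplicate_pairs(
    [combo for combo in model_combinations if combo[0] != combo[1]])
print([(a['abbr'], b['abbr']) for a, b in unique_combinations])
# [('gpt4', 'chatglm3-6b-32k')] -- the gpt4-vs-gpt4 self-pair is filtered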
@@ -117,24 +124,47 @@ class CompassArenaSummarizer:
                         'answer2']
                 for prediction, reference in zip(judged_answers,
                                                  references):
-                    if dataset_abbr == 'qa':
-                        reference['capability'] = 'QA'
-                    categories['total'] += 1
-                    categories[reference['capability']] += 1
-                    if prediction == 'A':
-                        if reference['answer1'] == model1:
-                            win_model1[reference['capability']] += 1
-                            win_model1['total'] += 1
-                        else:
-                            win_model2[reference['capability']] += 1
-                            win_model2['total'] += 1
-                    elif prediction == 'B':
-                        if reference['answer1'] == model1:
-                            win_model2[reference['capability']] += 1
-                            win_model2['total'] += 1
-                        else:
-                            win_model1[reference['capability']] += 1
-                            win_model1['total'] += 1
+                    if self.summary_type == 'single':
+                        if prediction == 'A':
+                            categories['total'] += 1
+                            categories[reference['capability']] += 1
+                            if reference['answer1'] == model1:
+                                win_model1[reference['capability']] += 1
+                                win_model1['total'] += 1
+                            else:
+                                win_model2[reference['capability']] += 1
+                                win_model2['total'] += 1
+                        elif prediction == 'B':
+                            categories['total'] += 1
+                            categories[reference['capability']] += 1
+                            if reference['answer1'] == model1:
+                                win_model2[reference['capability']] += 1
+                                win_model2['total'] += 1
+                            else:
+                                win_model1[reference['capability']] += 1
+                                win_model1['total'] += 1
+                    elif self.summary_type == 'half_add':
+                        categories['total'] += 1
+                        categories[reference['capability']] += 1
+                        if prediction == 'A':
+                            if reference['answer1'] == model1:
+                                win_model1[reference['capability']] += 1
+                                win_model1['total'] += 1
+                            else:
+                                win_model2[reference['capability']] += 1
+                                win_model2['total'] += 1
+                        elif prediction == 'B':
+                            if reference['answer1'] == model1:
+                                win_model2[reference['capability']] += 1
+                                win_model2['total'] += 1
+                            else:
+                                win_model1[reference['capability']] += 1
+                                win_model1['total'] += 1
+                        elif prediction == 'C':
+                            win_model1[reference['capability']] += 0.5
+                            win_model1['total'] += 0.5
+                            win_model2[reference['capability']] += 0.5
+                            win_model2['total'] += 0.5
                 for capability in categories:
                     if capability not in win_model1:
                         win_model1[capability] = 0.0
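This is the core behavioral change: the new summary_type switch. Under 'single' (the old behavior, minus the removed 'qa' special case), tie verdicts ('C') are skipped entirely; under 'half_add', every verdict counts and a tie credits half a win to each side. A condensed, self-contained sketch of just the win tally; names are illustrative, and the real code also buckets by capability:

from collections import defaultdict

def tally(judgements, model1, summary_type):
    win_model1, win_model2, total = defaultdict(float), defaultdict(float), 0
    for prediction, reference in judgements:
        if summary_type == 'single' and prediction not in ('A', 'B'):
            continue  # 'single' ignores ties outright
        total += 1
        if prediction == 'C':  # only counted under 'half_add'
            win_model1['total'] += 0.5
            win_model2['total'] += 0.5
            continue
        # 'A' means the first-shown answer won; reference['answer1']
        # records which model produced it, undoing the order shuffle.
        if (prediction == 'A') == (reference['answer1'] == model1):
            win_model1['total'] += 1
        else:
            win_model2['total'] += 1
    return total, win_model1['total'], win_model2['total']

judgements = [('A', {'answer1': 'chatglm3'}),  # chatglm3 shown first, wins
              ('B', {'answer1': 'chatglm3'}),  # second answer (gpt4) wins
              ('C', {'answer1': 'gpt4'})]      # tie
print(tally(judgements, 'chatglm3', 'single'))    # (2, 1.0, 1.0)
print(tally(judgements, 'chatglm3', 'half_add'))  # (3, 1.5, 1.5)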
@@ -166,8 +196,8 @@ class CompassArenaSummarizer:
                         writer.writerow(
                             [row] +
                             [scores[row][column] for column in columns])
-            else:
-                print(subdir_path + ' is not exist! please check!')
+                else:
+                    print(subdir_path + ' is not exist! please check!')
         for fout in fout_list:
             with open(fout, 'r') as f:
                 x = from_csv(f)
opencompass/summarizers/subjective/utils.py (view file @ 5c6dc908)
@@ -64,9 +64,10 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
         if processed_judge is not None:
             judged_answers.append(processed_judge)
             references.append(v['gold'])
-    print(
-        f'Among {len(result)} judgements, successfully extracted '
-        f'{len(judged_answers)} judgements.')
+    if len(judged_answers) != len(result):
+        print(
+            f'Among {len(result)} judgements, successfully extracted '
+            f'{len(judged_answers)} judgements, please check!')
     if len(judged_answers) == 0:
         print('*' * 100)
         print(
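The unconditional success-count print becomes a warning that only fires when some judgements failed to parse. A toy demonstration of the new behavior:

def report(num_total, num_extracted):
    # Mirrors the new check: warn only on a parse-count mismatch.
    if num_extracted != num_total:
        print(f'Among {num_total} judgements, successfully extracted '
              f'{num_extracted} judgements, please check!')

report(100, 100)  # silent: every judgement was parsed
report(100, 97)   # warns that 3 judgements could not be parsed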