OpenDAS / opencompass · Commits · e985100c

Commit e985100c (unverified), authored Dec 23, 2023 by bittersweet1999, committed by GitHub on Dec 23, 2023

[Fix] Fix subjective alignbench (#730)

parent 0e24f421

Showing 4 changed files with 129 additions and 32 deletions (+129, -32):
opencompass/partitioners/sub_naive.py      +13  -12
opencompass/summarizers/alignmentbench.py  +48  -11
opencompass/summarizers/corev2.py          +58   -6
opencompass/tasks/subjective_eval.py       +10   -3

opencompass/partitioners/sub_naive.py
@@ -8,6 +8,18 @@ from opencompass.registry import PARTITIONERS
 from .naive import NaivePartitioner
 
 
+def remove_duplicate_pairs(model_combinations):
+    combo_dict = {}
+    for i, combo in enumerate(model_combinations):
+        sorted_names = tuple(
+            sorted((combo[0]['abbr'], combo[1]['abbr'])))
+        if sorted_names not in combo_dict:
+            combo_dict[sorted_names] = i
+    new_model_combinations = [
+        model_combinations[i] for i in combo_dict.values()
+    ]
+    return new_model_combinations
+
+
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to
@@ -35,17 +47,6 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         self.compare_models = compare_models
         self.model_pairs = model_pairs
 
-    def remove_duplicate_pairs(self, model_combinations):
-        combo_dict = {}
-        for i, combo in enumerate(model_combinations):
-            sorted_names = tuple(
-                sorted((combo[0]['abbr'], combo[1]['abbr'])))
-            if sorted_names not in combo_dict:
-                combo_dict[sorted_names] = i
-        new_model_combinations = [
-            model_combinations[i] for i in combo_dict.values()
-        ]
-        return new_model_combinations
-
     def get_model_combinations(
             self,
             models: List[ConfigDict],
@@ -57,7 +58,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         elif self.mode == 'm2n':
             assert len(base_models) > 0 and len(compare_models) > 0
             model_combinations = list(product(base_models, compare_models))
-            unique_combinations = self.remove_duplicate_pairs([
+            unique_combinations = remove_duplicate_pairs([
                 combo for combo in model_combinations if combo[0] != combo[1]
             ])
             return unique_combinations
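
This hunk promotes remove_duplicate_pairs from a method of SubjectiveNaivePartitioner to a module-level function so the summarizers can reuse it. A minimal usage sketch (the model abbreviations below are invented for illustration) of how it collapses ordered pairs that differ only in direction:

    from itertools import product

    from opencompass.partitioners.sub_naive import remove_duplicate_pairs

    models = [{'abbr': 'model-a'}, {'abbr': 'model-b'}]
    # All ordered pairs except self-comparisons: (a, b) and (b, a).
    combos = [c for c in product(models, models) if c[0] != c[1]]
    unique = remove_duplicate_pairs(combos)
    print([(a['abbr'], b['abbr']) for a, b in unique])
    # [('model-a', 'model-b')] -- ('model-b', 'model-a') is dropped as a duplicate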

opencompass/summarizers/alignmentbench.py
@@ -38,9 +38,7 @@ def post_process(judgment: str):
         dictionary_str = match.group(1)
         kv_pattern = r"'(.*?)': (\d+)"
         matches = re.findall(kv_pattern, dictionary_str)
         result_dict = {key: int(value) for key, value in matches}
         return result_dict
     else:
         return None
@@ -95,6 +93,7 @@ class AlignmentBenchSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
+        self.judge_abbr = self.cfg['judge_model']['abbr']
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -106,6 +105,7 @@ class AlignmentBenchSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
         dataset_cfgs = self.cfg['datasets']
         work_dir = self.cfg['work_dir']
         self.work_dir = work_dir
@@ -118,19 +118,48 @@ class AlignmentBenchSummarizer:
         results_folder = osp.join(work_dir, 'results')
         fout_flag, fout_flag2 = 0, 0
-        for subdir in os.listdir(results_folder):
-            if subdir not in self.eval_model_abbrs:
-                continue
+        for eval_model_abbr in self.eval_model_abbrs:
+            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + 'dimension.csv')
-                fout2 = osp.join(output_dir, judge_model + 'capability.csv')
+                model, judge_model = eval_model_abbr, self.judge_abbr
+                fout = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_model + '-dimension.csv')
+                fout2 = osp.join(
+                    output_dir,
+                    'judged-by--' + judge_model + '-capability.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    filename = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename +
+                              ' or ' + partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
+
                     judged_answers = []
                     references = []
                     for k, v in result.items():
@@ -144,8 +173,14 @@ class AlignmentBenchSummarizer:
                     print(
                         f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
                     )
+                    if len(judged_answers) == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert len(judged_answers) > 0
 
                     # Initialize a nested dict to store models and ratings
                     dimension_ratings = defaultdict(int)
                     dimension_counts = defaultdict(int)
                     capability_ratings = defaultdict(int)
@@ -225,6 +260,8 @@ class AlignmentBenchSummarizer:
                         for sub_category in sub_categories:
                             row.append(scores[model][sub_category])
                         writer.writerow(row)
+            else:
+                print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
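
The summarizer now falls back to sharded prediction files when a single result JSON is missing. A self-contained sketch of that loading pattern (the helper name load_judge_results and the file layout are assumptions, not part of the commit):

    import os.path as osp

    import mmengine


    def load_judge_results(subdir_path, dataset_abbr):
        # Prefer the single '<abbr>.json'; otherwise merge
        # '<abbr>_0.json', '<abbr>_1.json', ... into one dict.
        filename = osp.join(subdir_path, dataset_abbr + '.json')
        if osp.exists(osp.realpath(filename)):
            return mmengine.load(filename)
        result, idx = {}, 0
        shard = osp.join(subdir_path, dataset_abbr + '_0.json')
        i = 1
        while osp.exists(osp.realpath(shard)):
            for v in mmengine.load(shard).values():
                result[idx] = v
                idx += 1
            shard = osp.join(subdir_path, dataset_abbr + '_' + str(i) + '.json')
            i += 1
        return result  # an empty dict means no results were found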

opencompass/summarizers/corev2.py
@@ -5,6 +5,7 @@ import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
+from itertools import product
 
 import mmengine
 from mmengine import ConfigDict
@@ -14,6 +15,7 @@ try:
 except ImportError:
     from_csv = None
 
+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg
@@ -54,6 +56,9 @@ class Corev2Summarizer:
         self.tasks = []
         self.cfg = config
         self.match_method = match_method
+        self.base_models = self.cfg['eval']['partitioner']['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.judge_abbr = self.cfg['judge_model']['abbr']
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -76,25 +81,70 @@ class Corev2Summarizer:
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        for subdir in os.listdir(results_folder):
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+        for model_pair in unique_combinations:
+            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
+                'abbr'], self.judge_abbr
+            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model1, model2, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + '-report.csv')
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    filename = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename +
+                              ' or ' + partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
+
                     judged_answers = []
                     references = []
                     for k, v in result.items():
                         judged_answers.append(
                             call_function(self.match_method, v['prediction']))
                         references.append(v['gold'])
+                    successful_judged_answers = len(
+                        judged_answers) - judged_answers.count(None)
                     print(
-                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers) - judged_answers.count(None)} judgements.'
+                        f'Among {len(judged_answers)} judgements, successfully extracted {successful_judged_answers} judgements.'
                     )
+                    if successful_judged_answers == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert successful_judged_answers > 0
                     win_both_model1, win_both_model2, half_draw_model1, half_draw_model2, categories = defaultdict(
                         float), defaultdict(float), defaultdict(
                             float), defaultdict(float), defaultdict(float)
@@ -168,6 +218,8 @@ class Corev2Summarizer:
                     writer.writerow(
                         [row] + [scores[row][column] for column in columns])
+            else:
+                print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
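
Corev2Summarizer no longer parses subdirectory names out of os.listdir; it reconstructs the expected names from the partitioner config. A hedged illustration (the config values below are invented) of the naming scheme used in the hunk above:

    from itertools import product

    from opencompass.partitioners.sub_naive import remove_duplicate_pairs

    base_models = [{'abbr': 'base-model'}]
    compare_models = [{'abbr': 'model-a'}, {'abbr': 'model-b'}]
    judge_abbr = 'judge-model'

    pairs = remove_duplicate_pairs(
        [c for c in product(base_models, compare_models) if c[0] != c[1]])
    subdirs = [
        p[0]['abbr'] + '_' + p[1]['abbr'] + '_judged-by--' + judge_abbr
        for p in pairs
    ]
    print(subdirs)
    # ['base-model_model-a_judged-by--judge-model',
    #  'base-model_model-b_judged-by--judge-model']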

opencompass/tasks/subjective_eval.py
@@ -96,8 +96,11 @@ class SubjectiveEvalTask(BaseTask):
         root, ext = osp.splitext(filename)
         partial_filename = root + '_0' + ext
+        pred_strs = None
-        if osp.exists(osp.realpath(filename)) or osp.exists(
-                osp.realpath(partial_filename)):
+        if not osp.exists(osp.realpath(filename)) and not osp.exists(
+                osp.realpath(partial_filename)):
+            return {'error': 'No predictions found.'}
+        else:
             if osp.exists(osp.realpath(filename)):
                 preds = mmengine.load(filename)
                 pred_strs = [
@@ -172,8 +175,12 @@ class SubjectiveEvalTask(BaseTask):
         eval_cfg['evaluator']['output_path'] = out_path
         icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
         references = (test_set[output_column] if output_column else None)
-        result = icl_evaluator.score(predictions=model_preds,
-                                     references=references)
+
+        if 'error' not in model_preds:
+            result = icl_evaluator.score(predictions=model_preds,
+                                         references=references)
+        else:
+            result = model_preds
 
         if 'error' in result:
             self.logger.error(
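
The task now propagates a missing-predictions condition as a result dict instead of scoring empty data. A minimal sketch of the pattern (the function names below are hypothetical, not the committed API):

    def load_predictions(path_exists):
        # Mirrors the new behaviour: return an error marker instead of raising.
        if not path_exists:
            return {'error': 'No predictions found.'}
        return ['prediction 1', 'prediction 2']  # placeholder predictions


    def score(model_preds, icl_evaluator, references=None):
        # Skip scoring and pass the error marker through so the caller can log it.
        if 'error' not in model_preds:
            return icl_evaluator.score(predictions=model_preds,
                                       references=references)
        return model_preds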