OpenDAS / opencompass

Commit e985100c (unverified)
Authored Dec 23, 2023 by bittersweet1999; committed by GitHub, Dec 23, 2023
Parent: 0e24f421

[Fix] Fix subjective alignbench (#730)

Showing 4 changed files with 129 additions and 32 deletions:

  opencompass/partitioners/sub_naive.py      +13  -12
  opencompass/summarizers/alignmentbench.py  +48  -11
  opencompass/summarizers/corev2.py          +58   -6
  opencompass/tasks/subjective_eval.py       +10   -3
opencompass/partitioners/sub_naive.py

@@ -8,6 +8,18 @@ from opencompass.registry import PARTITIONERS

 from .naive import NaivePartitioner


+def remove_duplicate_pairs(model_combinations):
+    combo_dict = {}
+    for i, combo in enumerate(model_combinations):
+        sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
+        if sorted_names not in combo_dict:
+            combo_dict[sorted_names] = i
+    new_model_combinations = [
+        model_combinations[i] for i in combo_dict.values()
+    ]
+    return new_model_combinations
+
+
 @PARTITIONERS.register_module()
 class SubjectiveNaivePartitioner(NaivePartitioner):
     """Naive task partitioner for subjective evaluation. Compared to

@@ -35,17 +47,6 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         self.compare_models = compare_models
         self.model_pairs = model_pairs

-    def remove_duplicate_pairs(self, model_combinations):
-        combo_dict = {}
-        for i, combo in enumerate(model_combinations):
-            sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr'])))
-            if sorted_names not in combo_dict:
-                combo_dict[sorted_names] = i
-        new_model_combinations = [
-            model_combinations[i] for i in combo_dict.values()
-        ]
-        return new_model_combinations
-
     def get_model_combinations(
         self,
         models: List[ConfigDict],

@@ -57,7 +58,7 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
         elif self.mode == 'm2n':
             assert len(base_models) > 0 and len(compare_models) > 0
             model_combinations = list(product(base_models, compare_models))
-            unique_combinations = self.remove_duplicate_pairs([
+            unique_combinations = remove_duplicate_pairs([
                 combo for combo in model_combinations if combo[0] != combo[1]
             ])
             return unique_combinations
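Note: remove_duplicate_pairs is hoisted from a method to a module-level function so that corev2.py (below) can import it. A minimal sketch of what it does, using made-up model configs; only the 'abbr' key is read:

# Made-up configs; in practice these come from the partitioner settings.
from itertools import product
from opencompass.partitioners.sub_naive import remove_duplicate_pairs

base_models = [{'abbr': 'llama2-7b'}, {'abbr': 'qwen-7b'}]
compare_models = [{'abbr': 'qwen-7b'}, {'abbr': 'llama2-7b'}]

combos = [c for c in product(base_models, compare_models) if c[0] != c[1]]
# combos holds both (llama2-7b, qwen-7b) and (qwen-7b, llama2-7b); the
# sorted-abbr tuple key keeps only the first orientation encountered.
unique = remove_duplicate_pairs(combos)
assert len(unique) == 1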
opencompass/summarizers/alignmentbench.py

@@ -38,9 +38,7 @@ def post_process(judgment: str):
         dictionary_str = match.group(1)
         kv_pattern = r"'(.*?)': (\d+)"
         matches = re.findall(kv_pattern, dictionary_str)
-
         result_dict = {key: int(value) for key, value in matches}
-
         return result_dict
     else:
         return None
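For context, the extraction step that survives this hunk: kv_pattern captures quoted keys and integer scores from the dict-like fragment of a judge model's reply. Run in isolation (the sample string here is made up):

import re

# Made-up judge output; the real input is whatever the judge model returns.
dictionary_str = "{'clarity': 7, 'coherence': 8, 'overall': 7}"
kv_pattern = r"'(.*?)': (\d+)"
matches = re.findall(kv_pattern, dictionary_str)
result_dict = {key: int(value) for key, value in matches}
print(result_dict)  # {'clarity': 7, 'coherence': 8, 'overall': 7}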
@@ -95,6 +93,7 @@ class AlignmentBenchSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
+        self.judge_abbr = self.cfg['judge_model']['abbr']

     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):

@@ -106,6 +105,7 @@ class AlignmentBenchSummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
         dataset_cfgs = self.cfg['datasets']
         work_dir = self.cfg['work_dir']
         self.work_dir = work_dir
+

@@ -118,19 +118,48 @@ class AlignmentBenchSummarizer:
         results_folder = osp.join(work_dir, 'results')
         fout_flag, fout_flag2 = 0, 0
-        for subdir in os.listdir(results_folder):
-            if subdir not in self.eval_model_abbrs:
-                continue
+        for eval_model_abbr in self.eval_model_abbrs:
+            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + 'dimension.csv')
-                fout2 = osp.join(output_dir, judge_model + 'capability.csv')
+                model, judge_model = eval_model_abbr, self.judge_abbr
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-dimension.csv')
+                fout2 = osp.join(output_dir,
+                                 'judged-by--' + judge_model + '-capability.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    filename = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename +
+                              ' or ' + partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
                     judged_answers = []
                     references = []
                     for k, v in result.items():
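Both summarizers now tolerate sharded result files: when <dataset_abbr>.json is absent they walk <dataset_abbr>_0.json, _1.json, ... and flatten every entry into one dict keyed by a running counter. The same fallback in isolation; the function name and paths are hypothetical:

import os.path as osp
import mmengine

def load_results(subdir_path, dataset_abbr):
    # Sketch of the fallback: try the whole file first, then numbered shards.
    filename = osp.join(subdir_path, dataset_abbr + '.json')
    if osp.exists(osp.realpath(filename)):
        return mmengine.load(filename)
    result, i, flag = {}, 1, 0
    filename = osp.join(subdir_path, dataset_abbr + '_0.json')
    while osp.exists(osp.realpath(filename)):
        for v in mmengine.load(filename).values():
            result[flag] = v  # re-key shard entries with a running counter
            flag += 1
        filename = osp.join(subdir_path, dataset_abbr + f'_{i}.json')
        i += 1
    return result  # empty dict if neither form exists; caller asserts on it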
@@ -144,8 +173,14 @@ class AlignmentBenchSummarizer:
                     print(
                         f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements.'
                     )
+                    if len(judged_answers) == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert len(judged_answers) > 0
                     # Initialize nested dicts to store models and ratings
                     dimension_ratings = defaultdict(int)
                     dimension_counts = defaultdict(int)
                     capability_ratings = defaultdict(int)

@@ -225,6 +260,8 @@ class AlignmentBenchSummarizer:
                     for sub_category in sub_categories:
                         row.append(scores[model][sub_category])
                     writer.writerow(row)
+            else:
+                print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
opencompass/summarizers/corev2.py

@@ -5,6 +5,7 @@ import os.path as osp
 import re
 from collections import defaultdict
 from datetime import datetime
+from itertools import product

 import mmengine
 from mmengine import ConfigDict

@@ -14,6 +15,7 @@ try:
 except ImportError:
     from_csv = None

+from opencompass.partitioners.sub_naive import remove_duplicate_pairs
 from opencompass.utils import dataset_abbr_from_cfg

@@ -54,6 +56,9 @@ class Corev2Summarizer:
         self.tasks = []
         self.cfg = config
         self.match_method = match_method
+        self.base_models = self.cfg['eval']['partitioner']['base_models']
+        self.compare_models = self.cfg['eval']['partitioner']['compare_models']
+        self.judge_abbr = self.cfg['judge_model']['abbr']

     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):

@@ -76,25 +81,70 @@ class Corev2Summarizer:
         mmengine.mkdir_or_exist(output_dir)
         results_folder = osp.join(work_dir, 'results')
-        for subdir in os.listdir(results_folder):
+        model_combinations = list(
+            product(self.base_models, self.compare_models))
+        unique_combinations = remove_duplicate_pairs(
+            [combo for combo in model_combinations if combo[0] != combo[1]])
+        for model_pair in unique_combinations:
+            model1, model2, judge_model = model_pair[0]['abbr'], model_pair[
+                1]['abbr'], self.judge_abbr
+            subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
-                model1, model2, judge_model = subdir.split('_')
-                fout = osp.join(output_dir, judge_model + '-report.csv')
+                fout = osp.join(output_dir,
+                                'judged-by--' + judge_model + '-report.csv')
                 for dataset in dataset_cfgs:
                     dataset_abbr = dataset_abbr_from_cfg(dataset)
-                    filepath = os.path.join(subdir_path,
-                                            dataset_abbr + '.json')
-                    result = mmengine.load(filepath)
+                    filename = os.path.join(subdir_path,
+                                            dataset_abbr + '.json')
+                    partial_filename = os.path.join(subdir_path,
+                                                    dataset_abbr + '_0.json')
+                    if osp.exists(osp.realpath(filename)):
+                        result = mmengine.load(filename)
+                    elif osp.exists(osp.realpath(partial_filename)):
+                        filename = partial_filename
+                        result = {}
+                        i = 1
+                        partial_dict_flag = 0
+                        while osp.exists(osp.realpath(filename)):
+                            res = mmengine.load(filename)
+                            for k, v in res.items():
+                                result[partial_dict_flag] = v
+                                partial_dict_flag += 1
+                            filename = os.path.join(
+                                subdir_path,
+                                dataset_abbr + '_' + str(i) + '.json')
+                            i += 1
+                    else:
+                        result = {}
+                    if len(result) == 0:
+                        print('*' * 100)
+                        print('There are no results for ' + filename +
+                              ' or ' + partial_filename)
+                        print('*' * 100)
+                    assert len(result) > 0
                     judged_answers = []
                     references = []
                     for k, v in result.items():
                         judged_answers.append(
                             call_function(self.match_method, v['prediction']))
                         references.append(v['gold'])
+                    successful_judged_answers = len(
+                        judged_answers) - judged_answers.count(None)
                     print(
-                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers) - judged_answers.count(None)} judgements.'
+                        f'Among {len(judged_answers)} judgements, successfully extracted {successful_judged_answers} judgements.'
                     )
+                    if successful_judged_answers == 0:
+                        print('*' * 100)
+                        print(
+                            'There are no extracted judgements, please change your judge model or check your prompt!!!'
+                        )
+                        print('*' * 100)
+                    assert successful_judged_answers > 0
                     win_both_model1, win_both_model2, half_draw_model1, half_draw_model2, categories = defaultdict(
                         float), defaultdict(float), defaultdict(float), defaultdict(float), defaultdict(float)
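Same idea as in alignmentbench.py: instead of walking results/ with os.listdir and parsing directory names, the summarizer rebuilds each expected directory name from the config it already holds. A sketch of the naming scheme with hypothetical abbrs:

# Hypothetical abbrs; real values come from cfg['eval']['partitioner']
# and cfg['judge_model'] as read in __init__ above.
model_pair = ({'abbr': 'chatglm3-6b'}, {'abbr': 'qwen-7b-chat'})
judge_abbr = 'gpt-4'

model1, model2 = model_pair[0]['abbr'], model_pair[1]['abbr']
subdir = model1 + '_' + model2 + '_judged-by--' + judge_abbr
print(subdir)  # chatglm3-6b_qwen-7b-chat_judged-by--gpt-4

# The old code recovered these fields by parsing the directory name:
#     model1, model2, judge_model = subdir.split('_')
# which breaks as soon as any abbr itself contains an underscore.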
@@ -168,6 +218,8 @@ class Corev2Summarizer:
                     writer.writerow(
                         [row] +
                         [scores[row][column] for column in columns])
+            else:
+                print(subdir_path + ' is not exist! please check!')
         with open(fout, 'r') as f:
             x = from_csv(f)
         print(x)
opencompass/tasks/subjective_eval.py

@@ -96,8 +96,11 @@ class SubjectiveEvalTask(BaseTask):
         root, ext = osp.splitext(filename)
         partial_filename = root + '_0' + ext
         pred_strs = None
-        if osp.exists(osp.realpath(filename)) or osp.exists(
+        if not osp.exists(osp.realpath(filename)) and not osp.exists(
                 osp.realpath(partial_filename)):
+            return {'error': 'No predictions found.'}
+        else:
             if osp.exists(osp.realpath(filename)):
                 preds = mmengine.load(filename)
                 pred_strs = [

@@ -172,8 +175,12 @@ class SubjectiveEvalTask(BaseTask):
             eval_cfg['evaluator']['output_path'] = out_path
             icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
             references = (test_set[output_column] if output_column else None)
-            result = icl_evaluator.score(predictions=model_preds,
-                                         references=references)
+            if 'error' not in model_preds:
+                result = icl_evaluator.score(predictions=model_preds,
+                                             references=references)
+            else:
+                result = model_preds

             if 'error' in result:
                 self.logger.error(
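The two hunks form one contract: the prediction-loading path now returns an {'error': ...} marker instead of proceeding with missing files, and the scoring step forwards that marker so the existing 'error' logging branch handles it. The shape of the pattern, self-contained; the function name and stand-in values are hypothetical:

import os.path as osp

def load_preds(filename, partial_filename):
    # Sketch of the inverted guard: report missing files via a marker dict
    # instead of raising, so the caller decides how to react.
    if (not osp.exists(osp.realpath(filename))
            and not osp.exists(osp.realpath(partial_filename))):
        return {'error': 'No predictions found.'}
    return {'0': {'prediction': '...'}}  # stand-in for the real mmengine.load

model_preds = load_preds('missing.json', 'missing_0.json')

if 'error' not in model_preds:
    result = {'score': 1.0}  # stand-in for icl_evaluator.score(...)
else:
    result = model_preds     # pass the marker through unchanged

if 'error' in result:
    print('Task skipped:', result['error'])  # the logging branch above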