OpenDAS / opencompass · Commits

Commit 8a8987be (unverified)
Authored May 15, 2024 by bittersweet1999
Committed by GitHub on May 15, 2024

fix arenahard summarizer (#1154)

Co-authored-by: Leymore <zfz-960727@163.com>

Parent: 62dbf047
Showing 1 changed file with 47 additions and 49 deletions.

opencompass/summarizers/subjective/arenahard.py (+47, -49)
@@ -50,8 +50,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
     tie_idx[len(tie_idx) // 2:] = False
     Y[tie_idx] = 1.0
-    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
+    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)  # May need to set a small value when not use GPT4 as judge model
     lr.fit(X, Y)
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
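For readers unfamiliar with this function: compute_mle_elo fits a Bradley-Terry-style model with scikit-learn's LogisticRegression and rescales the coefficients to Elo-like scores (SCALE * lr.coef_[0] + INIT_RATING, as in the hunk above). Below is a minimal, self-contained sketch of that fit on toy data. The helper name fit_mle_elo and the toy battles frame are illustrative, not opencompass code, and the design-matrix construction (+log(BASE) for model_a, -log(BASE) for model_b) is an assumption about the standard Arena-Hard setup rather than something shown in this diff.

# Illustrative sketch only (not opencompass code): the Bradley-Terry MLE fit
# behind compute_mle_elo. Assumed setup: one row per battle, +log(BASE) in the
# model_a column, -log(BASE) in the model_b column, label 1 if model_a won.
import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def fit_mle_elo(battles, SCALE=400, BASE=10, INIT_RATING=1000):
    names = pd.unique(battles[['model_a', 'model_b']].values.ravel())
    idx = {name: i for i, name in enumerate(names)}
    n = len(battles)
    X = np.zeros((n, len(names)))
    X[np.arange(n), battles['model_a'].map(idx).to_numpy()] = +math.log(BASE)
    X[np.arange(n), battles['model_b'].map(idx).to_numpy()] = -math.log(BASE)
    Y = (battles['winner'] == 'model_a').astype(float).to_numpy()
    # penalty=None mirrors the changed line above (requires scikit-learn >= 1.2)
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(X, Y)
    return pd.Series(SCALE * lr.coef_[0] + INIT_RATING, index=names)


# Toy data: model_x beats model_y twice and loses once.
toy = pd.DataFrame({'model_a': ['model_x'] * 3,
                    'model_b': ['model_y'] * 3,
                    'winner': ['model_a', 'model_a', 'model_b']})
print(fit_mle_elo(toy))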
@@ -134,10 +133,8 @@ def get_win_rate_column(df, column, baseline='gpt4-0314'):
     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))


-def get_battles_from_judgment(dataset, subdir_path, post_process, first_game_only=False, WEIGHT=3):
+def get_battles_from_judgment(dataset, subdir_path, post_process, WEIGHT=3):
     arena_hard_battles = pd.DataFrame()
     print('Turning judgment results into battles...')
     dataset_abbr = dataset_abbr_from_cfg(dataset)
     filename = osp.join(subdir_path, dataset_abbr + '.json')
     partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
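The hunk above drops the unused first_game_only parameter from get_battles_from_judgment; the surrounding context also shows get_win_rate_column turning a rating column into a win rate against the gpt4-0314 baseline (fillna(0.5), scaled to a percentage). The predict_win_rate helper it relies on is not shown in this diff, so the sketch below only illustrates the standard Elo expectation formula that such a conversion is based on; the function name expected_win_rate is ours.

# Illustrative only: expected win rate of a model against a baseline under the
# standard Elo/Bradley-Terry formula (SCALE and BASE as in compute_mle_elo).
def expected_win_rate(rating, baseline_rating, SCALE=400, BASE=10):
    return 1 / (1 + BASE ** ((baseline_rating - rating) / SCALE))


# A model rated 1100 against a 1000-rated baseline wins about 64% of the time:
print(round(expected_win_rate(1100, 1000) * 100, 2))  # 64.01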
@@ -197,7 +194,6 @@ def get_battles_from_judgment(dataset, subdir_path, post_process, first_game_onl
             if weight:
                 arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])

-    arena_hard_battles.to_json(os.path.join(subdir_path, 'arena_hard_battles.jsonl'), lines=True, orient='records')
     return arena_hard_battles


 class ArenaHardSummarizer:
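The removed line stops get_battles_from_judgment from writing a per-subdirectory arena_hard_battles.jsonl; with this commit the battle log is written once per judge model by the summarizer instead (see the last hunk). The weighting pattern in the retained context is worth spelling out: a judged battle, stored as a plain dict, is repeated `weight` times before being appended, so it contributes that many rows to the log. A small, self-contained illustration (the field names are made up):

# Illustrative only: one judged battle repeated `weight` times before append.
import pandas as pd

output = {'model_a': 'model_x', 'model_b': 'model_y', 'winner': 'model_a'}
weight = 3  # WEIGHT=3 is the default in get_battles_from_judgment

weighted_rows = pd.DataFrame([output] * weight)
print(len(weighted_rows))  # 3 identical rows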
@@ -240,6 +236,8 @@ class ArenaHardSummarizer:
         judge_model = model_abbr_from_cfg(judge_model_cfg)
         for dataset in self.cfg['datasets']:
             dataset_abbr = dataset_abbr_from_cfg(dataset)
+            battles = pd.DataFrame()
+            print('Turning judgment results into battles...')
             for model_pair in unique_combinations:
                 model1 = model_pair[0]['abbr']
                 model2 = model_pair[1]['abbr']
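These two added lines appear to be the heart of the fix: `battles` is now initialized once per dataset, before the loop over model pairs, so judgments from every pair can be pooled into a single DataFrame and the Elo model fitted once on the pooled log (the next hunk does the pooling and the fit). A minimal sketch of that accumulation pattern, with illustrative per-pair frames:

# Illustrative only: pool per-pair battle frames into one log before fitting.
import pandas as pd

per_pair = [
    pd.DataFrame([{'model_a': 'model_x', 'model_b': 'gpt4-0314', 'winner': 'model_b'}]),
    pd.DataFrame([{'model_a': 'model_y', 'model_b': 'gpt4-0314', 'winner': 'model_a'}]),
]

battles = pd.DataFrame()
for new_battle in per_pair:
    battles = pd.concat([battles, new_battle], ignore_index=True)
print(len(battles))  # 2 pooled battles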
@@ -252,49 +250,49 @@ class ArenaHardSummarizer:
                     print(subdir_path + ' is not exist! please check!')
                     continue
-                battles = get_battles_from_judgment(dataset, subdir_path, self.judge_function)
-                bootstrap_online_elo = compute_mle_elo(battles)
-
-                np.random.seed(42)
-                bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
-                bootstrap_elo_lu.to_json(os.path.join(subdir_path, 'bootstrapping_results.jsonl'), lines=True, orient='records')
-
-                stats = pd.DataFrame()
-                stats['results'] = None
-                stats['results'] = stats['results'].astype('object')
-
-                for i, model in enumerate(bootstrap_online_elo.index):
-                    assert model in bootstrap_elo_lu.columns
-                    stats.at[i, 'model'] = model
-                    stats.at[i, 'score'] = bootstrap_online_elo[model]
-                    stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
-                    stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
-                    if model == 'gpt4-0314':
-                        stats.at[i, 'avg_tokens'] = 423
-                    else:
-                        with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr + '.json'), 'r') as f:
-                            model_preds = json.load(f)
-                        pred_length = 0
-                        for k, v in model_preds.items():
-                            pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction']))
-                        pred_length /= len(model_preds)
-                        stats.at[i, 'avg_tokens'] = pred_length
-                    stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
-                stats.sort_values(by='model', inplace=True)
-                stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
-                stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist()
-                stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist()
-                decimal = 1
-                stats.sort_values(by='score', ascending=False, inplace=True)
-                for _, row in stats.iterrows():
-                    interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
-                    print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
-                stats.to_json(os.path.join(output_dir, 'arena_hard_leaderboard.json'), orient='records', indent=4)
+                new_battle = get_battles_from_judgment(dataset, subdir_path, self.judge_function)
+                battles = pd.concat([battles, new_battle], ignore_index=True)
+            battles.to_json(os.path.join(output_dir, 'arena_hard_battles_judged-by--' + judge_model + '.jsonl'), lines=True, orient='records')
+            bootstrap_online_elo = compute_mle_elo(battles)
+
+            np.random.seed(42)
+            bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
+            bootstrap_elo_lu.to_json(os.path.join(output_dir, 'bootstrapping_results' + judge_model + '.jsonl'), lines=True, orient='records')
+
+            stats = pd.DataFrame()
+            stats['results'] = None
+            stats['results'] = stats['results'].astype('object')
+
+            for i, model in enumerate(bootstrap_online_elo.index):
+                assert model in bootstrap_elo_lu.columns
+                stats.at[i, 'model'] = model
+                stats.at[i, 'score'] = bootstrap_online_elo[model]
+                stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
+                stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
+                if model == 'gpt4-0314':
+                    stats.at[i, 'avg_tokens'] = 423
+                else:
+                    with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr + '.json'), 'r') as f:
+                        model_preds = json.load(f)
+                    pred_length = 0
+                    for k, v in model_preds.items():
+                        pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction']))
+                    pred_length /= len(model_preds)
+                    stats.at[i, 'avg_tokens'] = pred_length
+                stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
+            stats.sort_values(by='model', inplace=True)
+            stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
+            stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist()
+            stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist()
+            decimal = 1
+            stats.sort_values(by='score', ascending=False, inplace=True)
+            for _, row in stats.iterrows():
+                interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
+                print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
+            stats.to_json(os.path.join(output_dir, 'arena_hard_leaderboard_judged-by--' + judge_model + '.json'), orient='records', indent=4)
+            stats.to_csv(os.path.join(output_dir, 'arena_hard_leaderboard_judged-by--' + judge_model + '.csv'))

     def summarize(self,
...
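The rewritten block above pools the battles, fits the Elo model once per dataset and judge, and reports a bootstrapped 95% confidence interval per model (np.percentile at 2.5 and 97.5 over 100 get_bootstrap_result rounds) alongside an average prediction length in GPT-3.5-turbo tokens. The internals of get_bootstrap_result are not part of this diff; the resample-with-replacement loop below is an assumption about what it does, and bootstrap_ratings is our own name, but the percentile step mirrors the added code.

# Illustrative sketch only: bootstrap the rating fit by resampling battles
# with replacement, then take the 2.5th/97.5th percentiles per model as the
# 95% CI, as in the np.percentile calls in the hunk above.
import numpy as np
import pandas as pd


def bootstrap_ratings(battles, fit_fn, num_rounds=100, seed=42):
    rng = np.random.default_rng(seed)
    rounds = []
    for _ in range(num_rounds):
        resampled = battles.sample(frac=1.0, replace=True,
                                   random_state=int(rng.integers(1 << 31)))
        rounds.append(fit_fn(resampled))  # one pd.Series of ratings per round
    return pd.DataFrame(rounds)  # rows = bootstrap rounds, columns = models


# Usage with the fit_mle_elo sketch shown earlier:
# bootstrap_elo_lu = bootstrap_ratings(battles, fit_mle_elo, 100)
# lower = np.percentile(bootstrap_elo_lu['model_x'], 2.5)
# upper = np.percentile(bootstrap_elo_lu['model_x'], 97.5)

The avg_tokens column is simply the mean prediction length; a standalone equivalent of the token-counting loop in the hunk (the predictions-file layout follows the diff, the sample dict is made up):

# Illustrative only: average prediction length in GPT-3.5-turbo tokens.
import tiktoken

enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
model_preds = {'q1': {'prediction': 'Paris is the capital of France.'},
               'q2': {'prediction': 'The answer is 42.'}}
pred_length = sum(len(enc.encode(v['prediction'])) for v in model_preds.values())
pred_length /= len(model_preds)
print(pred_length)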