gaoqiong / lm-evaluation-harness

Commit 8b74beaa, authored Dec 06, 2023 by baberabb
Parent: b99ad796

    fix z-score and print; rename script
Showing 1 changed file with 59 additions and 19 deletions.

scripts/model_comparator.py (+59, -19)
scripts/vllm_hf_equiv.py → scripts/model_comparator.py
@@ -3,20 +3,52 @@ import numpy as np
 import lm_eval.evaluator
 from lm_eval import tasks
 import scipy.stats
-from typing import Tuple, Dict
+from typing import Tuple, Dict, List
+import pandas as pd
+import torch
+import os
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 eval_logger = lm_eval.utils.eval_logger


-def calculate_z_value(res1: Dict, res2: Dict, limit: int) -> Tuple[float, float]:
+def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
     acc1, acc2 = res1["acc,none"], res2["acc,none"]
-    st_err1, st_err2 = res1["acc_stderr"], res2["acc_stderr"]
-    Z = (acc1 - acc2) / np.sqrt((st_err1**2 / limit) + (st_err2**2 / limit))
+    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
+    Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
     # Determining the p-value
     p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
     return Z, p_value


+def print_results(data_to_print: List = None, results_dict: Dict = None, alpha: float = None):
+    model1_data = data_to_print[0]
+    model2_data = data_to_print[1]
+    table_data = []
+    for task in model1_data.keys():
+        row = {
+            "Task": task,
+            "HF Accuracy": model1_data[task]["acc,none"],
+            "vLLM Accuracy": model2_data[task]["acc,none"],
+            "HF StdErr": model1_data[task]["acc_stderr,none"],
+            "vLLM StdErr": model2_data[task]["acc_stderr,none"],
+        }
+        table_data.append(row)
+    comparison_df = pd.DataFrame(table_data)
+    comparison_df["Z-Score"] = comparison_df["Task"].apply(lambda task: results_dict[task]["z"])
+    comparison_df["P-Value"] = comparison_df["Task"].apply(lambda task: results_dict[task]["p_value"])
+    comparison_df[f"p > {alpha}"] = comparison_df["P-Value"].apply(lambda p: "✓" if p > alpha else "×")
+    return comparison_df
+
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
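The corrected calculate_z_value drops the division by limit: the harness's "acc_stderr,none" values are already standard errors, so the accuracy difference only needs sqrt(st_err1**2 + st_err2**2) in the denominator. A minimal standalone sketch with hypothetical per-task numbers, showing how the Z statistic and two-tailed p-value come out of the updated formula:

import numpy as np
import scipy.stats

# Hypothetical per-task results in the harness's "acc,none" / "acc_stderr,none" format
res_hf = {"acc,none": 0.712, "acc_stderr,none": 0.014}
res_vllm = {"acc,none": 0.705, "acc_stderr,none": 0.015}

acc1, acc2 = res_hf["acc,none"], res_vllm["acc,none"]
se1, se2 = res_hf["acc_stderr,none"], res_vllm["acc_stderr,none"]

# Two-sample z-test on the accuracy difference; the stderrs already encode sample size
z = (acc1 - acc2) / np.sqrt(se1**2 + se2**2)
p_value = 2 * scipy.stats.norm.sf(abs(z))  # two-tailed survival function
print(f"Z={z:.3f}, p={p_value:.3f}")  # a large p suggests no significant HF/vLLM gap at this limit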
@@ -28,9 +60,15 @@ def parse_args():
     parser.add_argument("--vllm_args", help="vllm model args <arg>=<value>", default="")
     parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
     parser.add_argument(
-        "--samples",
+        "--limit",
         type=int,
-        default=30,
+        default=100,
     )
+    parser.add_argument(
+        "--alpha",
+        type=float,
+        default=0.05,
+        help="Significance level for two-tailed z-test",
+    )
     parser.add_argument(
         "--device",
@@ -56,21 +94,21 @@ if __name__ == "__main__":
     args = parse_args()
     tasks = args.tasks.split(",")
     print(tasks)
-    hf_args = "," + args.hf_args
-    vllm_args = "," + args.vllm_args
-    results_vllm = lm_eval.evaluator.simple_evaluate(
-        model="vllm",
-        model_args=f"pretrained={args.pretrained}" + vllm_args,
+    hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args
+    results_hf = lm_eval.evaluator.simple_evaluate(
+        model="hf",
+        model_args=f"pretrained={args.pretrained}" + hf_args,
         tasks=tasks,
-        limit=args.samples,
+        limit=args.limit,
         device=args.device,
         batch_size=args.batch,
     )
-    results_hf = lm_eval.evaluator.simple_evaluate(
-        model="hf",
-        model_args=f"pretrained={args.pretrained}" + hf_args,
+    torch.cuda.empty_cache()
+    results_vllm = lm_eval.evaluator.simple_evaluate(
+        model="vllm",
+        model_args=f"pretrained={args.pretrained}" + vllm_args,
         tasks=tasks,
-        limit=args.samples,
+        limit=args.limit,
         device=args.device,
         batch_size=args.batch,
     )
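The reordered main block above runs the HF backend first and then calls torch.cuda.empty_cache() so cached HF allocations are released before vLLM reserves its GPU memory pool. A minimal sketch of that pattern, with a hypothetical run_backend helper standing in for the simple_evaluate calls:

import torch

def run_backend(name: str) -> dict:
    # Hypothetical stand-in for lm_eval.evaluator.simple_evaluate(model=name, ...)
    return {"results": {}}

results_hf = run_backend("hf")
if torch.cuda.is_available():
    torch.cuda.empty_cache()  # release cached allocations from the HF run before vLLM starts
results_vllm = run_backend("vllm")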
@@ -79,7 +117,9 @@ if __name__ == "__main__":
         results_hf["results"].items(), results_vllm["results"].items()
     ):
         assert task1[0] == task2[0]
-        z, p_value = calculate_z_value(task1[1], task2[1], args.samples)
+        z, p_value = calculate_z_value(task1[1], task2[1])
         all_res[task1[0]] = {"z": z, "p_value": p_value}
-        assert p_value > 0.05
-    eval_logger.info(all_res)
+    df = print_results(
+        [results_hf["results"], results_vllm["results"]], all_res, args.alpha
+    )
+    print(df)
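A hypothetical invocation of the renamed script: --tasks, --limit, --alpha, and --device appear in the hunks above, while --pretrained and --batch are only inferred from the args.pretrained and args.batch attributes used in the elided parts of parse_args and may be named differently; the model name is an example.

python scripts/model_comparator.py \
    --pretrained EleutherAI/pythia-160m \
    --tasks arc_easy,hellaswag \
    --limit 100 \
    --alpha 0.05 \
    --device cuda:0 \
    --batch 8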