gaoqiong / lm-evaluation-harness / Commits

Commit f44aa85f
authored Dec 06, 2023 by baberabb

add script to check vllm, hf equiv

parent dc1c816b

Showing 1 changed file with 78 additions and 0 deletions

scripts/vllm_hf_equiv.py (new file, 0 → 100644, +78 -0)
import argparse
from typing import Dict, Tuple

import numpy as np
import scipy.stats

import lm_eval.evaluator
import lm_eval.utils

eval_logger = lm_eval.utils.eval_logger


def calculate_z_value(res1: Dict, res2: Dict, limit: int) -> Tuple[float, float]:
    acc1, acc2 = res1["acc,none"], res2["acc,none"]
    st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
    Z = (acc1 - acc2) / np.sqrt((st_err1**2 / limit) + (st_err2**2 / limit))
    # two-tailed test: p-value from the standard normal survival function
    p_value = 2 * scipy.stats.norm.sf(abs(Z))
    return Z, p_value


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pretrained",
        default="EleutherAI/pythia-70m",
        help="name of model to compare",
    )
    parser.add_argument("--hf_args", default="", help="huggingface model args <arg>=<value>")
    parser.add_argument("--vllm_args", default="", help="vllm model args <arg>=<value>")
    parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag")
    parser.add_argument("--samples", type=int, default=30)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--batch", default="auto")
    parser.add_argument("--verbosity", type=str, default="INFO", help="Logging verbosity")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    eval_logger.setLevel(args.verbosity)

    # append the extra backend args only when provided, to avoid a trailing comma
    hf_model_args = f"pretrained={args.pretrained}" + (f",{args.hf_args}" if args.hf_args else "")
    vllm_model_args = f"pretrained={args.pretrained}" + (f",{args.vllm_args}" if args.vllm_args else "")

    results_hf = lm_eval.evaluator.simple_evaluate(
        model="hf",
        model_args=hf_model_args,
        tasks=args.tasks.split(","),
        limit=args.samples,
        device=args.device,
        batch_size=args.batch,
    )
    results_vllm = lm_eval.evaluator.simple_evaluate(
        model="vllm",
        model_args=vllm_model_args,
        tasks=args.tasks.split(","),
        limit=args.samples,
        device=args.device,
        batch_size=args.batch,
    )

    all_res = {}
    for (task, res1), (task2, res2) in zip(
        results_hf["results"].items(), results_vllm["results"].items()
    ):
        assert task == task2
        z, p_value = calculate_z_value(res1, res2, args.samples)
        all_res[task] = {"z": z, "p_value": p_value}
        # the two backends should not differ significantly on any task
        assert p_value > 0.05
    eval_logger.info(all_res)
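As a rough usage sketch (the flags come straight from the parser above; the model name and the absence of extra backend args are just example choices), an invocation like

    python scripts/vllm_hf_equiv.py --pretrained EleutherAI/pythia-70m --tasks arc_easy,hellaswag --samples 30 --device cuda --batch auto

evaluates the same checkpoint with both the hf and vllm backends on a limited number of samples per task, then compares the two accuracies with a two-sample z-test: Z = (acc_hf - acc_vllm) / sqrt(se_hf^2/n + se_vllm^2/n), with two-tailed p-value p = 2 * Phi(-|Z|). The script asserts p > 0.05 for every task, i.e. that the two backends' results are statistically indistinguishable at that sample size.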