Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
950b7118
Unverified
Commit
950b7118
authored
May 06, 2025
by
Michael Goin
Committed by
GitHub
May 06, 2025
Browse files
Replace lm-eval bash script with pytest and use enforce_eager for faster CI (#17717)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
e50a1f1a
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
52 additions
and
91 deletions
+52
-91
.buildkite/lm-eval-harness/conftest.py
.buildkite/lm-eval-harness/conftest.py
+39
-0
.buildkite/lm-eval-harness/run-tests.sh
.buildkite/lm-eval-harness/run-tests.sh
+0
-59
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+11
-30
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-2
No files found.
.buildkite/lm-eval-harness/conftest.py
0 → 100644
View file @
950b7118
# SPDX-License-Identifier: Apache-2.0
from
pathlib
import
Path
import
pytest
def pytest_addoption(parser):
    """Register the command-line options used by the lm-eval harness tests.

    Adds ``--config-list-file`` (no default) and ``--tp-size`` (default
    ``"1"``, stored as a string because no ``type`` is given).
    """
    option_specs = (
        (
            "--config-list-file",
            {
                "action": "store",
                "help": "Path to the file listing model config YAMLs (one per line)",
            },
        ),
        (
            "--tp-size",
            {
                "action": "store",
                "default": "1",
                "help": "Tensor parallel size to use for evaluation",
            },
        ),
    )
    # Register in declaration order so ``--help`` output keeps the same order.
    for flag, kwargs in option_specs:
        parser.addoption(flag, **kwargs)
@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    """Return the ``--config-list-file`` option joined onto ``config_dir``.

    NOTE(review): no ``config_dir`` fixture is defined in this file; confirm
    one is provided elsewhere, otherwise requesting this fixture will fail.
    """
    return config_dir / pytestconfig.getoption("--config-list-file")
@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    """Expose the value of the ``--tp-size`` command-line option to tests."""
    tensor_parallel = pytestconfig.getoption("--tp-size")
    return tensor_parallel
def pytest_generate_tests(metafunc):
    """Parametrize ``config_filename`` from the ``--config-list-file`` listing.

    For any test that requests the ``config_filename`` fixture, read the list
    file named by ``--config-list-file`` and parametrize the test with one
    path per entry. Entries are resolved relative to the directory that
    contains the list file. Blank lines and lines whose first non-blank
    character is ``#`` are ignored.

    Raises:
        pytest.UsageError: if ``--config-list-file`` was not supplied.
    """
    if "config_filename" not in metafunc.fixturenames:
        return

    rel_path = metafunc.config.getoption("--config-list-file")
    if not rel_path:
        # The option has no default; fail with an actionable message instead
        # of the TypeError that Path(None) would raise.
        raise pytest.UsageError(
            "--config-list-file is required by tests that use the "
            "'config_filename' fixture")

    list_path = Path(rel_path).resolve()
    config_dir = list_path.parent
    with open(list_path, encoding="utf-8") as f:
        # Strip once so indented comments and padded entries are handled the
        # same way as unindented ones.
        entries = [line.strip() for line in f]

    configs = [
        config_dir / entry for entry in entries
        if entry and not entry.startswith("#")
    ]
    metafunc.parametrize("config_filename", configs)
.buildkite/lm-eval-harness/run-tests.sh
deleted
100644 → 0
View file @
e50a1f1a
#!/bin/bash
usage
()
{
echo
``
echo
"Runs lm eval harness on GSM8k using vllm and compares to "
echo
"precomputed baseline (measured by HF transformers.)"
echo
echo
"usage:
${
0
}
<options>"
echo
echo
" -c - path to the test data config (e.g. configs/small-models.txt)"
echo
" -t - tensor parallel size"
echo
}
SUCCESS
=
0
while
getopts
"c:t:"
OPT
;
do
case
${
OPT
}
in
c
)
CONFIG
=
"
$OPTARG
"
;;
t
)
TP_SIZE
=
"
$OPTARG
"
;;
\?
)
usage
exit
1
;;
esac
done
# Parse list of configs.
IFS
=
$'
\n
'
read
-d
''
-r
-a
MODEL_CONFIGS <
"
$CONFIG
"
for
MODEL_CONFIG
in
"
${
MODEL_CONFIGS
[@]
}
"
do
LOCAL_SUCCESS
=
0
echo
"=== RUNNING MODEL:
$MODEL_CONFIG
WITH TP SIZE:
$TP_SIZE
==="
export
LM_EVAL_TEST_DATA_FILE
=
$PWD
/configs/
${
MODEL_CONFIG
}
export
LM_EVAL_TP_SIZE
=
$TP_SIZE
pytest
-s
test_lm_eval_correctness.py
||
LOCAL_SUCCESS
=
$?
if
[[
$LOCAL_SUCCESS
==
0
]]
;
then
echo
"=== PASSED MODEL:
${
MODEL_CONFIG
}
==="
else
echo
"=== FAILED MODEL:
${
MODEL_CONFIG
}
==="
fi
SUCCESS
=
$((
SUCCESS
+
LOCAL_SUCCESS
))
done
if
[
"
${
SUCCESS
}
"
-eq
"0"
]
;
then
exit
0
else
exit
1
fi
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
950b7118
...
...
@@ -3,35 +3,25 @@
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
pytest -s -v test_lm_eval_correctness.py
\
--config-list-file=configs/models-small.txt
\
--tp-size=1
"""
import
os
from
pathlib
import
Path
import
lm_eval
import
numpy
import
pytest
import
numpy
as
np
import
yaml
RTOL
=
0.08
TEST_DATA_FILE
=
os
.
environ
.
get
(
"LM_EVAL_TEST_DATA_FILE"
,
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml"
)
TP_SIZE
=
os
.
environ
.
get
(
"LM_EVAL_TP_SIZE"
,
1
)
def
launch_lm_eval
(
eval_config
):
def
launch_lm_eval
(
eval_config
,
tp_size
):
trust_remote_code
=
eval_config
.
get
(
'trust_remote_code'
,
False
)
model_args
=
f
"pretrained=
{
eval_config
[
'model_name'
]
}
,"
\
f
"tensor_parallel_size=
{
TP_SIZE
}
,"
\
f
"tensor_parallel_size=
{
tp_size
}
,"
\
f
"enforce_eager=true,"
\
f
"add_bos_token=true,"
\
f
"trust_remote_code=
{
trust_remote_code
}
"
results
=
lm_eval
.
simple_evaluate
(
model
=
"vllm"
,
model_args
=
model_args
,
...
...
@@ -39,22 +29,14 @@ def launch_lm_eval(eval_config):
num_fewshot
=
eval_config
[
"num_fewshot"
],
limit
=
eval_config
[
"limit"
],
batch_size
=
"auto"
)
return
results
def
test_lm_eval_correctness
():
eval_config
=
yaml
.
safe_load
(
Path
(
TEST_DATA_FILE
).
read_text
(
encoding
=
"utf-8"
))
if
eval_config
[
"model_name"
]
==
"nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
:
#noqa: E501
pytest
.
skip
(
"FBGEMM is currently failing on main."
)
def
test_lm_eval_correctness_param
(
config_filename
,
tp_size
):
eval_config
=
yaml
.
safe_load
(
config_filename
.
read_text
(
encoding
=
"utf-8"
))
# Launch eval requests.
results
=
launch_lm_eval
(
eval_config
)
results
=
launch_lm_eval
(
eval_config
,
tp_size
)
# Confirm scores match ground truth.
success
=
True
for
task
in
eval_config
[
"tasks"
]:
for
metric
in
task
[
"metrics"
]:
...
...
@@ -62,8 +44,7 @@ def test_lm_eval_correctness():
measured_value
=
results
[
"results"
][
task
[
"name"
]][
metric
[
"name"
]]
print
(
f
'
{
task
[
"name"
]
}
|
{
metric
[
"name"
]
}
: '
f
'ground_truth=
{
ground_truth
}
| measured=
{
measured_value
}
'
)
success
=
success
and
n
umpy
.
isclose
(
success
=
success
and
n
p
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
RTOL
)
# Assert at the end, print all scores even on failure for debugging.
assert
success
.buildkite/test-pipeline.yaml
View file @
950b7118
...
...
@@ -408,7 +408,7 @@ steps:
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c
configs/models-small.txt -
t
1
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=
configs/models-small.txt -
-tp-size=
1
-
label
:
OpenAI API correctness
source_file_dependencies
:
...
...
@@ -713,4 +713,4 @@ steps:
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c
configs/models-large.txt -
t
4
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=
configs/models-large.txt -
-tp-size=
4
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment