Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
75aa1442
Unverified
Commit
75aa1442
authored
Jun 29, 2024
by
Robert Shaw
Committed by
GitHub
Jun 29, 2024
Browse files
[ CI/Build ] LM Eval Harness Based CI Testing (#5838)
Co-authored-by:
Robert Shaw
<
rshaw@neuralmagic
>
parent
99397da5
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
274 additions
and
0 deletions
+274
-0
.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
...te/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+11
-0
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
...lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+11
-0
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
...ite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+11
-0
.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
...e/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+11
-0
.buildkite/lm-eval-harness/configs/models-large.txt
.buildkite/lm-eval-harness/configs/models-large.txt
+2
-0
.buildkite/lm-eval-harness/configs/models-small.txt
.buildkite/lm-eval-harness/configs/models-small.txt
+2
-0
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+46
-0
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+51
-0
.buildkite/lm-eval-harness/run-tests.sh
.buildkite/lm-eval-harness/run-tests.sh
+59
-0
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+54
-0
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+16
-0
No files found.
.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
0 → 100644
View file @
75aa1442
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name
:
"
meta-llama/Meta-Llama-3-70B-Instruct"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.892
-
name
:
"
exact_match,flexible-extract"
value
:
0.892
limit
:
250
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
0 → 100644
View file @
75aa1442
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name
:
"
neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.756
-
name
:
"
exact_match,flexible-extract"
value
:
0.752
limit
:
250
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
0 → 100644
View file @
75aa1442
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name
:
"
meta-llama/Meta-Llama-3-8B-Instruct"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.756
-
name
:
"
exact_match,flexible-extract"
value
:
0.752
limit
:
250
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
0 → 100644
View file @
75aa1442
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name
:
"
mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks
:
-
name
:
"
gsm8k"
metrics
:
-
name
:
"
exact_match,strict-match"
value
:
0.616
-
name
:
"
exact_match,flexible-extract"
value
:
0.632
limit
:
250
num_fewshot
:
5
.buildkite/lm-eval-harness/configs/models-large.txt
0 → 100644
View file @
75aa1442
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
.buildkite/lm-eval-harness/configs/models-small.txt
0 → 100644
View file @
75aa1442
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
0 → 100644
View file @
75aa1442
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
# Print the help text for this script: what it does, how it is invoked, and
# what each command-line flag means.
usage() {
    # A single printf emits the whole help text; each argument is one output
    # line, and the empty arguments produce the blank separator lines.
    printf '%s\n' \
        "" \
        "Runs lm eval harness on GSM8k using huggingface transformers." \
        "This pathway is intended to be used to create baselines for " \
        "our automated nm-test-accuracy workflow" \
        "" \
        "usage: ${0} <options>" \
        "" \
        " -m - huggingface stub or local directory of the model" \
        " -b - batch size to run the evaluation at" \
        " -l - limit number of samples to run" \
        " -f - number of fewshot samples to use" \
        ""
}
# Parse the command-line flags into the variables used by the lm_eval call.
# An unknown flag (or a flag missing its argument) prints the help and exits.
while getopts "m:b:l:f:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# All four flags feed directly into the lm_eval command line; bail out early
# rather than handing lm_eval empty arguments.
if [[ -z "${MODEL}" || -z "${BATCH_SIZE}" || -z "${LIMIT}" || -z "${FEWSHOT}" ]]; then
    echo "error: -m, -b, -l and -f are all required" >&2
    usage
    exit 1
fi

# Expansions are quoted so that values containing spaces or glob characters
# (e.g. a local model directory) reach lm_eval as single arguments.
lm_eval --model hf \
  --model_args "pretrained=${MODEL},parallelize=True" \
  --tasks gsm8k --num_fewshot "${FEWSHOT}" --limit "${LIMIT}" \
  --batch_size "${BATCH_SIZE}"
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
0 → 100644
View file @
75aa1442
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.2
# Print the help text for this script: what it does, how it is invoked, and
# what each command-line flag means.
usage() {
    echo
    # This script drives lm_eval with the vllm backend, so the description
    # says so (it previously claimed huggingface transformers).
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo " -m - huggingface stub or local directory of the model"
    echo " -b - batch size to run the evaluation at"
    echo " -l - limit number of samples to run"
    echo " -f - number of fewshot samples to use"
    echo " -t - tensor parallel size to run at"
    echo
}
# Parse the command-line flags into the variables used by the lm_eval call.
# An unknown flag (or a flag missing its argument) prints the help and exits.
while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# All five flags feed directly into the lm_eval command line; bail out early
# rather than handing lm_eval empty arguments.
if [[ -z "${MODEL}" || -z "${BATCH_SIZE}" || -z "${LIMIT}" \
      || -z "${FEWSHOT}" || -z "${TP_SIZE}" ]]; then
    echo "error: -m, -b, -l, -f and -t are all required" >&2
    usage
    exit 1
fi

# Expansions are quoted so that values containing spaces or glob characters
# (e.g. a local model directory) reach lm_eval as single arguments.
lm_eval --model vllm \
  --model_args "pretrained=${MODEL},tensor_parallel_size=${TP_SIZE}" \
  --tasks gsm8k --num_fewshot "${FEWSHOT}" --limit "${LIMIT}" \
  --batch_size "${BATCH_SIZE}"
.buildkite/lm-eval-harness/run-tests.sh
0 → 100644
View file @
75aa1442
#!/bin/bash
# Print the help text for this script: what it does, how it is invoked, and
# what each command-line flag means.
usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    # The example names a list file that actually exists under configs/.
    echo " -c - path to the test data config (e.g. configs/models-small.txt)"
    echo " -t - tensor parallel size"
    echo
}
# Accumulates the pytest exit codes of every model; stays 0 only when all pass.
SUCCESS=0

# Parse the command-line flags. An unknown flag (or a flag missing its
# argument) prints the help and exits.
while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Without a readable list file the loop below would run zero times and the
# script would exit 0, i.e. report success without testing anything.
if [[ -z "${CONFIG}" || ! -r "${CONFIG}" ]]; then
    echo "error: -c must name a readable config list (got: '${CONFIG}')" >&2
    usage
    exit 1
fi

# test_lm_eval_correctness.py falls back to 1 when LM_EVAL_TP_SIZE is unset;
# mirror that here so an omitted -t does not export an empty value.
TP_SIZE="${TP_SIZE:-1}"

# Parse list of configs: one config file name per line. Newline is the only
# field separator, so blank lines are dropped and names may contain spaces.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

if [[ ${#MODEL_CONFIGS[@]} -eq 0 ]]; then
    echo "error: no model configs listed in ${CONFIG}" >&2
    exit 1
fi

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE ==="

    # The pytest module reads its inputs from these environment variables.
    export LM_EVAL_TEST_DATA_FILE="$PWD/configs/${MODEL_CONFIG}"
    export LM_EVAL_TP_SIZE="$TP_SIZE"
    # Record the failure code but keep going so every model gets evaluated.
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

# Collapse the accumulated status into a plain pass/fail exit code.
if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
0 → 100644
View file @
75aa1442
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""
import
os
from
pathlib
import
Path
import
lm_eval
import
numpy
import
yaml
# Relative tolerance handed to numpy.isclose when comparing a measured score
# with its baseline value.
RTOL = 0.02

# Path of the YAML eval config to test; the environment variable overrides
# the built-in default.
TEST_DATA_FILE = os.getenv(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml",
)

# Tensor-parallel size: a str when taken from the environment, otherwise the
# int 1. It is only ever interpolated into a string, so both forms work.
TP_SIZE = os.getenv("LM_EVAL_TP_SIZE", 1)
def launch_lm_eval(eval_config):
    """Run the lm-eval harness on the vllm backend for one eval config.

    Reads ``model_name``, the ``name`` of every entry in ``tasks``,
    ``num_fewshot`` and ``limit`` from *eval_config*; the tensor-parallel
    size comes from the module-level ``TP_SIZE``.

    Returns whatever ``lm_eval.simple_evaluate`` returns.
    """
    pretrained = eval_config["model_name"]
    model_args = f"pretrained={pretrained},tensor_parallel_size={TP_SIZE}"

    task_names = [entry["name"] for entry in eval_config["tasks"]]

    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=task_names,
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
def test_lm_eval_correctness():
    """Check that measured lm-eval scores match the baselines in the config.

    Loads the YAML file named by ``TEST_DATA_FILE``, runs the evaluation via
    ``launch_lm_eval`` and compares every configured metric of every task
    against its expected ``value`` using ``numpy.isclose`` with ``rtol=RTOL``.

    Every metric is printed and checked before the test fails, so a single
    run reports all mismatches instead of stopping at the first one.

    Raises:
        AssertionError: if at least one metric is outside the tolerance.
    """
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    mismatches = []
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            # NOTE: numpy.isclose scales rtol by its second argument, i.e.
            # the tolerance here is relative to the measured value.
            if not numpy.isclose(ground_truth, measured_value, rtol=RTOL):
                mismatches.append(
                    f'{task["name"]} | {metric["name"]}: '
                    f'ground_truth={ground_truth} | '
                    f'measured={measured_value}')

    # Assert once, after every metric has been reported.
    assert not mismatches, (
        f"metrics outside rtol={RTOL}: " + "; ".join(mismatches))
.buildkite/test-pipeline.yaml
View file @
75aa1442
...
@@ -197,6 +197,22 @@ steps:
...
@@ -197,6 +197,22 @@ steps:
-
pip install aiohttp
-
pip install aiohttp
-
bash run-benchmarks.sh
-
bash run-benchmarks.sh
-
label
:
LM Eval Small Models
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
commands
:
-
pip install lm-eval
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c configs/models-small.txt -t
1
-
label
:
LM Eval Large Models
gpu
:
a100
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
commands
:
-
pip install lm-eval
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c configs/models-large.txt -t
4
-
label
:
Documentation Build
-
label
:
Documentation Build
working_dir
:
"
/vllm-workspace/test_docs/docs"
working_dir
:
"
/vllm-workspace/test_docs/docs"
no_gpu
:
True
no_gpu
:
True
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment