Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
d86a2d65
Unverified
Commit
d86a2d65
authored
Nov 01, 2024
by
Yineng Zhang
Committed by
GitHub
Nov 01, 2024
Browse files
minor: add human eval (#1754)
parent
16eb33ff
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
129 additions
and
3 deletions
+129
-3
.github/workflows/nightly-eval.yml
.github/workflows/nightly-eval.yml
+4
-3
test/srt/test_nightly_human_eval.py
test/srt/test_nightly_human_eval.py
+125
-0
No files found.
.github/workflows/nightly-eval.yml
View file @
d86a2d65
...
...
@@ -25,9 +25,10 @@ jobs:
-
name
:
Install dependencies
run
:
|
bash scripts/ci_install_dependency.sh
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-
name
:
Nightly gsm8k Accuracy
timeout-minutes
:
6
0
-
name
:
Nightly gsm8k
and human eval
Accuracy
timeout-minutes
:
12
0
run
:
|
cd test/srt
python3 test_nightly_
gsm8k
_eval.py
python3 test_nightly_
human
_eval.py
test/srt/test_nightly_human_eval.py
0 → 100644
View file @
d86a2d65
import
os
import
shutil
import
signal
import
subprocess
import
unittest
from
types
import
SimpleNamespace
from
test_nightly_gsm8k_eval
import
parse_models
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
popen_launch_server
,
)
class TestEvalAccuracyLarge(unittest.TestCase):
    """Nightly HumanEval accuracy test.

    For each configured model group, launches an sglang server and runs the
    ``evalplus`` HumanEval benchmark against the server's OpenAI-compatible
    endpoint. The eval subprocess is started in its own process group so a
    timeout can kill the whole tree.
    """

    @classmethod
    def setUpClass(cls):
        # Each entry is (list_of_model_names, is_fp8, is_tp2).
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None  # server subprocess handle
        cls.eval_process = None  # evalplus subprocess handle

    @classmethod
    def tearDownClass(cls):
        # Best-effort cleanup of both the server and any running eval.
        if cls.process:
            kill_child_process(cls.process.pid)
        if cls.eval_process:
            kill_child_process(cls.eval_process.pid)

    def launch_server(self, model, is_fp8, is_tp2):
        """Launch an sglang server for *model* with quantization/TP flags.

        Args:
            model: model name/path; substring checks pick per-model flags.
            is_fp8: enable fp8 quantization (with per-model exceptions below).
            is_tp2: run with tensor parallelism of 2.
        """
        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
        if is_fp8:
            if "Llama-3" in model or "gemma-2" in model:
                # compressed-tensors checkpoints: only the KV cache is fp8.
                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
            elif "Qwen2-72B-Instruct-FP8" in model:
                # bug: fp8 KV cache is broken for this model, so skip it.
                other_args.extend(["--quantization", "fp8"])
            else:
                other_args.extend(
                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
                )
        if is_tp2:
            other_args.extend(["--tp", "2"])
        if "DeepSeek" in model:
            other_args.extend(["--mem-frac", "0.85"])
        if "AWQ" in model:
            other_args.extend(["--quantization", "awq"])
        elif "GPTQ" in model:
            other_args.extend(["--quantization", "gptq"])
        self.process = popen_launch_server(
            model,
            self.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    def run_evalplus(self, model):
        """Run evalplus HumanEval against the launched server for *model*.

        Stale results are removed first so evalplus does not reuse a cache.
        The eval runs in its own process group (``preexec_fn=os.setsid``) and
        is killed as a group on timeout or error.
        """
        print("Delete evalplus results")
        shutil.rmtree("evalplus_results", ignore_errors=True)
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            # FIX: derive from self.base_url instead of hardcoding
            # "http://localhost:6157/v1", which silently diverges if
            # DEFAULT_URL_FOR_TEST changes.
            f"{self.base_url}/v1",
            "--greedy",
        ]
        try:
            self.eval_process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,
            )
            stdout, stderr = self.eval_process.communicate(timeout=600)
            if self.eval_process.returncode != 0:
                print(f"Fail to human eval model={model} err={stderr}")
            print("=" * 42)
            print(stdout)
            print("=" * 42)
        except subprocess.TimeoutExpired:
            if self.eval_process:
                # Kill the whole process group, not just the direct child.
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
            print(f"Timeout during evaluation for model={model}")
        except Exception as e:
            print(f"Error running evalplus for model={model} {str(e)}")
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)

    def test_human_eval_all_models(self):
        """Iterate all model groups and run HumanEval per model."""
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama for now
                if "Llama" in model:
                    with self.subTest(model=model):
                        self.launch_server(model, is_fp8, is_tp2)
                        self.run_evalplus(model)
                        # Tear down the server between models to free GPU memory.
                        self.tearDownClass()
# Allow running this test file directly: python3 test_nightly_human_eval.py
if __name__ == "__main__":
    unittest.main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment