Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
104bf260
Unverified
Commit
104bf260
authored
Nov 01, 2024
by
Yineng Zhang
Committed by
GitHub
Nov 01, 2024
Browse files
minor: update nightly eval (#1867)
parent
3bf3d011
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
35 additions
and
61 deletions
+35
-61
.github/workflows/nightly-eval.yml
.github/workflows/nightly-eval.yml
+1
-0
test/srt/test_nightly_gsm8k_eval.py
test/srt/test_nightly_gsm8k_eval.py
+30
-30
test/srt/test_nightly_human_eval.py
test/srt/test_nightly_human_eval.py
+4
-31
No files found.
.github/workflows/nightly-eval.yml
View file @
104bf260
...
@@ -32,3 +32,4 @@ jobs:
...
@@ -32,3 +32,4 @@ jobs:
run
:
|
run
:
|
cd test/srt
cd test/srt
python3 test_nightly_human_eval.py
python3 test_nightly_human_eval.py
python3 test_nightly_gsm8k_eval.py
test/srt/test_nightly_gsm8k_eval.py
View file @
104bf260
...
@@ -19,6 +19,35 @@ def parse_models(model_string):
...
@@ -19,6 +19,35 @@ def parse_models(model_string):
return
[
model
.
strip
()
for
model
in
model_string
.
split
(
","
)
if
model
.
strip
()]
return
[
model
.
strip
()
for
model
in
model_string
.
split
(
","
)
if
model
.
strip
()]
def
launch_server
(
base_url
,
model
,
is_fp8
,
is_tp2
):
other_args
=
[
"--log-level-http"
,
"warning"
,
"--trust-remote-code"
]
if
is_fp8
:
if
"Llama-3"
in
model
or
"gemma-2"
in
model
:
# compressed-tensors
other_args
.
extend
([
"--kv-cache-dtype"
,
"fp8_e5m2"
])
elif
"Qwen2-72B-Instruct-FP8"
in
model
:
# bug
other_args
.
extend
([
"--quantization"
,
"fp8"
])
else
:
other_args
.
extend
([
"--quantization"
,
"fp8"
,
"--kv-cache-dtype"
,
"fp8_e5m2"
])
if
is_tp2
:
other_args
.
extend
([
"--tp"
,
"2"
])
if
"DeepSeek"
in
model
:
other_args
.
extend
([
"--mem-frac"
,
"0.85"
])
if
"AWQ"
in
model
:
other_args
.
extend
([
"--quantization"
,
"awq"
])
elif
"GPTQ"
in
model
:
other_args
.
extend
([
"--quantization"
,
"gptq"
])
process
=
popen_launch_server
(
model
,
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
other_args
,
)
return
process
class
TestEvalAccuracyLarge
(
unittest
.
TestCase
):
class
TestEvalAccuracyLarge
(
unittest
.
TestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
...
@@ -38,40 +67,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -38,40 +67,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
if
self
.
process
:
if
self
.
process
:
kill_child_process
(
self
.
process
.
pid
,
include_self
=
True
)
kill_child_process
(
self
.
process
.
pid
,
include_self
=
True
)
def
launch_server
(
self
,
model
,
is_fp8
,
is_tp2
):
other_args
=
[
"--log-level-http"
,
"warning"
,
"--trust-remote-code"
]
if
is_fp8
:
if
"Llama-3"
in
model
or
"gemma-2"
in
model
:
# compressed-tensors
other_args
.
extend
([
"--kv-cache-dtype"
,
"fp8_e5m2"
])
elif
"Qwen2-72B-Instruct-FP8"
in
model
:
# bug
other_args
.
extend
([
"--quantization"
,
"fp8"
])
else
:
other_args
.
extend
(
[
"--quantization"
,
"fp8"
,
"--kv-cache-dtype"
,
"fp8_e5m2"
]
)
if
is_tp2
:
other_args
.
extend
([
"--tp"
,
"2"
])
if
"DeepSeek"
in
model
:
other_args
.
extend
([
"--mem-frac"
,
"0.85"
])
if
"AWQ"
in
model
:
other_args
.
extend
([
"--quantization"
,
"awq"
])
elif
"GPTQ"
in
model
:
other_args
.
extend
([
"--quantization"
,
"gptq"
])
self
.
process
=
popen_launch_server
(
model
,
self
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
other_args
,
)
def
test_mgsm_en_all_models
(
self
):
def
test_mgsm_en_all_models
(
self
):
for
model_group
,
is_fp8
,
is_tp2
in
self
.
model_groups
:
for
model_group
,
is_fp8
,
is_tp2
in
self
.
model_groups
:
for
model
in
model_group
:
for
model
in
model_group
:
with
self
.
subTest
(
model
=
model
):
with
self
.
subTest
(
model
=
model
):
self
.
launch_server
(
model
,
is_fp8
,
is_tp2
)
self
.
process
=
launch_server
(
self
.
base_url
,
model
,
is_fp8
,
is_tp2
)
args
=
SimpleNamespace
(
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
base_url
=
self
.
base_url
,
...
...
test/srt/test_nightly_human_eval.py
View file @
104bf260
...
@@ -5,7 +5,7 @@ import subprocess
...
@@ -5,7 +5,7 @@ import subprocess
import
unittest
import
unittest
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
test_nightly_gsm8k_eval
import
parse_models
from
test_nightly_gsm8k_eval
import
launch_server
,
parse_models
from
sglang.srt.utils
import
kill_child_process
from
sglang.srt.utils
import
kill_child_process
from
sglang.test.test_utils
import
(
from
sglang.test.test_utils
import
(
...
@@ -39,35 +39,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -39,35 +39,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
if
cls
.
eval_process
:
if
cls
.
eval_process
:
kill_child_process
(
cls
.
eval_process
.
pid
)
kill_child_process
(
cls
.
eval_process
.
pid
)
def
launch_server
(
self
,
model
,
is_fp8
,
is_tp2
):
other_args
=
[
"--log-level-http"
,
"warning"
,
"--trust-remote-code"
]
if
is_fp8
:
if
"Llama-3"
in
model
or
"gemma-2"
in
model
:
# compressed-tensors
other_args
.
extend
([
"--kv-cache-dtype"
,
"fp8_e5m2"
])
elif
"Qwen2-72B-Instruct-FP8"
in
model
:
# bug
other_args
.
extend
([
"--quantization"
,
"fp8"
])
else
:
other_args
.
extend
(
[
"--quantization"
,
"fp8"
,
"--kv-cache-dtype"
,
"fp8_e5m2"
]
)
if
is_tp2
:
other_args
.
extend
([
"--tp"
,
"2"
])
if
"DeepSeek"
in
model
:
other_args
.
extend
([
"--mem-frac"
,
"0.85"
])
if
"AWQ"
in
model
:
other_args
.
extend
([
"--quantization"
,
"awq"
])
elif
"GPTQ"
in
model
:
other_args
.
extend
([
"--quantization"
,
"gptq"
])
self
.
process
=
popen_launch_server
(
model
,
self
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
other_args
,
)
def
run_evalplus
(
self
,
model
):
def
run_evalplus
(
self
,
model
):
print
(
"Delete evalplus results"
)
print
(
"Delete evalplus results"
)
shutil
.
rmtree
(
"evalplus_results"
,
ignore_errors
=
True
)
shutil
.
rmtree
(
"evalplus_results"
,
ignore_errors
=
True
)
...
@@ -116,7 +87,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -116,7 +87,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
# NOTE: only Llama for now
# NOTE: only Llama for now
if
"Llama"
in
model
:
if
"Llama"
in
model
:
with
self
.
subTest
(
model
=
model
):
with
self
.
subTest
(
model
=
model
):
self
.
launch_server
(
model
,
is_fp8
,
is_tp2
)
self
.
process
=
launch_server
(
self
.
base_url
,
model
,
is_fp8
,
is_tp2
)
self
.
run_evalplus
(
model
)
self
.
run_evalplus
(
model
)
self
.
tearDownClass
()
self
.
tearDownClass
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment