change / sglang · Commits · b22f3f64

Commit b22f3f64 (Unverified)
Authored Jan 07, 2025 by Lianmin Zheng; committed via GitHub on Jan 07, 2025

Fix nightly accuracy tests (#2780)
Parent: 6fb57683

Showing 5 changed files with 33 additions and 29 deletions (+33, -29).
Changed files:
- python/sglang/test/test_utils.py (+1, -1)
- test/srt/run_suite.py (+1, -2)
- test/srt/test_nightly_gsm8k_eval.py (+26, -23)
- test/srt/test_nightly_human_eval.py (+1, -1)
- test/srt/test_skip_tokenizer_init.py (+4, -2)
python/sglang/test/test_utils.py

@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
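This change drops deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct from the TP2 nightly list; the model already runs in the TP1 list, so it was being evaluated twice. The constants are plain comma-separated strings. A minimal sketch of how a consumer can split one into model names (the parse_models helper here is illustrative, not part of this diff):

# Illustrative only: split a nightly model-list constant into model names.
def parse_models(model_string: str) -> list[str]:
    return [m.strip() for m in model_string.split(",") if m.strip()]

models = parse_models(
    "meta-llama/Llama-3.1-70B-Instruct,"
    "mistralai/Mixtral-8x7B-Instruct-v0.1,"
    "Qwen/Qwen2-57B-A14B-Instruct"
)
print(models)  # three model names, one per list entry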
test/srt/run_suite.py

@@ -49,8 +49,7 @@ suites = {
     ],
     "nightly": [
         "test_nightly_gsm8k_eval.py",
-        "test_nightly_human_eval.py",
-        # Disable temporarly
+        # Disable temporarily
         # "test_nightly_math_eval.py",
     ],
     "sampling/penaltylib": glob.glob(
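In the nightly suite, test_nightly_human_eval.py is taken out of the rotation (and the comment's spelling is fixed), leaving only the GSM8K eval active while the math eval stays commented out. For context, a simplified sketch of how a runner can consume the suites dict; the real test/srt/run_suite.py has more machinery (and the glob pattern here is a guess), so treat this as an approximation:

# Simplified sketch, not the real runner: execute every file in a suite.
import glob
import subprocess

suites = {
    "nightly": [
        "test_nightly_gsm8k_eval.py",
        # Disable temporarily
        # "test_nightly_math_eval.py",
    ],
    "sampling/penaltylib": glob.glob("sampling/penaltylib/**/*.py", recursive=True),
}

def run_suite(name: str) -> None:
    for test_file in suites[name]:
        subprocess.run(["python3", test_file], check=True)  # one process per file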
test/srt/test_nightly_gsm8k_eval.py

 import json
 import os
-import subprocess
 import unittest
 import warnings
 from datetime import datetime
@@ -16,24 +15,26 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    is_in_ci,
     popen_launch_server,
+    write_github_step_summary,
 )

 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
     "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
     "google/gemma-2-27b-it": 0.92,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.88,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
     "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
     "neuralmagic/gemma-2-2b-it-FP8": 0.60,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
     "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
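The threshold table encodes the minimum acceptable GSM8K score per model; this commit nudges ten entries up or down by one point to track the accuracy currently observed in nightly runs (for example, Llama-3.1-8B-Instruct drops from 0.83 to 0.82 while DeepSeek-Coder-V2-Lite-Instruct rises from 0.84 to 0.85). A minimal sketch of how a score is compared against the table:

# Minimal sketch of the threshold comparison used by the nightly check.
MODEL_SCORE_THRESHOLDS = {"meta-llama/Llama-3.1-8B-Instruct": 0.82}

model, score = "meta-llama/Llama-3.1-8B-Instruct", 0.815
threshold = MODEL_SCORE_THRESHOLDS.get(model)
if threshold is not None and score < threshold:
    print(f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})")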
@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_args,
-        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
     )
     return process
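Dropping return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL) removes the only use of subprocess in this file (hence the deleted import) and stops discarding the launched server's output, which makes nightly failures easier to debug. A hedged illustration of the difference; popen_launch_server's actual defaults live in sglang.test.test_utils:

# Illustration only: DEVNULL discards child output, while the default
# inherits the parent's streams, so the server log shows up in CI output.
import subprocess

subprocess.Popen(
    ["python3", "-c", "print('discarded')"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
).wait()

subprocess.Popen(["python3", "-c", "print('visible in CI logs')"]).wait()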
@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):
 def check_model_scores(results):
     failed_models = []
+    summary = " | model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
+
     for model, score in results:
         threshold = MODEL_SCORE_THRESHOLDS.get(model)
         if threshold is None:
@@ -111,11 +114,19 @@ def check_model_scores(results):
                 f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
             )
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")

     if failed_models:
         raise AssertionError("\n".join(failed_models))


-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model_groups = [
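check_model_scores now accumulates a markdown table (one | model | score | threshold | row per model), prints it, and publishes it to the job summary when running in CI. A sketch of what a write_github_step_summary helper typically does; the real helper is in sglang.test.test_utils and may differ in detail:

# Hedged sketch: GitHub Actions renders markdown appended to the file named
# by the GITHUB_STEP_SUMMARY environment variable; outside CI this is a no-op.
import os

def write_github_step_summary(content: str) -> None:
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary_path:
        with open(summary_path, "a") as f:
            f.write(content)

write_github_step_summary("### TestNightlyGsm8KEval\n| model | score | threshold |\n")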
@@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         ]
         cls.base_url = DEFAULT_URL_FOR_TEST

-    def setUp(self):
-        self.process = None
-
-    def tearDown(self):
-        if self.process:
-            kill_process_tree(self.process.pid)
-
     def test_mgsm_en_all_models(self):
         warnings.filterwarnings(
             "ignore", category=ResourceWarning, message="unclosed.*socket"
@@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         for model_group, is_fp8, is_tp2 in self.model_groups:
             for model in model_group:
                 with self.subTest(model=model):
-                    self.process = launch_server(self.base_url, model, is_fp8, is_tp2)
+                    process = launch_server(self.base_url, model, is_fp8, is_tp2)

                     args = SimpleNamespace(
                         base_url=self.base_url,
@@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
                         is_first = False

                 all_results.append((model, metrics["score"]))
-
-                self.tearDown()
+                kill_process_tree(process.pid)

         try:
             with open("results.json", "r") as f:
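Taken together, the gsm8k changes move server lifecycle management out of setUp/tearDown and into the test body: each model gets its own server, and kill_process_tree(process.pid) tears it down before the next model launches. Killing the whole tree matters because the launched server spawns worker processes. A hedged sketch of such a helper (sglang ships its own; this psutil version is only illustrative):

# Illustrative kill_process_tree: terminate a process and all descendants.
import psutil

def kill_process_tree(pid: int) -> None:
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return  # already gone
    for child in parent.children(recursive=True):
        child.kill()
    parent.kill()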
test/srt/test_nightly_human_eval.py

@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
 )


-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyHumanEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         if is_in_ci():
test/srt/test_skip_tokenizer_init.py

@@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase):
         print(json.dumps(ret))

         def assert_one_item(item):
-            assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"]
-            assert len(item["token_ids"]) == max_new_tokens
+            self.assertEqual(
+                len(item["token_ids"]), item["meta_info"]["completion_tokens"]
+            )
+            self.assertEqual(len(item["token_ids"]), max_new_tokens)
             assert item["meta_info"]["prompt_tokens"] == len(input_ids)

             if return_logprob:
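Replacing the bare asserts with self.assertEqual means a mismatch reports both values (for example "3 != 4") instead of a bare AssertionError, which is what you want when a nightly run fails with no debugger attached. A self-contained illustration of the difference:

# Illustration: assertEqual reports both operands when the test fails.
import unittest

class Demo(unittest.TestCase):
    def test_lengths(self):
        token_ids, completion_tokens = [1, 2, 3], 4
        # Deliberate mismatch: fails with "3 != 4", not a bare AssertionError.
        self.assertEqual(len(token_ids), completion_tokens)

if __name__ == "__main__":
    unittest.main()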
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment