Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhaoyu6
sglang
Commits
b22f3f64
"vscode:/vscode.git/clone" did not exist on "ba8cc4553867ccdac163a382d89213856a86f5b7"
Unverified
Commit
b22f3f64
authored
Jan 07, 2025
by
Lianmin Zheng
Committed by
GitHub
Jan 07, 2025
Browse files
Fix nightly accuracy tests (#2780)
parent
6fb57683
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
33 additions
and
29 deletions
+33
-29
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+1
-1
test/srt/run_suite.py
test/srt/run_suite.py
+1
-2
test/srt/test_nightly_gsm8k_eval.py
test/srt/test_nightly_gsm8k_eval.py
+26
-23
test/srt/test_nightly_human_eval.py
test/srt/test_nightly_human_eval.py
+1
-1
test/srt/test_skip_tokenizer_init.py
test/srt/test_skip_tokenizer_init.py
+4
-2
No files found.
python/sglang/test/test_utils.py
View file @
b22f3f64
...
@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
...
@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST
=
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST
=
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
=
600
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
=
600
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
=
"meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
=
"meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
=
"meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct
,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct
"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
=
"meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
=
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
=
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
=
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1
=
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1
=
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
...
...
test/srt/run_suite.py
View file @
b22f3f64
...
@@ -49,8 +49,7 @@ suites = {
...
@@ -49,8 +49,7 @@ suites = {
],
],
"nightly"
:
[
"nightly"
:
[
"test_nightly_gsm8k_eval.py"
,
"test_nightly_gsm8k_eval.py"
,
"test_nightly_human_eval.py"
,
# Disable temporarily
# Disable temporarly
# "test_nightly_math_eval.py",
# "test_nightly_math_eval.py",
],
],
"sampling/penaltylib"
:
glob
.
glob
(
"sampling/penaltylib"
:
glob
.
glob
(
...
...
test/srt/test_nightly_gsm8k_eval.py
View file @
b22f3f64
import
json
import
json
import
os
import
os
import
subprocess
import
unittest
import
unittest
import
warnings
import
warnings
from
datetime
import
datetime
from
datetime
import
datetime
...
@@ -16,24 +15,26 @@ from sglang.test.test_utils import (
...
@@ -16,24 +15,26 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
DEFAULT_URL_FOR_TEST
,
is_in_ci
,
popen_launch_server
,
popen_launch_server
,
write_github_step_summary
,
)
)
MODEL_SCORE_THRESHOLDS
=
{
MODEL_SCORE_THRESHOLDS
=
{
"meta-llama/Llama-3.1-8B-Instruct"
:
0.8
3
,
"meta-llama/Llama-3.1-8B-Instruct"
:
0.8
2
,
"mistralai/Mistral-7B-Instruct-v0.3"
:
0.58
,
"mistralai/Mistral-7B-Instruct-v0.3"
:
0.58
,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
:
0.8
4
,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
:
0.8
5
,
"google/gemma-2-27b-it"
:
0.92
,
"google/gemma-2-27b-it"
:
0.92
,
"meta-llama/Llama-3.1-70B-Instruct"
:
0.9
6
,
"meta-llama/Llama-3.1-70B-Instruct"
:
0.9
5
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
:
0.6
3
,
"mistralai/Mixtral-8x7B-Instruct-v0.1"
:
0.6
4
,
"Qwen/Qwen2-57B-A14B-Instruct"
:
0.8
7
,
"Qwen/Qwen2-57B-A14B-Instruct"
:
0.8
8
,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
:
0.8
4
,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
:
0.8
3
,
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8"
:
0.54
,
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8"
:
0.54
,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
:
0.8
3
,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
:
0.8
4
,
"neuralmagic/gemma-2-2b-it-FP8"
:
0.60
,
"neuralmagic/gemma-2-2b-it-FP8"
:
0.60
,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8"
:
0.9
5
,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8"
:
0.9
4
,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
:
0.6
1
,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
:
0.6
2
,
"neuralmagic/Qwen2-72B-Instruct-FP8"
:
0.9
5
,
"neuralmagic/Qwen2-72B-Instruct-FP8"
:
0.9
4
,
"neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
:
0.82
,
"neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
:
0.82
,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
:
0.84
,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
:
0.84
,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
:
0.83
,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
:
0.83
,
...
@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
...
@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
base_url
,
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
other_args
,
other_args
=
other_args
,
return_stdout_stderr
=
(
subprocess
.
DEVNULL
,
subprocess
.
DEVNULL
),
)
)
return
process
return
process
...
@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):
...
@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):
def
check_model_scores
(
results
):
def
check_model_scores
(
results
):
failed_models
=
[]
failed_models
=
[]
summary
=
" | model | score | threshold |
\n
"
summary
+=
"| ----- | ----- | --------- |
\n
"
for
model
,
score
in
results
:
for
model
,
score
in
results
:
threshold
=
MODEL_SCORE_THRESHOLDS
.
get
(
model
)
threshold
=
MODEL_SCORE_THRESHOLDS
.
get
(
model
)
if
threshold
is
None
:
if
threshold
is
None
:
...
@@ -111,11 +114,19 @@ def check_model_scores(results):
...
@@ -111,11 +114,19 @@ def check_model_scores(results):
f
"Model
{
model
}
score (
{
score
:.
4
f
}
) is below threshold (
{
threshold
:.
4
f
}
)"
f
"Model
{
model
}
score (
{
score
:.
4
f
}
) is below threshold (
{
threshold
:.
4
f
}
)"
)
)
line
=
f
"|
{
model
}
|
{
score
}
|
{
threshold
}
|
\n
"
summary
+=
line
print
(
summary
)
if
is_in_ci
():
write_github_step_summary
(
f
"### TestNightlyGsm8KEval
\n
{
summary
}
"
)
if
failed_models
:
if
failed_models
:
raise
AssertionError
(
"
\n
"
.
join
(
failed_models
))
raise
AssertionError
(
"
\n
"
.
join
(
failed_models
))
class
Test
EvalAccuracyLarge
(
unittest
.
TestCase
):
class
Test
NightlyGsm8KEval
(
unittest
.
TestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
cls
.
model_groups
=
[
cls
.
model_groups
=
[
...
@@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
]
]
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
cls
.
base_url
=
DEFAULT_URL_FOR_TEST
def
setUp
(
self
):
self
.
process
=
None
def
tearDown
(
self
):
if
self
.
process
:
kill_process_tree
(
self
.
process
.
pid
)
def
test_mgsm_en_all_models
(
self
):
def
test_mgsm_en_all_models
(
self
):
warnings
.
filterwarnings
(
warnings
.
filterwarnings
(
"ignore"
,
category
=
ResourceWarning
,
message
=
"unclosed.*socket"
"ignore"
,
category
=
ResourceWarning
,
message
=
"unclosed.*socket"
...
@@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
for
model_group
,
is_fp8
,
is_tp2
in
self
.
model_groups
:
for
model_group
,
is_fp8
,
is_tp2
in
self
.
model_groups
:
for
model
in
model_group
:
for
model
in
model_group
:
with
self
.
subTest
(
model
=
model
):
with
self
.
subTest
(
model
=
model
):
self
.
process
=
launch_server
(
self
.
base_url
,
model
,
is_fp8
,
is_tp2
)
process
=
launch_server
(
self
.
base_url
,
model
,
is_fp8
,
is_tp2
)
args
=
SimpleNamespace
(
args
=
SimpleNamespace
(
base_url
=
self
.
base_url
,
base_url
=
self
.
base_url
,
...
@@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
...
@@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
is_first
=
False
is_first
=
False
all_results
.
append
((
model
,
metrics
[
"score"
]))
all_results
.
append
((
model
,
metrics
[
"score"
]))
kill_process_tree
(
process
.
pid
)
self
.
tearDown
()
try
:
try
:
with
open
(
"results.json"
,
"r"
)
as
f
:
with
open
(
"results.json"
,
"r"
)
as
f
:
...
...
test/srt/test_nightly_human_eval.py
View file @
b22f3f64
...
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
...
@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
)
)
class
Test
EvalAccuracyLarge
(
unittest
.
TestCase
):
class
Test
NightlyHumanEval
(
unittest
.
TestCase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
if
is_in_ci
():
if
is_in_ci
():
...
...
test/srt/test_skip_tokenizer_init.py
View file @
b22f3f64
...
@@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase):
...
@@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase):
print
(
json
.
dumps
(
ret
))
print
(
json
.
dumps
(
ret
))
def
assert_one_item
(
item
):
def
assert_one_item
(
item
):
assert
len
(
item
[
"token_ids"
])
==
item
[
"meta_info"
][
"completion_tokens"
]
self
.
assertEqual
(
assert
len
(
item
[
"token_ids"
])
==
max_new_tokens
len
(
item
[
"token_ids"
]),
item
[
"meta_info"
][
"completion_tokens"
]
)
self
.
assertEqual
(
len
(
item
[
"token_ids"
]),
max_new_tokens
)
assert
item
[
"meta_info"
][
"prompt_tokens"
]
==
len
(
input_ids
)
assert
item
[
"meta_info"
][
"prompt_tokens"
]
==
len
(
input_ids
)
if
return_logprob
:
if
return_logprob
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment