change / sglang · Commits

Commit a4331cd2 (Unverified)
Authored Jan 21, 2025 by Lianmin Zheng; committed by GitHub on Jan 21, 2025

Add accuracy and latency tests of eagle into CI (#3027)

Parent: ec1c21cd

Showing 7 changed files with 186 additions and 123 deletions (+186 -123)
  .github/workflows/pr-test.yml          +16   -2
  python/sglang/test/test_utils.py        +5   -1
  test/srt/models/test_qwen_models.py     +2   -4
  test/srt/test_bench_one_batch.py       +24   -2
  test/srt/test_bench_serving.py         +33   -1
  test/srt/test_eagle_infer.py          +105   -112
  test/srt/test_torch_compile.py          +1   -1
.github/workflows/pr-test.yml  (view file @ a4331cd2)

@@ -128,7 +128,7 @@ jobs:
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_default
+        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1

     - name: Benchmark online latency
       timeout-minutes: 10
@@ -148,6 +148,13 @@ jobs:
         cd test/srt
         python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

+    - name: Benchmark online latency (EAGLE)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
+
   performance-test-1-gpu-part-2:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
@@ -196,7 +203,13 @@ jobs:
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_default
+        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+
+    - name: Benchmark single latency + torch.compile (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

     - name: Benchmark offline throughput (TP=2)
       timeout-minutes: 10
@@ -210,6 +223,7 @@ jobs:
         cd test/srt
         python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
+
   accuracy-test-1-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
python/sglang/test/test_utils.py  (view file @ a4331cd2)

@@ -42,6 +42,9 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B"


 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -538,6 +541,7 @@ def run_bench_serving(
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
+    disable_ignore_eos=False,
     need_warmup=False,
 ):
     # Launch the server
@@ -572,7 +576,7 @@ def run_bench_serving(
         disable_stream=disable_stream,
         return_logprob=False,
         seed=0,
-        disable_ignore_eos=False,
+        disable_ignore_eos=disable_ignore_eos,
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
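The two new constants name the EAGLE target/draft model pair, and the new disable_ignore_eos keyword is threaded from run_bench_serving into the underlying benchmark arguments. A minimal caller sketch, assuming the helper signature shown above; it mirrors the test_online_latency_eagle test added further down, and the flag values are the ones used there:

    # Sketch of a run_bench_serving caller that exercises the new EAGLE options.
    from sglang.test.test_utils import (
        DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
        DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
        run_bench_serving,
    )

    res = run_bench_serving(
        model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
        num_prompts=50,
        request_rate=1,
        disable_ignore_eos=True,  # new parameter: stop at real EOS tokens
        dataset_name="sharegpt",
        other_server_args=[
            "--speculative-algorithm", "EAGLE",
            "--speculative-draft-model-path", DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
            "--speculative-num-steps", "5",
            "--speculative-eagle-topk", "8",
            "--speculative-num-draft-tokens", "64",
            "--mem-fraction-static", "0.7",
        ],
    )
    print(res["median_e2e_latency_ms"])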
test/srt/models/test_qwen_models.py  (view file @ a4331cd2)

@@ -37,8 +37,7 @@ class TestQwen2(unittest.TestCase):
             port=int(self.base_url.split(":")[-1]),
         )

         metrics = run_eval(args)
-        print(metrics)
-
+        print(f"{metrics=}")
         self.assertGreater(metrics["accuracy"], 0.81)
@@ -69,8 +68,7 @@ class TestQwen2FP8(unittest.TestCase):
             port=int(self.base_url.split(":")[-1]),
         )

         metrics = run_eval(args)
-        print(metrics)
-
+        print(f"{metrics=}")
         self.assertGreater(metrics["accuracy"], 0.79)
test/srt/test_bench_one_batch.py  (view file @ a4331cd2)

@@ -5,24 +5,46 @@ from sglang.test.test_utils import (
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
 )


 class TestBenchOneBatch(unittest.TestCase):
-    def test_default(self):
+    def test_bs1(self):
         output_throughput = run_bench_one_batch(DEFAULT_MODEL_NAME_FOR_TEST, [])

+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs1\n"
+                f"output_throughput : {output_throughput:.2f} token/s\n"
+            )
+
         self.assertGreater(output_throughput, 135)

-    def test_moe_default(self):
+    def test_moe_tp2_bs1(self):
         output_throughput = run_bench_one_batch(
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2"]
         )

+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_moe_tp2_bs1\n"
+                f"output_throughput : {output_throughput:.2f} token/s\n"
+            )
+
         self.assertGreater(output_throughput, 125)

+    def test_torch_compile_tp2_bs1(self):
+        output_throughput = run_bench_one_batch(
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            ["--tp", "2", "--enable-torch-compile", "--cuda-graph-max-bs", "2"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_torch_compile_tp2_bs1\n"
+                f"output_throughput : {output_throughput:.2f} token/s\n"
+            )
+
+        self.assertGreater(output_throughput, 240)


 if __name__ == "__main__":
     unittest.main()
test/srt/test_bench_serving.py  (view file @ a4331cd2)

 import unittest

 from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
     DEFAULT_FP8_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     ...
@@ -47,7 +49,7 @@ class TestBenchServing(unittest.TestCase):
         )

         # There is a regression with torch 2.5
         # This number was 950 for torch 2.4
-        self.assertGreater(res["output_throughput"], 800)
+        self.assertGreater(res["output_throughput"], 850)

     def test_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
@@ -131,6 +133,36 @@
         self.assertLess(res["median_ttft_ms"], 86)
         self.assertLess(res["median_itl_ms"], 10)

+    def test_online_latency_eagle(self):
+        res = run_bench_serving(
+            model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            num_prompts=50,
+            request_rate=1,
+            disable_ignore_eos=True,
+            dataset_name="sharegpt",
+            other_server_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                "5",
+                "--speculative-eagle-topk",
+                "8",
+                "--speculative-num-draft-tokens",
+                "64",
+                "--mem-fraction-static",
+                "0.7",
+            ],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_online_latency_eagle\n"
+                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
+            )
+
+        self.assertLess(res["median_e2e_latency_ms"], 10000)

     def test_moe_offline_throughput_default(self):
         res = run_bench_serving(
             model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
test/srt/test_eagle_infer.py  (view file @ a4331cd2)

 import multiprocessing
 import random
 import threading
 import time
 import unittest
 from types import SimpleNamespace

 import requests
 from transformers import AutoConfig, AutoTokenizer

 import sglang as sgl
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
     DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     popen_launch_server,
     ...
@@ -19,60 +23,59 @@ class TestEAGLEEngine(unittest.TestCase):
     def test_eagle_accuracy(self):
         prompt = "Today is a sunny day and I like"
-        target_model_path = "meta-llama/Llama-2-7b-chat-hf"
-        speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B"
         sampling_params = {"temperature": 0, "max_new_tokens": 8}

+        # Get the reference output
+        ref_engine = sgl.Engine(model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
+        ref_output = ref_engine.generate(prompt, sampling_params)["text"]
+        ref_engine.shutdown()
+
+        # Launch EAGLE engine
         engine = sgl.Engine(
-            model_path=target_model_path,
-            speculative_draft_model_path=speculative_draft_model_path,
+            model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            speculative_draft_model_path=DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
             speculative_algorithm="EAGLE",
-            speculative_num_steps=3,
-            speculative_eagle_topk=4,
-            speculative_num_draft_tokens=16,
+            speculative_num_steps=5,
+            speculative_eagle_topk=8,
+            speculative_num_draft_tokens=64,
+            mem_fraction_static=0.7,
         )

-        out1 = engine.generate(prompt, sampling_params)["text"]
-        engine.shutdown()
-
-        engine = sgl.Engine(model_path=target_model_path)
-        out2 = engine.generate(prompt, sampling_params)["text"]
-        engine.shutdown()
-
-        print("==== Answer 1 ====")
-        print(out1)
-        print("==== Answer 2 ====")
-        print(out2)
-        self.assertEqual(out1, out2)
+        # Case 1: Test the output of EAGLE engine is the same as normal engine
+        out1 = engine.generate(prompt, sampling_params)["text"]
+        print(f"{out1=}, {ref_output=}")
+        self.assertEqual(out1, ref_output)

-    def test_eagle_end_check(self):
+        # Case 2: Test the output of EAGLE engine does not contain unexpected EOS
         prompt = "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like [/INST]"
-        target_model_path = "meta-llama/Llama-2-7b-chat-hf"
-        tokenizer = AutoTokenizer.from_pretrained(target_model_path)
-        speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B"
         sampling_params = {
             "temperature": 0,
             "max_new_tokens": 1024,
             "skip_special_tokens": False,
         }

-        engine = sgl.Engine(
-            model_path=target_model_path,
-            speculative_draft_model_path=speculative_draft_model_path,
-            speculative_algorithm="EAGLE",
-            speculative_num_steps=3,
-            speculative_eagle_topk=4,
-            speculative_num_draft_tokens=16,
-        )
-        out1 = engine.generate(prompt, sampling_params)["text"]
-        engine.shutdown()
-        print("==== Answer 1 ====")
-        print(repr(out1))
-        tokens = tokenizer.encode(out1, truncation=False)
+        tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
+        out2 = engine.generate(prompt, sampling_params)["text"]
+        print(f"{out2=}")
+        tokens = tokenizer.encode(out2, truncation=False)
         assert tokenizer.eos_token_id not in tokens

+        # Case 3: Batched prompts
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = {"temperature": 0, "max_new_tokens": 30}
+        outputs = engine.generate(prompts, sampling_params)
+        for prompt, output in zip(prompts, outputs):
+            print("===============================")
+            print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+        # Shutdown the engine
+        engine.shutdown()


 prompts = [
     "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]"
     ...
@@ -83,64 +86,27 @@ prompts = [
 ]


-def process(server_url: str):
-    time.sleep(random.uniform(0, 2))
-    for prompt in prompts:
-        url = server_url
-        data = {
-            "model": "base",
-            "text": prompt,
-            "sampling_params": {
-                "temperature": 0,
-                "max_new_tokens": 1024,
-            },
-        }
-        response = requests.post(url, json=data)
-        assert response.status_code == 200
-
-
-def abort_process(server_url: str):
-    for prompt in prompts:
-        try:
-            time.sleep(1)
-            url = server_url
-            data = {
-                "model": "base",
-                "text": prompt,
-                "sampling_params": {
-                    "temperature": 0,
-                    "max_new_tokens": 1024,
-                },
-            }
-            # set timeout = 1s, mock disconnected
-            requests.post(url, json=data, timeout=1)
-        except:
-            pass
-
-
-class TestEAGLELaunchServer(unittest.TestCase):
+class TestEAGLEServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B"
-        cls.model = "meta-llama/Llama-2-7b-chat-hf"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
-            cls.model,
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
             other_args=[
                 "--speculative-algorithm",
                 "EAGLE",
                 "--speculative-draft-model-path",
-                speculative_draft_model_path,
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                 "--speculative-num-steps",
-                "3",
+                "5",
                 "--speculative-eagle-topk",
-                "4",
+                "8",
                 "--speculative-num-draft-tokens",
-                "16",
-                "--served-model-name",
-                "base",
+                "64",
+                "--mem-fraction-static",
+                "0.7",
             ],
         )
@@ -148,40 +114,67 @@ class TestEAGLELaunchServer(unittest.TestCase):
     def tearDownClass(cls):
         kill_process_tree(cls.process.pid)

-    def test_eagle_server_concurrency(self):
-        concurrency = 4
-        processes = [
-            multiprocessing.Process(
-                target=process,
-                kwargs={"server_url": self.base_url + "/generate"},
-            )
-            for _ in range(concurrency)
-        ]
-        for worker in processes:
-            worker.start()
-        for p in processes:
-            p.join()
-
-    def test_eagle_server_request_abort(self):
+    def send_request(self):
         time.sleep(random.uniform(0, 2))
         for prompt in prompts:
             url = self.base_url + "/generate"
             data = {
                 "text": prompt,
                 "sampling_params": {
                     "temperature": 0,
                     "max_new_tokens": 1024,
                 },
             }
             response = requests.post(url, json=data)
             assert response.status_code == 200

+    def send_requests_abort(self):
         for prompt in prompts:
             try:
                 time.sleep(random.uniform(0, 2))
                 url = self.base_url + "/generate"
                 data = {
                     "model": "base",
                     "text": prompt,
                     "sampling_params": {
                         "temperature": 0,
                         "max_new_tokens": 1024,
                     },
                 }
                 # set timeout = 1s, mock disconnected
                 requests.post(url, json=data, timeout=1)
-            except:
-                pass
+            except Exception as e:
+                print(e)
+                pass

+    def test_request_abort(self):
         concurrency = 4
-        processes = [
-            multiprocessing.Process(
-                target=process,
-                kwargs={"server_url": self.base_url + "/generate"},
-            )
-            for _ in range(concurrency)
-        ] + [
-            multiprocessing.Process(
-                target=abort_process,
-                kwargs={"server_url": self.base_url + "/generate"},
-            )
-            for _ in range(concurrency)
-        ]
-        for worker in processes:
+        threads = [
+            threading.Thread(target=self.send_request) for _ in range(concurrency)
+        ] + [
+            threading.Thread(target=self.send_requests_abort)
+            for _ in range(concurrency)
+        ]
+        for worker in threads:
             worker.start()
-        for p in processes:
+        for p in threads:
             p.join()

+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.20)


 if __name__ == "__main__":
     unittest.main()
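Stripped of the assertions, the offline-engine test above boils down to launching an EAGLE speculative-decoding engine and generating from it. A minimal sketch, assuming the same target/draft checkpoints and keyword arguments used in the test (the prompt and sampling parameters are the ones the test uses):

    # Condensed from test_eagle_accuracy above; not part of this commit.
    import sglang as sgl

    engine = sgl.Engine(
        model_path="meta-llama/Llama-2-7b-chat-hf",                          # target model
        speculative_draft_model_path="lmzheng/sglang-EAGLE-llama2-chat-7B",  # EAGLE draft model
        speculative_algorithm="EAGLE",
        speculative_num_steps=5,
        speculative_eagle_topk=8,
        speculative_num_draft_tokens=64,
        mem_fraction_static=0.7,
    )
    out = engine.generate(
        "Today is a sunny day and I like",
        {"temperature": 0, "max_new_tokens": 8},
    )
    print(out["text"])
    engine.shutdown()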
test/srt/test_torch_compile.py  (view file @ a4331cd2)

@@ -23,7 +23,7 @@ class TestTorchCompile(unittest.TestCase):
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--enable-torch-compile"],
+            other_args=["--enable-torch-compile", "--cuda-graph-max-bs", "4"],
         )

     @classmethod