sglang · Commits · Commit 22352d47 (Unverified)

Improve streaming, log_level, memory report, weight loading, and benchmark script (#7632)

Authored Jun 29, 2025 by Lianmin Zheng; committed via GitHub on Jun 29, 2025.
Co-authored-by: Kan Wu <wukanustc@gmail.com>
Parent: c5131f7a

Changes: 24 files in total; this page shows 4 changed files with 165 additions and 5 deletions (+165, -5):
- python/sglang/srt/warmup.py (+12, -3)
- scripts/playground/replay_request_dump.py (+150, -0, new file)
- test/srt/run_suite.py (+2, -1)
- test/srt/test_vision_chunked_prefill.py (+1, -1)
python/sglang/srt/warmup.py

@@ -4,6 +4,7 @@ from typing import List

 import numpy as np
 import tqdm

+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager

@@ -20,17 +21,21 @@ def warmup(name: str) -> callable:
     return decorator


-async def execute_warmups(warmup_names: List[str], tokenizer_manager: TokenizerManager):
+async def execute_warmups(
+    disaggregation_mode: str,
+    warmup_names: List[str],
+    tokenizer_manager: TokenizerManager,
+):
     for warmup_name in warmup_names:
         if warmup_name not in _warmup_registry:
             logger.warning(f"Could not find custom warmup {warmup_name}")
             continue
         logger.info(f"Running warmup {warmup_name}")
-        await _warmup_registry[warmup_name](tokenizer_manager)
+        await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager)


 @warmup("voice_chat")
-async def voice_chat(tokenizer_manager: TokenizerManager):
+async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
     # this warms up the fused_moe triton kernels and caches them
     # if we don't do this we break real time inference for voice chat
     for i in tqdm.trange(1, 512):

@@ -44,4 +49,8 @@ async def voice_chat(tokenizer_manager: TokenizerManager):
             "min_p": 0.0,
         },
     )
+    if disaggregation_mode != "null":
+        generate_req_input.bootstrap_room = 0
+        generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
     await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
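For context, a minimal sketch of a call site under the new execute_warmups() signature. This is an illustration only: server_args and its disaggregation_mode / warmups fields are assumed names, not something this diff shows.

# Hypothetical call site; server_args fields are assumptions for illustration.
from sglang.srt.warmup import execute_warmups


async def run_startup_warmups(server_args, tokenizer_manager):
    # "null" is the non-disaggregated case; only prefill/decode deployments
    # trigger the bootstrap_room / bootstrap_host fields added above.
    await execute_warmups(
        server_args.disaggregation_mode,  # e.g. "null", "prefill", or "decode"
        server_args.warmups.split(","),   # e.g. "voice_chat"
        tokenizer_manager,
    )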
scripts/playground/replay_request_dump.py (new file, 0 → 100644)

"""
Usage:
# replay from a folder
python3 replay_request_dump.py --file-number 100 --parallel 512 --input-folder /data/lianmin/sglang_request_dump/grok-mini-0220-engine-5756f8f94-28bm6/

# replay from a single file
python3 replay_request_dump.py --parallel 512 --input-file /data/sglang_crash_dump/memx-cti-34-sr1.xpop.twttr.net/crash_dump_2025-06-04_20-13-18.pkl
"""

import argparse
import glob
import json
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import asdict
from datetime import datetime

import requests

from sglang.bench_serving import set_ulimit
from sglang.utils import get_exception_traceback


def read_records(files):
    records = []
    for f in files:
        tmp = pickle.load(open(f, "rb"))
        if isinstance(tmp, dict) and "requests" in tmp:
            records.extend(tmp["requests"])
        else:
            records.extend(tmp)
    return records


def run_one_request_internal(record):
    (req, output, replay_init_time, start_time, end_time, idx) = record

    # Wait until the recorded (relative) receive time before firing the request.
    time.sleep(max(0, start_time - (time.time() - replay_init_time)))

    if "completion_tokens" in output.get("meta_info", {}):
        recorded_completion_tokens = output["meta_info"]["completion_tokens"]
    else:
        recorded_completion_tokens = ""

    json_data = asdict(req)
    stream = json_data["stream"]
    if args.ignore_eos:
        json_data["sampling_params"]["ignore_eos"] = True
        if recorded_completion_tokens:
            json_data["sampling_params"]["max_new_tokens"] = recorded_completion_tokens

    response = requests.post(
        f"http://{args.host}:{args.port}/generate",
        json=json_data,
        stream=stream,
    )

    if stream:
        for chunk in response.iter_lines(decode_unicode=False):
            chunk = chunk.decode("utf-8")
            if chunk and chunk.startswith("data:"):
                if chunk == "data: [DONE]":
                    break
                ret = json.loads(chunk[5:].strip("\n"))
    else:
        ret = response.json()

    prompt_tokens = ret["meta_info"]["prompt_tokens"]
    completion_tokens = ret["meta_info"]["completion_tokens"]
    print(
        f"{idx=}, {start_time=:.2f}, {prompt_tokens=}, "
        f"{completion_tokens=}, {recorded_completion_tokens=}"
    )


def run_one_request(record):
    # global success_ct, error_ct
    try:
        run_one_request_internal(record)
        # success_ct += 1
    except Exception:
        # error_ct += 1
        traceback = get_exception_traceback()
        print(f"Hit an exception: {traceback}")


def main(records):
    if len(records) == 0:
        return

    base_time = records[0][-2]
    base_time_str = datetime.fromtimestamp(base_time).strftime("%y-%m-%d %H:%M:%S")
    print(f"{base_time_str=}")

    # Re-pack each record with the replay start time, a receive time relative
    # to the first request, and its index.
    replay_init_time = time.time()
    for i in range(len(records)):
        req, output, start_time, end_time = records[i]
        start_time -= base_time
        records[i] = (req, output, replay_init_time, start_time, end_time, i)

    with ThreadPoolExecutor(args.parallel) as executor:
        executor.map(run_one_request, records)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=30000)
    parser.add_argument(
        "--input-folder", type=str, default=None, help="Folder containing pickle files"
    )
    parser.add_argument(
        "--input-file", type=str, default=None, help="Single pickle file to process"
    )
    parser.add_argument("--file-number", type=int, default=1)
    parser.add_argument("--req-number", type=int, default=1000000)
    parser.add_argument("--req-start", type=int, default=0)
    parser.add_argument("--parallel", type=int, default=512)
    parser.add_argument("--idx", type=int, default=None)
    parser.add_argument("--ignore-eos", action="store_true")
    args = parser.parse_args()

    set_ulimit()

    files = []
    if args.input_file:
        files = [args.input_file]
        if args.file_number > 1:
            print("Warning: --file-number is ignored when --input-file is provided.")
    elif args.input_folder:
        files = glob.glob(f"{args.input_folder}/*.pkl")
        files = files[: args.file_number]
    else:
        print("Error: Either --input-folder or --input-file must be provided.")
        exit(1)
    print(f"{files=}")

    records = read_records(files)

    # Sort by the receive time, before filtering
    records.sort(key=lambda x: x[-2])
    records = records[args.req_start :]

    if args.idx:
        records = [records[args.idx]]
        print(f"testing {args.idx=}")
        print(f"{records[0]}")

    print(f"{len(records)=}")
    main(records)
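As a sanity check on the record layout, here is a minimal sketch of a dump file the script above can replay. The shape of each record (request dataclass, output dict with meta_info, receive time, finish time) is read off main(); using GenerateReqInput as the request type is an assumption based on sglang's io_struct, not something this script pins down.

# A minimal, hand-built dump this script should be able to replay.
# Each record is (request, output, receive_time, finish_time); main()
# re-packs it with replay_init_time and an index before dispatching.
import pickle
import time

from sglang.srt.managers.io_struct import GenerateReqInput  # assumed request type

req = GenerateReqInput(
    text="Hello, world",
    sampling_params={"max_new_tokens": 8, "temperature": 0.0},
    stream=False,
)
output = {"meta_info": {"completion_tokens": 8}}  # caps max_new_tokens on replay
now = time.time()

with open("dummy_dump.pkl", "wb") as f:
    # read_records() also accepts a dict wrapper: {"requests": [...]}.
    pickle.dump([(req, output, now, now + 1.0)], f)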
test/srt/run_suite.py

@@ -173,10 +173,11 @@ suites = {
         # TestFile("test_deepep_intranode.py", 50),
         # TestFile("test_deepep_low_latency.py", 50),
         # TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
+        # Disabled because it hangs on the CI.
+        # TestFile("test_moe_ep.py", 181),
         TestFile("test_disaggregation.py", 270),
         TestFile("test_disaggregation_different_tp.py", 155),
         TestFile("test_full_deepseek_v3.py", 463),
-        TestFile("test_moe_ep.py", 181),
     ],
     "per-commit-8-gpu-amd": [
         TestFile("test_full_deepseek_v3.py", 250),
test/srt/test_vision_chunked_prefill.py

@@ -178,7 +178,7 @@ class TestVisionChunkedPrefill(CustomTestCase):
         print(output_chunked)
         print("output without chunked prefill:")
         print(output_no_chunked)
-        assert output_chunked == output_no_chunked
+        self.assertEqual(output_chunked, output_no_chunked)

     def test_chunked_prefill(self):
         self._test_chunked_prefill(batches=[False, True], num_frames=[1, [2, 6, 8, 10]])
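A side note on the assertEqual change: on failure, unittest's assertEqual reports both values (and a diff for strings and sequences), while a bare assert only raises AssertionError. A standalone sketch of the idiom:

import unittest


class Demo(unittest.TestCase):
    def test_outputs_match(self):
        # On mismatch, assertEqual prints both values plus a diff,
        # which is what the change above buys over `assert a == b`.
        self.assertEqual("chunked output", "chunked output")


if __name__ == "__main__":
    unittest.main()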