sglang · commit 0c0779d6 (unverified)

ci: improve nightly-ci (#11385)

Authored Oct 13, 2025 by Mick; committed via GitHub on Oct 12, 2025. Parent commit: a55cf530.

6 changed files with 76 additions and 54 deletions (+76 −54):
.github/workflows/nightly-test.yml         +2  −2
python/sglang/bench_one_batch_server.py    +40 −25
python/sglang/bench_serving.py             +10 −11
test/srt/test_nightly_text_models_perf.py  +7  −5
test/srt/test_nightly_vlms_mmmu_eval.py    +6  −4
test/srt/test_nightly_vlms_perf.py         +11 −7
.github/workflows/nightly-test.yml

@@ -62,7 +62,7 @@ jobs:
   nightly-test-eval-vlms:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4

@@ -79,7 +79,7 @@ jobs:
   nightly-test-perf-vlms:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
python/sglang/bench_one_batch_server.py

@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
 import numpy as np
 import requests
 from pydantic import BaseModel
+from transformers import AutoProcessor, PreTrainedTokenizer

 from sglang.bench_serving import (
+    get_processor,
     get_tokenizer,
     sample_mmmu_requests,
     sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
         if self.profile_links.extend or self.profile_links.decode:
             # Create a combined link or use the first available one
             trace_files = [self.profile_links.extend, self.profile_links.decode]
+            if any(trace_file is None for trace_file in trace_files):
+                logger.error("Some trace files are None", f"{trace_files=}")
             trace_files_relay_links = [
-                f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                (
+                    f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    if trace_file
+                    else "N/A"
+                )
                 for trace_file in trace_files
             ]
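The new guard logs when one of the two traces is missing, and the conditional expression inside the list comprehension renders "N/A" instead of failing on a None trace file. A minimal, self-contained sketch of the same pattern (`make_relay_link` is a hypothetical stand-in for `get_perfetto_relay_link_from_trace_file`):

```python
from typing import List, Optional


def make_relay_link(trace_file: str) -> str:
    # Hypothetical stand-in for get_perfetto_relay_link_from_trace_file().
    return f"https://example.invalid/perfetto?trace={trace_file}"


def render_trace_cells(trace_files: List[Optional[str]]) -> List[str]:
    # Map each trace file to a markdown link, or "N/A" when the file is missing,
    # instead of formatting a link around None.
    return [
        f"[trace]({make_relay_link(trace_file)})" if trace_file else "N/A"
        for trace_file in trace_files
    ]


print(render_trace_cells(["extend.trace.json.gz", None]))
# ['[trace](https://example.invalid/perfetto?trace=extend.trace.json.gz)', 'N/A']
```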
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
         # Build the row
         return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"

-    @classmethod
-    def generate_markdown_report(
-        cls, trace_dir, results: List["BenchmarkResult"]
-    ) -> str:
-        """Generate a markdown report from a list of BenchmarkResult object from a single run."""
-        import os
-
-        summary = f"### {results[0].model_path}\n"
-
-        # summary += (
-        #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
-        # )
-        summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
-        summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
-
-        # all results should share the same isl & osl
-        for result in results:
-            base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
-            relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
-            relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
-            # base_url = "https://github.com/sgl-project/ci-data/traces"
-            summary += result.to_markdown_row(trace_dir, base_url, relay_base)
-
-        return summary
+
+def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
+    """Generate a markdown report from a list of BenchmarkResult object from a single run."""
+    import os
+
+    summary = f"### {results[0].model_path}\n"
+
+    # summary += (
+    #     f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
+    # )
+    summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
+    summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
+
+    # all results should share the same isl & osl
+    for result in results:
+        base_url = os.getenv(
+            "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
+        ).rstrip("/")
+        relay_base = os.getenv(
+            "PERFETTO_RELAY_URL",
+            "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
+        ).rstrip("/")
+        summary += result.to_markdown_row(trace_dir, base_url, relay_base)
+
+    return summary


 @dataclasses.dataclass
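`generate_markdown_report` becomes a module-level function instead of a `BenchmarkResult` classmethod, and the trace and relay URLs now come from environment variables with non-empty defaults rather than hard-coded overrides. A small sketch of the resulting configuration pattern, using the same defaults that appear in the diff:

```python
import os

# Defaults mirror the ones in the diff; both can be overridden per CI run.
base_url = os.getenv(
    "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
).rstrip("/")
relay_base = os.getenv(
    "PERFETTO_RELAY_URL", "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
).rstrip("/")

# Callers now import the function directly:
#   from sglang.bench_one_batch_server import generate_markdown_report
#   report = generate_markdown_report(trace_dir, results)
print(base_url, relay_base)
```

The nightly tests further down in this commit switch to the imported function accordingly.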
@@ -288,7 +297,7 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
-    tokenizer,
+    tokenizer: PreTrainedTokenizer | AutoProcessor,
     dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         input_requests = sample_mmmu_requests(
             num_requests=batch_size,
-            tokenizer=tokenizer,
+            processor=tokenizer,
             fixed_output_len=output_len,
-            apply_chat_template=True,
             random_sample=False,
         )
     elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         # vlm
         input_ids = []
+        # for vlms, tokenizer is an instance of AutoProcessor
+        tokenizer = tokenizer.tokenizer
         for input_req in input_requests:
             input_ids += [tokenizer.encode(input_req.prompt)]
         payload["image_data"] = [req.image_data for req in input_requests]
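For the mmmu dataset, `tokenizer` is actually a Hugging Face `AutoProcessor`; processors bundle their tokenizer as a `.tokenizer` attribute, so the code unwraps it before calling `encode`. A short sketch of that unwrapping, assuming the example checkpoint's files are available locally or downloadable:

```python
from transformers import AutoProcessor

# Example model id only; any VLM that ships a processor behaves the same way.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# A processor wraps a tokenizer; unwrap it when only token ids are needed.
tokenizer = processor.tokenizer
ids = tokenizer.encode("a small prompt")
print(len(ids))
```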
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         tokenizer_path = server_info["tokenizer_path"]
     elif "prefill" in server_info:
         tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
-    tokenizer = get_tokenizer(tokenizer_path)
+
+    if bench_args.dataset_name == "mmmu":
+        # mmmu implies this is a MLLM
+        tokenizer = get_processor(tokenizer_path)
+    else:
+        tokenizer = get_tokenizer(tokenizer_path)

     # warmup
     if not bench_args.skip_warmup:
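`run_benchmark` now picks the loader based on the dataset: `mmmu` implies a multimodal model, so it loads a processor; every other dataset keeps the plain tokenizer. A minimal sketch of the same dispatch with placeholder loaders (the loader names here are illustrative, not the sglang API):

```python
def load_text_tokenizer(path: str):
    # Placeholder standing in for sglang.bench_serving.get_tokenizer.
    return ("tokenizer", path)


def load_mm_processor(path: str):
    # Placeholder standing in for sglang.bench_serving.get_processor.
    return ("processor", path)


def pick_tokenizer(dataset_name: str, tokenizer_path: str):
    if dataset_name == "mmmu":
        # mmmu implies a multimodal model, so load the full processor.
        return load_mm_processor(tokenizer_path)
    return load_text_tokenizer(tokenizer_path)


print(pick_tokenizer("mmmu", "Qwen/Qwen2.5-VL-7B-Instruct"))
print(pick_tokenizer("random", "meta-llama/Llama-3.1-8B-Instruct"))
```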
python/sglang/bench_serving.py

@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 import argparse
 import asyncio
 import base64
 import io
 import json
 import os
@@ -671,7 +670,7 @@ def get_processor(
     if pretrained_model_name_or_path.endswith(".json") or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_processor
+        from sglang.srt.utils.hf_transformers_utils import get_processor

         return get_processor(pretrained_model_name_or_path)
@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
     for i in range(num_rounds):
         # Add user query for the current round
         chat_history.append(
             {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
         )
         # Form the full prompt from history
@@ -964,7 +963,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
-    processor: AutoProcessor,
+    processor: AutoProcessor | AutoTokenizer,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -973,9 +972,7 @@ def sample_mmmu_requests(
     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.

     Returns:
@@ -1282,11 +1279,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )


-def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
     try:
         content_items = [
-            {"type": "image_url", "image_url": {"url": img_url}}
-            for img_url in images_base64
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
         ]
         content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
@@ -1294,7 +1291,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
             add_generation_prompt=True,
             tokenize=False,
         )
-    except Exception:
+    except Exception as e:
         # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
         prompt_str = f"<image>{text_prompt}"
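`create_mm_data_row` builds chat-style content items (one `{"type": "image", ...}` entry per image plus a trailing text entry), applies the processor's chat template, and on failure now logs the exception and falls back to an explicit `<image>` tag. A self-contained sketch of that fallback path with a stubbed processor (the stub only imitates the failure mode; it is not the transformers API):

```python
class StubProcessor:
    # Illustrative stub: pretends the underlying tokenizer rejects list-valued
    # content, which is the failure mode the fallback path guards against.
    def apply_chat_template(self, messages, add_generation_prompt, tokenize):
        raise TypeError("content as a list is not supported")


def build_prompt(processor, text_prompt: str, images_base64: list) -> str:
    content_items = [
        {"type": "image", "image": {"url": image_base64}}
        for image_base64 in images_base64
    ]
    content_items.append({"type": "text", "text": text_prompt})
    try:
        return processor.apply_chat_template(
            [{"role": "user", "content": content_items}],
            add_generation_prompt=True,
            tokenize=False,
        )
    except Exception as e:
        # Some tokenizers do not support list content; fall back to an <image> tag.
        print(f"Error applying chat template: {e}, fallback to <image> tag")
        return f"<image>{text_prompt}"


print(build_prompt(StubProcessor(), "Describe the picture.", ["data:image/png;base64,AAAA"]))
```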
@@ -1425,7 +1424,7 @@ def sample_image_requests(
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
     print(
         f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
     )
     return dataset
test/srt/test_nightly_text_models_perf.py

@@ -3,7 +3,7 @@ import subprocess
 import time
 import unittest

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
     def test_bench_one_batch(self):
         all_benchmark_results = []
+        all_model_succeed = True

         for model_setup in self.models:
             benchmark_results = []
             with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
                         # Clean up JSON file
                         os.remove(json_output_file)
                     else:
+                        all_model_succeed = False
                         print(f"Warning: JSON output file {json_output_file} not found")
                 finally:
                     kill_process_tree(process.pid)

-            report_part = BenchmarkResult.generate_markdown_report(PROFILE_DIR, benchmark_results)
+            report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
             self.full_report += report_part + "\n"

         if is_in_ci():
             write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")


 if __name__ == "__main__":
     unittest.main()
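Both nightly perf tests now record per-model failures in `all_model_succeed`, finish writing the full report to the GitHub step summary, and only then fail the job, instead of losing the report on the first missing result. A minimal sketch of that collect-then-raise pattern (the model names and the benchmark stub are illustrative):

```python
def run_model_benchmark(model_path: str) -> bool:
    # Illustrative stub: pretend one model produces no JSON output.
    return model_path != "broken/model"


def run_all(models):
    all_model_succeed = True
    report = ""
    for model_path in models:
        ok = run_model_benchmark(model_path)
        report += f"### {model_path}: {'ok' if ok else 'missing results'}\n"
        if not ok:
            all_model_succeed = False
    print(report)  # the real tests write this to the GitHub step summary
    if not all_model_succeed:
        raise AssertionError("Some models failed the perf tests.")


run_all(["meta-llama/Llama-3.1-8B-Instruct", "broken/model"])  # raises after reporting
```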
test/srt/test_nightly_vlms_mmmu_eval.py

 import json
 import unittest
 import warnings
 from functools import partial
 from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@ MODEL_THRESHOLDS = {
     ModelLaunchSettings(
         "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
     ): ModelEvalMetrics(0.305, 23.8),
     ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
     ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
     ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
     ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
-    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
+    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
     ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
     ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+    ModelLaunchSettings(
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
+    ): ModelEvalMetrics(0.29, 29.1),
     ModelLaunchSettings("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics(0.310, 16.7),
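Each `ModelEvalMetrics` entry pairs an MMMU accuracy floor with what reads as a per-model eval-time budget (treat that interpretation as an assumption; the enforcement code is not part of this hunk). A plausible, hypothetical check against such an entry could look like:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class EvalMetrics:
    # Hypothetical mirror of ModelEvalMetrics: an accuracy floor and a time budget.
    min_accuracy: float
    max_eval_time_s: float


def check(model: str, accuracy: float, eval_time_s: float, limits: EvalMetrics) -> None:
    assert accuracy >= limits.min_accuracy, (
        f"{model}: accuracy {accuracy:.3f} below threshold {limits.min_accuracy:.3f}"
    )
    assert eval_time_s <= limits.max_eval_time_s, (
        f"{model}: eval took {eval_time_s:.1f}s, budget is {limits.max_eval_time_s:.1f}s"
    )


check("google/gemma-3n-E4B-it", accuracy=0.37, eval_time_s=16.2, limits=EvalMetrics(0.360, 17.7))
```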
test/srt/test_nightly_vlms_perf.py

@@ -3,7 +3,7 @@ import subprocess
 import unittest
 import warnings

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@ MODEL_DEFAULTS = [
     ModelLaunchSettings(
         "google/gemma-3-27b-it",
     ),
+    ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
     # "OpenGVLab/InternVL2_5-2B",
     # buggy in official transformers impl
     # "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
             cls.models = []
             model_paths = parse_models(nightly_vlm_models_str)
             for model_path in model_paths:
-                cls.models.append(
-                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
-                )
+                cls.models.append(ModelLaunchSettings(model_path))
         else:
             cls.models = MODEL_DEFAULTS
@@ -60,6 +59,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
     def test_bench_one_batch(self):
         all_benchmark_results = []
+        all_model_succeed = True
         for model_setup in self.models:
             benchmark_results = []
@@ -112,7 +112,6 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                         f"Error running benchmark for {model_setup.model_path} with batch size:"
                     )
                     print(result.stderr)
-                    # Continue to next batch size even if one fails
                     continue

                 print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                     )
                 else:
+                    all_model_succeed = False
                     print(f"Warning: JSON output file {json_output_file} not found")
             finally:
                 kill_process_tree(process.pid)

-            report_part = BenchmarkResult.generate_markdown_report(
-                PROFILE_DIR, benchmark_results
-            )
+            report_part = generate_markdown_report(
+                PROFILE_DIR,
+                benchmark_results,
+            )
             self.full_report += report_part + "\n"

         if is_in_ci():
             write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")


 if __name__ == "__main__":
     unittest.main()