Commit 7e61737d (unverified)
Authored Oct 02, 2025 by Vedant V Jhaveri; committed by GitHub on Oct 02, 2025
Parent: 3c699772

[Generative Scores API] add performance tests to CICD (#10830)

Showing 3 changed files with 250 additions and 0 deletions (+250, -0):

.github/workflows/pr-test.yml       +33   -0
python/sglang/test/test_utils.py    +149  -0
test/srt/test_bench_serving.py      +68   -0
.github/workflows/pr-test.yml @ 7e61737d

...
@@ -460,6 +460,39 @@ jobs:
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

  performance-test-1-gpu-part-3:
    needs: [check-changes, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

      - name: Benchmark Scores online latency and throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

      - name: Benchmark Scores online latency and throughput (batch size scaling)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling

  performance-test-2-gpu:
    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
...
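The two unittest targets invoked by the new performance-test-1-gpu-part-3 job are thin wrappers around the run_score_benchmark helper added in python/sglang/test/test_utils.py below. As a rough illustration only (not part of the commit), the helper can also be called directly; this sketch assumes a 1-GPU machine with the sglang test dependencies already installed, and uses a smaller request count than the CI test:

    from sglang.test.test_utils import (
        DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        run_score_benchmark,
    )

    # Launches a server for the reranker model, sends score requests, and
    # returns latency/throughput statistics (the same keys the CI test asserts on).
    res = run_score_benchmark(
        model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
        num_requests=100,  # the CI test uses 1000; fewer requests keeps a local run short
        batch_size=10,
        need_warmup=True,
    )
    print(res["avg_latency_ms"], res["p95_latency_ms"], res["throughput"])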
python/sglang/test/test_utils.py @ 7e61737d

...
@@ -43,6 +43,7 @@ from sglang.utils import get_exception_traceback

DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
...

@@ -873,6 +874,154 @@ def run_bench_serving(
    return res


def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Score API benchmark function compatible with run_bench_serving pattern"""
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def generate_text_with_token_count(num_tokens):
            """Generate text with precise token count using replicated token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        if need_warmup:
            warmup_data = {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens) for _ in range(3)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

            async with aiohttp.ClientSession() as session:
                try:
                    await session.post(
                        f"{base_url}/v1/score",
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    )
                except:
                    pass  # Ignore warmup errors

        test_requests = []
        for i in range(num_requests):
            query = generate_text_with_token_count(score_query_tokens)
            items = [
                generate_text_with_token_count(score_item_tokens)
                for _ in range(batch_size)
            ]

            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }
            test_requests.append(score_data)

        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        f"{base_url}/v1/score",
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    continue

        end_time = time.monotonic()
        total_time = end_time - start_time

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0

            return {
                "completed": successful_requests,
                "total_requests": num_requests,
                "throughput": throughput,
                "avg_latency_ms": avg_latency,
                "p95_latency_ms": p95_latency,
                "successful_requests": successful_requests,
            }
        else:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res


def run_bench_serving_multi(
    model,
    base_url,
...
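For reference, run_score_benchmark drives the server's /v1/score endpoint with the payload shape shown above. The following standalone sketch (not part of the commit) sends one such request; it assumes a server is already running, the URL and port are illustrative, and the label token ids (Yes/No) and model name are taken from the benchmark:

    import requests

    base_url = "http://127.0.0.1:30000"  # illustrative; point this at your running server
    payload = {
        "query": "Is the item relevant to the query?",
        "items": ["candidate passage one", "candidate passage two"],
        "label_token_ids": [9454, 2753],  # Yes/No token ids used by the benchmark
        "model": "Qwen/Qwen3-Reranker-0.6B",
        "apply_softmax": True,
    }
    resp = requests.post(f"{base_url}/v1/score", json=payload, timeout=30)
    print(resp.status_code, resp.json())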
test/srt/test_bench_serving.py @ 7e61737d

...
@@ -4,17 +4,20 @@ import unittest

import requests

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
    CustomTestCase,
    is_in_amd_ci,
    is_in_ci,
    run_bench_serving,
    run_score_benchmark,
    write_github_step_summary,
)
...

@@ -440,6 +443,71 @@ class TestBenchServing(CustomTestCase):
            )
            self.assertGreater(res["input_throughput"], 4000)

    def test_score_api_latency_throughput(self):
        """Test score API latency and throughput performance"""
        res = run_score_benchmark(
            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
            num_requests=1000,
            batch_size=10,
            other_server_args=[],
            need_warmup=True,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_score_api_throughput\n"
                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                f"Score API throughput: {res['throughput']:.2f} req/s\n"
                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
            )

        self.assertEqual(res["successful_requests"], res["total_requests"])
        self.assertLess(res["avg_latency_ms"], 48)
        self.assertLess(res["p95_latency_ms"], 50)
        self.assertGreater(res["throughput"], 20)

    def test_score_api_batch_scaling(self):
        """Test score API performance with different batch sizes"""
        batch_sizes = [10, 25, 50]

        for batch_size in batch_sizes:
            res = run_score_benchmark(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
                num_requests=500,
                batch_size=batch_size,
            )

            if is_in_ci():
                write_github_step_summary(
                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
                    f"Batch size: {batch_size}\n"
                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                    f"Throughput: {res['throughput']:.2f} req/s\n"
                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
                )

            self.assertEqual(res["successful_requests"], res["total_requests"])

            if batch_size == 10:
                avg_latency_bound = 45
            elif batch_size == 25:
                avg_latency_bound = 50
            elif batch_size == 50:
                avg_latency_bound = 60
            else:
                avg_latency_bound = 60
            self.assertLess(res["avg_latency_ms"], avg_latency_bound)

            if batch_size == 10:
                p95_latency_bound = 50
            elif batch_size == 25:
                p95_latency_bound = 60
            elif batch_size == 50:
                p95_latency_bound = 65
            else:
                p95_latency_bound = 65
            self.assertLess(res["p95_latency_ms"], p95_latency_bound)


if __name__ == "__main__":
    unittest.main()
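To run only the two new Score API benchmarks rather than the whole TestBenchServing suite, one option is the command lines from the workflow above; an equivalent programmatic sketch (run from test/srt, assuming the same environment the CI job sets up) might look like this:

    import unittest

    loader = unittest.defaultTestLoader
    suite = unittest.TestSuite(
        [
            loader.loadTestsFromName(
                "test_bench_serving.TestBenchServing.test_score_api_latency_throughput"
            ),
            loader.loadTestsFromName(
                "test_bench_serving.TestBenchServing.test_score_api_batch_scaling"
            ),
        ]
    )
    unittest.TextTestRunner(verbosity=2).run(suite)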