sglang, commit 5c705b1d (parent b7094a5e)
Authored Jul 26, 2025 by Lifu Huang; committed via GitHub on Jul 26, 2025

Add perf tests for LoRA (#8314)
Showing 3 changed files with 177 additions and 21 deletions (+177 -21)
.github/workflows/pr-test.yml      +7   -0
python/sglang/test/test_utils.py   +30   -5
test/srt/test_bench_serving.py     +140  -16
.github/workflows/pr-test.yml

@@ -174,6 +174,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
 
+      - name: Benchmark online latency (LoRA)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+
   performance-test-1-gpu-part-2:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
python/sglang/test/test_utils.py

 """Common utilities for testing and benchmarking"""
 
 import argparse
+import asyncio
 import copy
 import json
 import logging
@@ -15,7 +16,7 @@ from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
-from typing import Callable, List, Optional, Tuple
+from typing import Awaitable, Callable, List, Optional, Tuple
 
 import numpy as np
 import requests
@@ -714,6 +715,7 @@ def get_benchmark_args(
     seed: int = 0,
     device="auto",
     pd_separated: bool = False,
+    lora_name=None,
 ):
     return SimpleNamespace(
         backend="sglang",
@@ -741,7 +743,7 @@ def get_benchmark_args(
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
-        lora_name=None,
+        lora_name=lora_name,
         prompt_suffix="",
         device=device,
         pd_separated=pd_separated,
@@ -764,6 +766,8 @@ def run_bench_serving(
     need_warmup=False,
     seed: int = 0,
     device="auto",
+    background_task: Optional[Callable[[str, asyncio.Event, asyncio.Event], Awaitable[None]]] = None,
+    lora_name: Optional[str] = None,
 ):
     if device == "auto":
         device = auto_config_device()
@@ -791,14 +795,35 @@ def run_bench_serving(
         disable_ignore_eos=disable_ignore_eos,
         seed=seed,
         device=device,
+        lora_name=lora_name,
     )
 
-    try:
+    async def _run():
         if need_warmup:
             warmup_args = copy.deepcopy(args)
             warmup_args.num_prompts = 16
-            run_benchmark(warmup_args)
-
-        res = run_benchmark(args)
+            await asyncio.to_thread(run_benchmark, warmup_args)
+
+        start_event = asyncio.Event()
+        stop_event = asyncio.Event()
+        task_handle = (
+            asyncio.create_task(background_task(base_url, start_event, stop_event))
+            if background_task
+            else None
+        )
+
+        try:
+            start_event.set()
+            result = await asyncio.to_thread(run_benchmark, args)
+        finally:
+            if task_handle:
+                stop_event.set()
+                await task_handle
+        return result
+
+    try:
+        res = asyncio.run(_run())
     finally:
         kill_process_tree(process.pid)
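For context, the new background_task hook lets a caller run a coroutine alongside the benchmark: run_benchmark is pushed onto a worker thread with asyncio.to_thread, the coroutine is scheduled with asyncio.create_task, and the start/stop events bracket the benchmark run. A minimal sketch of a caller is below; it is not part of this commit, and the health-polling task, its endpoint use, and the chosen argument values are illustrative assumptions.

import asyncio

import requests

from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, run_bench_serving


async def poll_health(base_url: str, start_event: asyncio.Event, stop_event: asyncio.Event):
    # Hypothetical side task: wait until the benchmark starts, then hit the
    # server's /health endpoint once per second until the benchmark finishes.
    await start_event.wait()
    while not stop_event.is_set():
        await asyncio.to_thread(requests.get, f"{base_url}/health")
        await asyncio.sleep(1)


res = run_bench_serving(
    model=DEFAULT_MODEL_NAME_FOR_TEST,
    num_prompts=100,
    request_rate=4,
    other_server_args=[],
    background_task=poll_health,  # runs concurrently with the benchmark requests
)
print(res["median_e2e_latency_ms"])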
test/srt/test_bench_serving.py

import asyncio
import itertools
import unittest
from random import random, uniform

import requests

from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
@@ -16,7 +21,6 @@ from sglang.test.test_utils import (
 class TestBenchServing(CustomTestCase):
 
     def test_offline_throughput_default(self):
         res = run_bench_serving(
             model=DEFAULT_MODEL_NAME_FOR_TEST,
@@ -28,7 +32,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 3050)
@@ -51,7 +55,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_non_stream_small_batch_size\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
 
         self.assertGreater(res["output_throughput"], 1050)
@@ -66,7 +70,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 3050)
@@ -84,7 +88,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_without_chunked_prefill\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
 
         self.assertGreater(res["output_throughput"], 2600)
@@ -104,7 +108,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_with_triton_attention_backend\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 3500)
@@ -122,7 +126,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default_fp8\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 3500)
@@ -140,7 +144,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_default\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
             )
         self.assertLess(res["median_e2e_latency_ms"], 11000)
         if is_in_amd_ci():
@@ -164,7 +168,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_vlm_offline_throughput\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 2000)
@@ -187,7 +191,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_vlm_online_latency\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
             )
         self.assertLess(res["median_e2e_latency_ms"], 16500)
         if is_in_amd_ci():
@@ -197,6 +201,126 @@ class TestBenchServing(CustomTestCase):
             self.assertLess(res["median_ttft_ms"], 100)
             self.assertLess(res["median_itl_ms"], 8)
 
+    def test_lora_online_latency(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=False)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+
+        self.assertLess(res["median_e2e_latency_ms"], 2400)
+        self.assertLess(res["median_ttft_ms"], 58)
+
+    def test_lora_online_latency_with_concurrent_adapter_updates(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=True)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+
+        self.assertLess(res["median_e2e_latency_ms"], 4000)
+        # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
+        self.assertLess(res["median_ttft_ms"], 1600)
+
+    def _run_lora_latency_test(self, enable_background_task: bool):
+        """Run a latency test for LoRA with the specified background task setting."""
+
+        async def lora_loader_unloader_task(
+            base_url: str,
+            start_event: asyncio.Event,
+            stop_event: asyncio.Event,
+        ):
+            """A background task that repeatedly loads and unloads a LoRA adapter."""
+            await start_event.wait()
+
+            path_cycler = itertools.cycle(
+                [
+                    "pbevan11/llama-3.1-8b-ocr-correction",
+                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ]
+            )
+            load_url = f"{base_url}/load_lora_adapter"
+            unload_url = f"{base_url}/unload_lora_adapter"
+
+            num_updates = 0
+            while not stop_event.is_set():
+                # 1. Load the LoRA adapter
+                lora_path = next(path_cycler)
+                response = await asyncio.to_thread(
+                    requests.post,
+                    load_url,
+                    json={"lora_name": lora_path, "lora_path": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to load LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                if stop_event.is_set():
+                    break
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+                # 2. Unload the LoRA adapter
+                response = await asyncio.to_thread(
+                    requests.post,
+                    unload_url,
+                    json={"lora_name": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+        background_task = lora_loader_unloader_task if enable_background_task else None
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=400,
+            request_rate=8,
+            other_server_args=[
+                "--enable-lora",
+                "--max-loras-per-batch",
+                "1",
+                "--disable-radix-cache",
+                "--random-seed",
+                "42",
+                "--mem-fraction-static",
+                "0.8",
+                "--lora-paths",
+                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                "--max-lora-rank",
+                "256",
+            ],
+            dataset_name="random",
+            random_input_len=256,
+            random_output_len=256,
+            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+            background_task=background_task,
+        )
+
+        return res
+
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
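The loader/unloader task above exercises the server's dynamic LoRA update routes, /load_lora_adapter and /unload_lora_adapter. Outside the test harness, the same routes can be hit directly against a server started with --enable-lora; the sketch below is not from this commit, and the base URL is a placeholder for a locally running server (the adapter name is one of those cycled by the test).

import requests

base_url = "http://127.0.0.1:30000"  # placeholder: a local server started with --enable-lora
adapter = "philschmid/code-llama-3-1-8b-text-to-sql-lora"

# Register the adapter; subsequent requests can select it by lora_name.
resp = requests.post(
    f"{base_url}/load_lora_adapter",
    json={"lora_name": adapter, "lora_path": adapter},
)
resp.raise_for_status()

# ... send inference or benchmark traffic that references the adapter ...

# Unload the adapter when it is no longer needed.
resp = requests.post(f"{base_url}/unload_lora_adapter", json={"lora_name": adapter})
resp.raise_for_status()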
@@ -226,8 +350,8 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_eagle\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
-                f'accept_length: {res["accept_length"]:.2f}\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"accept_length: {res['accept_length']:.2f}\n"
             )
         if is_in_amd_ci():
             self.assertLess(res["median_e2e_latency_ms"], 1800)
@@ -246,7 +370,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 2100)
@@ -264,7 +388,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
         if is_in_amd_ci():
             self.assertGreater(res["output_throughput"], 2100)
@@ -286,7 +410,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_offline_throughput_default_decode\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
 
         self.assertGreater(res["output_throughput"], 6700)
@@ -311,7 +435,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_long_context_latency_prefill\n"
-                f'input_throughput: {res["input_throughput"]:.2f} ms\n'
+                f"input_throughput: {res['input_throughput']:.2f} ms\n"
             )
 
         self.assertGreater(res["input_throughput"], 4000)