sglang · Commits · 5c705b1d

Commit 5c705b1d (unverified), authored Jul 26, 2025 by Lifu Huang, committed by GitHub on Jul 26, 2025
Parent: b7094a5e

Add perf tests for LoRA (#8314)
Showing 3 changed files with 177 additions and 21 deletions:

  .github/workflows/pr-test.yml       +7    -0
  python/sglang/test/test_utils.py    +30   -5
  test/srt/test_bench_serving.py      +140  -16
.github/workflows/pr-test.yml

@@ -174,6 +174,13 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
 
+      - name: Benchmark online latency (LoRA)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+
   performance-test-1-gpu-part-2:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
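The new CI step simply runs the two unittest cases added in this commit. Outside CI, the same tests can be driven programmatically; a minimal sketch, assuming an sglang checkout with test/srt as the working directory and a suitable GPU environment:

# Hypothetical local runner for the new LoRA benchmark tests; mirrors the
# `python3 -m unittest ...` commands added to pr-test.yml above.
import unittest

loader = unittest.defaultTestLoader
suite = unittest.TestSuite(
    [
        loader.loadTestsFromName(
            "test_bench_serving.TestBenchServing.test_lora_online_latency"
        ),
        loader.loadTestsFromName(
            "test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates"
        ),
    ]
)
unittest.TextTestRunner(verbosity=2).run(suite)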
python/sglang/test/test_utils.py

 """Common utilities for testing and benchmarking"""
 
 import argparse
+import asyncio
 import copy
 import json
 import logging
 
@@ -15,7 +16,7 @@ from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from functools import partial
 from types import SimpleNamespace
-from typing import Callable, List, Optional, Tuple
+from typing import Awaitable, Callable, List, Optional, Tuple
 
 import numpy as np
 import requests
 
@@ -714,6 +715,7 @@ def get_benchmark_args(
     seed: int = 0,
     device="auto",
     pd_separated: bool = False,
+    lora_name=None,
 ):
     return SimpleNamespace(
         backend="sglang",
 
@@ -741,7 +743,7 @@ def get_benchmark_args(
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
-        lora_name=None,
+        lora_name=lora_name,
         prompt_suffix="",
         device=device,
         pd_separated=pd_separated,
 
@@ -764,6 +766,8 @@ def run_bench_serving(
     need_warmup=False,
     seed: int = 0,
     device="auto",
+    background_task: Optional[Callable[[str, asyncio.Event, asyncio.Event], Awaitable[None]]] = None,
+    lora_name: Optional[str] = None,
 ):
     if device == "auto":
         device = auto_config_device()
 
@@ -791,14 +795,35 @@ def run_bench_serving(
         disable_ignore_eos=disable_ignore_eos,
         seed=seed,
         device=device,
+        lora_name=lora_name,
     )
 
-    try:
-        if need_warmup:
-            warmup_args = copy.deepcopy(args)
-            warmup_args.num_prompts = 16
-            run_benchmark(warmup_args)
-
-        res = run_benchmark(args)
+    async def _run():
+        if need_warmup:
+            warmup_args = copy.deepcopy(args)
+            warmup_args.num_prompts = 16
+            await asyncio.to_thread(run_benchmark, warmup_args)
+
+        start_event = asyncio.Event()
+        stop_event = asyncio.Event()
+        task_handle = (
+            asyncio.create_task(background_task(base_url, start_event, stop_event))
+            if background_task
+            else None
+        )
+
+        try:
+            start_event.set()
+            result = await asyncio.to_thread(run_benchmark, args)
+        finally:
+            if task_handle:
+                stop_event.set()
+                await task_handle
+
+        return result
+
+    try:
+        res = asyncio.run(_run())
     finally:
         kill_process_tree(process.pid)
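The new `background_task` hook hands the caller a pair of asyncio events: `_run()` sets `start_event` once the benchmark is launched on a worker thread and sets `stop_event` (then awaits the task) when it finishes. The handshake can be exercised in isolation; a minimal, self-contained sketch where the URL and the blocking workload are stand-ins, not sglang APIs:

import asyncio
import time


async def example_background_task(
    base_url: str, start_event: asyncio.Event, stop_event: asyncio.Event
):
    # Stand-in for a task like lora_loader_unloader_task: wait until the
    # benchmark starts, then do periodic work until asked to stop.
    await start_event.wait()
    while not stop_event.is_set():
        print(f"background work against {base_url}")
        await asyncio.sleep(0.2)


def blocking_benchmark():
    # Stand-in for run_benchmark(args); runs on a worker thread via asyncio.to_thread.
    time.sleep(1.0)
    return {"ok": True}


async def _run():
    start_event = asyncio.Event()
    stop_event = asyncio.Event()
    task_handle = asyncio.create_task(
        example_background_task("http://127.0.0.1:30000", start_event, stop_event)
    )
    try:
        start_event.set()  # let the background task begin
        result = await asyncio.to_thread(blocking_benchmark)
    finally:
        stop_event.set()  # ask the background task to wind down
        await task_handle
    return result


print(asyncio.run(_run()))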
test/srt/test_bench_serving.py

+import asyncio
+import itertools
 import unittest
+from random import random, uniform
 
+import requests
 
 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
 
@@ -16,7 +21,6 @@ from sglang.test.test_utils import (
 
 class TestBenchServing(CustomTestCase):
     def test_offline_throughput_default(self):
         res = run_bench_serving(
             model=DEFAULT_MODEL_NAME_FOR_TEST,
 
@@ -28,7 +32,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3050)
 
@@ -51,7 +55,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_non_stream_small_batch_size\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
 
             self.assertGreater(res["output_throughput"], 1050)
 
@@ -66,7 +70,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3050)
 
@@ -84,7 +88,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_without_chunked_prefill\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
 
             self.assertGreater(res["output_throughput"], 2600)
 
@@ -104,7 +108,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_with_triton_attention_backend\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3500)
 
@@ -122,7 +126,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_offline_throughput_default_fp8\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 3500)
 
@@ -140,7 +144,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_default\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
             )
             self.assertLess(res["median_e2e_latency_ms"], 11000)
             if is_in_amd_ci():
 
@@ -164,7 +168,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_vlm_offline_throughput\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2000)
 
@@ -187,7 +191,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_vlm_online_latency\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
             )
             self.assertLess(res["median_e2e_latency_ms"], 16500)
             if is_in_amd_ci():
@@ -197,6 +201,126 @@ class TestBenchServing(CustomTestCase):
             self.assertLess(res["median_ttft_ms"], 100)
             self.assertLess(res["median_itl_ms"], 8)
 
+    def test_lora_online_latency(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=False)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 2400)
+            self.assertLess(res["median_ttft_ms"], 58)
+
+    def test_lora_online_latency_with_concurrent_adapter_updates(self):
+        # TODO (lifuhuang): verify LoRA support in AMD.
+        if is_in_amd_ci():
+            pass
+
+        res = self._run_lora_latency_test(enable_background_task=True)
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 4000)
+            # TODO (lifuhuang): This will be fixed by the overlapped LoRA update in a separate PR.
+            self.assertLess(res["median_ttft_ms"], 1600)
+
+    def _run_lora_latency_test(self, enable_background_task: bool):
+        """
+        Run a latency test for LoRA with the specified background task setting.
+        """
+
+        async def lora_loader_unloader_task(
+            base_url: str,
+            start_event: asyncio.Event,
+            stop_event: asyncio.Event,
+        ):
+            """
+            A background task that repeatedly loads and unloads a LoRA adapter.
+            """
+            await start_event.wait()
+
+            path_cycler = itertools.cycle(
+                [
+                    "pbevan11/llama-3.1-8b-ocr-correction",
+                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ]
+            )
+            load_url = f"{base_url}/load_lora_adapter"
+            unload_url = f"{base_url}/unload_lora_adapter"
+
+            num_updates = 0
+            while not stop_event.is_set():
+                # 1. Load the LoRA adapter
+                lora_path = next(path_cycler)
+                response = await asyncio.to_thread(
+                    requests.post,
+                    load_url,
+                    json={"lora_name": lora_path, "lora_path": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to load LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                if stop_event.is_set():
+                    break
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+                # 2. Unload the LoRA adapter
+                response = await asyncio.to_thread(
+                    requests.post,
+                    unload_url,
+                    json={"lora_name": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
+                )
+                num_updates += 1
+
+                # Yield control to allow other tasks to run.
+                await asyncio.sleep(1)
+
+        background_task = lora_loader_unloader_task if enable_background_task else None
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=400,
+            request_rate=8,
+            other_server_args=[
+                "--enable-lora",
+                "--max-loras-per-batch",
+                "1",
+                "--disable-radix-cache",
+                "--random-seed",
+                "42",
+                "--mem-fraction-static",
+                "0.8",
+                "--lora-paths",
+                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                "--max-lora-rank",
+                "256",
+            ],
+            dataset_name="random",
+            random_input_len=256,
+            random_output_len=256,
+            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+            background_task=background_task,
+        )
+        return res
+
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
@@ -226,8 +350,8 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_online_latency_eagle\n"
-                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
-                f'accept_length: {res["accept_length"]:.2f}\n'
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"accept_length: {res['accept_length']:.2f}\n"
             )
             if is_in_amd_ci():
                 self.assertLess(res["median_e2e_latency_ms"], 1800)
 
@@ -246,7 +370,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_default\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2100)
 
@@ -264,7 +388,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_moe_offline_throughput_without_radix_cache\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
             if is_in_amd_ci():
                 self.assertGreater(res["output_throughput"], 2100)
 
@@ -286,7 +410,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_offline_throughput_default_decode\n"
-                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
             )
 
             self.assertGreater(res["output_throughput"], 6700)
 
@@ -311,7 +435,7 @@ class TestBenchServing(CustomTestCase):
         if is_in_ci():
             write_github_step_summary(
                 f"### test_pp_long_context_latency_prefill\n"
-                f'input_throughput: {res["input_throughput"]:.2f} ms\n'
+                f"input_throughput: {res['input_throughput']:.2f} ms\n"
             )
 
             self.assertGreater(res["input_throughput"], 4000)
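The background task above drives the server's dynamic-adapter endpoints (/load_lora_adapter and /unload_lora_adapter). The same calls can be issued ad hoc against a server started with the flags used in _run_lora_latency_test; a small sketch, where the base URL is an assumed local address:

import requests

base_url = "http://127.0.0.1:30000"  # assumed local server address
lora_path = "philschmid/code-llama-3-1-8b-text-to-sql-lora"  # one of the adapters cycled in the test

# Load the adapter, registering it under the same name used as its path.
resp = requests.post(
    f"{base_url}/load_lora_adapter",
    json={"lora_name": lora_path, "lora_path": lora_path},
)
assert resp.ok, f"Failed to load LoRA adapter: {resp.text}"

# ... issue benchmark or ad-hoc requests that reference the adapter here ...

# Unload it again, mirroring the second half of the load/unload cycle.
resp = requests.post(f"{base_url}/unload_lora_adapter", json={"lora_name": lora_path})
assert resp.ok, f"Failed to unload LoRA adapter: {resp.text}"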