sglang · Commit da681f35 (unverified)
Authored Oct 17, 2025 by Yineng Zhang; committed by GitHub on Oct 17, 2025.

Revert "Set csgmv as default lora backend. (#11488)" (#11735)

Parent: 9b0f725b

Showing 11 changed files with 23 additions and 11 deletions (+23, -11):
    benchmark/lora/launch_server.py         +1 -1
    python/sglang/srt/server_args.py        +1 -1
    python/sglang/test/runners.py           +1 -1
    test/srt/lora/test_lora.py              +4 -2
    test/srt/lora/test_lora_cuda_graph.py   +2 -0
    test/srt/lora/test_lora_eviction.py     +2 -0
    test/srt/lora/test_lora_qwen3.py        +4 -2
    test/srt/lora/test_lora_radix_cache.py  +3 -0
    test/srt/lora/test_lora_tp.py           +1 -0
    test/srt/lora/test_lora_update.py       +1 -1
    test/srt/lora/utils.py                  +3 -3
benchmark/lora/launch_server.py

@@ -53,7 +53,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--lora-backend",
         type=str,
-        default="csgmv",
+        default="triton",
     )
     parser.add_argument(
         "--tp-size",
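The practical effect of this hunk is that the benchmark script falls back to the Triton kernels unless told otherwise. A minimal, self-contained sketch of that behavior, mirroring the argparse setup above (the --tp-size type and default are assumptions, not shown in this hunk):

import argparse

# Mirrors the reverted default in benchmark/lora/launch_server.py.
parser = argparse.ArgumentParser()
parser.add_argument("--lora-backend", type=str, default="triton")
parser.add_argument("--tp-size", type=int, default=1)  # type/default assumed for illustration

# With "triton" as the default again, csgmv must be requested explicitly:
args = parser.parse_args(["--lora-backend", "csgmv"])
assert args.lora_backend == "csgmv"

args = parser.parse_args([])
assert args.lora_backend == "triton"  # the reverted default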
python/sglang/srt/server_args.py

@@ -309,8 +309,8 @@ class ServerArgs:
     ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
-    lora_backend: str = "csgmv"
     lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
+    lora_backend: str = "triton"
     max_lora_chunk_size: Optional[int] = 16

     # Kernel backend
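Because the ServerArgs default reverts to "triton", the csgmv backend now has to be opted into explicitly. A hedged sketch using the offline engine, assuming sglang.Engine forwards keyword arguments into ServerArgs; the model and adapter paths below are placeholders, not values from this commit:

import sglang as sgl

# lora_backend defaults to "triton" again after this revert,
# so csgmv is selected explicitly here.
llm = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder base model
    lora_paths=["/path/to/my-lora-adapter"],        # placeholder adapter path
    lora_backend="csgmv",
)

A server launch would do the same by passing --lora-backend csgmv, the flag shown in the benchmark hunk above.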
python/sglang/test/runners.py

@@ -496,7 +496,7 @@ class SRTRunner:
         attention_backend: Optional[str] = None,
         prefill_attention_backend: Optional[str] = None,
         decode_attention_backend: Optional[str] = None,
-        lora_backend: str = "csgmv",
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         disable_radix_cache: bool = False,
         chunked_prefill_size: Optional[int] = None,
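The SRTRunner default used by the test harness reverts as well, so a test that wants to exercise the csgmv kernels has to request it. A rough sketch of such a call; the keyword names come from the hunks in this commit, while the model and adapter names are placeholders:

import torch
from sglang.test.runners import SRTRunner

runner = SRTRunner(
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder base model
    torch_dtype=torch.float16,
    model_type="generation",
    lora_paths=["my-lora-adapter"],      # placeholder adapter name
    lora_backend="csgmv",                # override the reverted "triton" default
)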
test/srt/lora/test_lora.py

@@ -81,12 +81,13 @@ class TestLoRA(CustomTestCase):
         for model_case in model_cases:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 32
+                backend = "triton"
                 base_path = model_case.base
                 lora_adapter_paths = [a.name for a in model_case.adaptors]
                 assert len(lora_adapter_paths) >= 2

                 print(
-                    f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---"
+                    f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
                 )

                 # Initialize runners

@@ -96,6 +97,7 @@ class TestLoRA(CustomTestCase):
                     model_type="generation",
                     lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                     max_loras_per_batch=len(lora_adapter_paths) + 1,
+                    lora_backend=backend,
                     sleep_on_idle=True,
                     # Eliminate non-determinism by forcing all requests to be processed in one batch.
                     attention_backend="torch_native",
                 )

@@ -140,7 +142,7 @@ class TestLoRA(CustomTestCase):
                     if rouge_score < rouge_tol:
                         raise AssertionError(
                             f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
-                            f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'"
+                            f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
                         )

                     print(f"--- Batch {i} Comparison Passed --- ")
test/srt/lora/test_lora_cuda_graph.py

@@ -62,6 +62,7 @@ class TestLoRACudaGraph(CustomTestCase):
                     model_case,
                     torch_dtype,
                     max_new_tokens=32,
+                    backend="triton",
                     disable_cuda_graph=True,
                     test_tag="without_cuda_graph",
                 )

@@ -76,6 +77,7 @@ class TestLoRACudaGraph(CustomTestCase):
                     model_case,
                     torch_dtype,
                     max_new_tokens=32,
+                    backend="triton",
                     disable_cuda_graph=False,
                     test_tag="cuda_graph_padding",
                 )
test/srt/lora/test_lora_eviction.py

@@ -83,6 +83,7 @@ class TestLoRAEviction(CustomTestCase):
     ):
         REUSED_LORA_NAME = "lora"
         max_new_tokens = 256
+        backend = "triton"
         torch_dtype = torch.float16
         base_path = BASE_MODEL
         assert len(lora_paths) >= 2

@@ -95,6 +96,7 @@ class TestLoRAEviction(CustomTestCase):
             model_type="generation",
             lora_paths=initial_lora_paths,
             max_loras_per_batch=1,
+            lora_backend=backend,
             enable_lora=True,
             max_lora_rank=256,
             lora_target_modules=["all"],
test/srt/lora/test_lora_qwen3.py

@@ -71,6 +71,7 @@ class TestLoRAQwen3(CustomTestCase):
         for model_case in model_cases:
             for torch_dtype in TORCH_DTYPES:
                 max_new_tokens = 32
+                backend = "triton"
                 base_path = model_case.base
                 lora_adapter_paths = [a.name for a in model_case.adaptors]
                 assert len(lora_adapter_paths) >= 2

@@ -127,7 +128,7 @@ class TestLoRAQwen3(CustomTestCase):
                 ]

                 print(
-                    f"\n========== Testing multiple batches on base '{base_path}', dtype={torch_dtype} ---"
+                    f"\n========== Testing multiple batches on base '{base_path}' with backend={backend}, dtype={torch_dtype} ---"
                 )

                 # Initialize runners

@@ -138,6 +139,7 @@ class TestLoRAQwen3(CustomTestCase):
                     model_type="generation",
                     lora_paths=[lora_adapter_paths[0], lora_adapter_paths[1]],
                     max_loras_per_batch=len(lora_adapter_paths) + 1,
+                    lora_backend=backend,
                     sleep_on_idle=True,
                     # Eliminate non-determinism by forcing all requests to be processed in one batch.
                     attention_backend="torch_native",
                 )

@@ -181,7 +183,7 @@ class TestLoRAQwen3(CustomTestCase):
                     if rouge_score < rouge_tol:
                         raise AssertionError(
                             f"ROUGE-L score {rouge_score} below tolerance {rouge_tol} "
-                            f"for base '{base_path}', adaptor '{lora_paths}', prompt: '{prompts}...'"
+                            f"for base '{base_path}', adaptor '{lora_paths}', backend '{backend}', prompt: '{prompts}...'"
                         )

                     print(f"--- Batch {i + 1} Comparison Passed --- ")
test/srt/lora/test_lora_radix_cache.py

@@ -44,6 +44,7 @@ class TestLoRARadixCache(CustomTestCase):
         torch_dtype = torch.float16
         max_new_tokens = 32
+        backend = "triton"
         batch_prompts = (
             PROMPTS
             if not model_case.skip_long_prompt

@@ -56,6 +57,7 @@ class TestLoRARadixCache(CustomTestCase):
             model_case,
             torch_dtype,
             max_new_tokens=max_new_tokens,
+            backend=backend,
             disable_radix_cache=False,
             test_tag="lora-with-radix-cache",
         )

@@ -66,6 +68,7 @@ class TestLoRARadixCache(CustomTestCase):
             model_case,
             torch_dtype,
             max_new_tokens=max_new_tokens,
+            backend=backend,
             disable_radix_cache=True,
             test_tag="lora-without-radix-cache",
         )
test/srt/lora/test_lora_tp.py

@@ -48,6 +48,7 @@ class TestLoRATP(CustomTestCase):
                     model_case,
                     torch_dtype,
                     max_new_tokens=32,
+                    backend="triton",
                     test_tag=f"tp={tp_size}",
                 )
test/srt/lora/test_lora_update.py

@@ -763,7 +763,7 @@ class LoRAUpdateTestSessionBase:
         max_lora_rank: Optional[int],
         enable_lora: Optional[bool] = None,
         lora_target_modules: Optional[List[str]] = None,
-        lora_backend: str = "csgmv",
+        lora_backend: str = "triton",
         disable_cuda_graph: bool = False,
         cuda_graph_max_bs: int = 4,
     ):
test/srt/lora/utils.py

@@ -14,7 +14,7 @@
 import dataclasses
 import random
-from typing import List, Optional
+from typing import List

 import torch

@@ -50,7 +50,7 @@ class LoRAModelCase:

 TORCH_DTYPES = [torch.float16]
-BACKENDS = ["triton", "csgmv"]
+BACKENDS = ["triton"]
 DEFAULT_PROMPTS = [
     "AI is a field of computer science focused on",
     """

@@ -135,7 +135,7 @@ def run_lora_test_one_by_one(
     model_case: LoRAModelCase,
     torch_dtype: torch.dtype,
     max_new_tokens: int,
-    backend: str = "csgmv",
+    backend: str,
     disable_cuda_graph: bool = False,
     disable_radix_cache: bool = False,
     mem_fraction_static: float = 0.88,