Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d1135a50
Unverified
Commit
d1135a50
authored
Apr 19, 2026
by
danisereb
Committed by
GitHub
Apr 19, 2026
Browse files
Fix MoE backend selection for LoRA (unquantized MoE) (#40273)
Signed-off-by:
Daniel Serebrenik
<
daserebrenik@nvidia.com
>
parent
982beae8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
90 additions
and
0 deletions
+90
-0
tests/kernels/moe/test_unquantized_backend_selection.py
tests/kernels/moe/test_unquantized_backend_selection.py
+85
-0
vllm/model_executor/layers/fused_moe/oracle/unquantized.py
vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+5
-0
No files found.
tests/kernels/moe/test_unquantized_backend_selection.py
View file @
d1135a50
...
@@ -11,6 +11,11 @@ from vllm.model_executor.layers.fused_moe.oracle.unquantized import (
...
@@ -11,6 +11,11 @@ from vllm.model_executor.layers.fused_moe.oracle.unquantized import (
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
# Reusable marker: skip a test unless the current platform reports CUDA or ROCm.
skipif_not_cuda_rocm = pytest.mark.skipif(
    not current_platform.is_cuda() and not current_platform.is_rocm(),
    reason="Only supported on CUDA/ROCm platforms.",
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"platform_method,expected_backend"
,
"platform_method,expected_backend"
,
...
@@ -190,3 +195,83 @@ def test_select_cuda_flashinfer_cutlass_backend(
...
@@ -190,3 +195,83 @@ def test_select_cuda_flashinfer_cutlass_backend(
assert
selected_backend
==
UnquantizedMoeBackend
.
FLASHINFER_CUTLASS
assert
selected_backend
==
UnquantizedMoeBackend
.
FLASHINFER_CUTLASS
assert
experts_cls
is
not
None
assert
experts_cls
is
not
None
@skipif_not_cuda_rocm
def test_select_lora_backend_prefers_triton():
    """Enabling LoRA on the dummy config must make the selector return Triton."""
    cfg = make_dummy_moe_config()
    cfg.is_lora_enabled = True

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=cfg)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
@skipif_not_cuda_rocm
def test_select_lora_explicit_non_triton_backend():
    """An explicitly requested non-Triton backend is replaced by Triton under LoRA."""
    cfg = make_dummy_moe_config()
    cfg.is_lora_enabled = True
    # The backend name is one of the strings mapped by map_unquantized_backend().
    cfg.moe_backend = "flashinfer_cutlass"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=cfg)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
@skipif_not_cuda_rocm
@pytest.mark.parametrize("is_lora_enabled", [False, True])
def test_select_explicit_triton_backend(is_lora_enabled):
    """Requesting "triton" explicitly yields Triton, with LoRA on or off."""
    cfg = make_dummy_moe_config()
    cfg.is_lora_enabled = is_lora_enabled
    cfg.moe_backend = "triton"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=cfg)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
@skipif_not_cuda_rocm
def test_select_explicit_triton_ignores_flashinfer_env(monkeypatch):
    """An explicit "triton" request wins even when the FlashInfer env vars are set."""
    # Set the FlashInfer MoE environment variables for the duration of this test.
    flashinfer_env = (
        ("VLLM_USE_FLASHINFER_MOE_FP16", "1"),
        ("VLLM_FLASHINFER_MOE_BACKEND", "throughput"),
    )
    for var_name, var_value in flashinfer_env:
        monkeypatch.setenv(var_name, var_value)

    cfg = make_dummy_moe_config()
    cfg.is_lora_enabled = False
    cfg.moe_backend = "triton"

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=cfg)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
@skipif_not_cuda_rocm
def test_select_lora_ignores_flashinfer_env(monkeypatch):
    """With LoRA enabled, Triton is still selected when the FlashInfer env vars are set."""
    # Set the FlashInfer MoE environment variables for the duration of this test.
    flashinfer_env = (
        ("VLLM_USE_FLASHINFER_MOE_FP16", "1"),
        ("VLLM_FLASHINFER_MOE_BACKEND", "throughput"),
    )
    for var_name, var_value in flashinfer_env:
        monkeypatch.setenv(var_name, var_value)

    cfg = make_dummy_moe_config()
    cfg.is_lora_enabled = True

    backend, kernel_cls = select_unquantized_moe_backend(moe_config=cfg)

    assert backend == UnquantizedMoeBackend.TRITON
    assert kernel_cls is not None
vllm/model_executor/layers/fused_moe/oracle/unquantized.py
View file @
d1135a50
...
@@ -163,6 +163,11 @@ def select_unquantized_moe_backend(
...
@@ -163,6 +163,11 @@ def select_unquantized_moe_backend(
if
current_platform
.
is_out_of_tree
():
if
current_platform
.
is_out_of_tree
():
return
UnquantizedMoeBackend
.
OOT
,
None
return
UnquantizedMoeBackend
.
OOT
,
None
if
moe_config
.
is_lora_enabled
:
return
UnquantizedMoeBackend
.
TRITON
,
backend_to_kernel_cls
(
UnquantizedMoeBackend
.
TRITON
)
# NOTE: the kernels are selected in the following order.
# NOTE: the kernels are selected in the following order.
AVAILABLE_BACKENDS
=
_get_priority_backends
(
moe_config
)
AVAILABLE_BACKENDS
=
_get_priority_backends
(
moe_config
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment