Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e5f599d4
Unverified
Commit
e5f599d4
authored
Nov 11, 2025
by
Michael Goin
Committed by
GitHub
Nov 11, 2025
Browse files
[Bugfix] Disable shared expert overlap if Marlin MoE is used (#28410)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
28534b92
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
13 additions
and
5 deletions
+13
-5
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+4
-0
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+5
-5
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_marlin.py
+1
-0
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
...quantization/compressed_tensors/compressed_tensors_moe.py
+1
-0
vllm/model_executor/layers/quantization/gptq_marlin.py
vllm/model_executor/layers/quantization/gptq_marlin.py
+1
-0
vllm/model_executor/layers/quantization/mxfp4.py
vllm/model_executor/layers/quantization/mxfp4.py
+1
-0
No files found.
vllm/model_executor/layers/fused_moe/layer.py
View file @
e5f599d4
...
...
@@ -678,6 +678,10 @@ class FusedMoE(CustomOp):
and
self
.
moe_config
.
use_flashinfer_cutlass_kernels
)
@
property
def
use_marlin_kernels
(
self
):
return
getattr
(
self
.
quant_method
,
"use_marlin"
,
False
)
@
property
def
use_dp_chunking
(
self
)
->
bool
:
return
(
...
...
vllm/model_executor/layers/fused_moe/shared_fused_moe.py
View file @
e5f599d4
...
...
@@ -28,17 +28,17 @@ class SharedFusedMoE(FusedMoE):
super
().
__init__
(
**
kwargs
)
self
.
_shared_experts
=
shared_experts
# Disable shared expert overlap if we are using eplb, because of
# correctness issues, or if using flashinfer with DP, since there
# is nothing to be gained in this case. Disabling the overlap
# optimization also prevents the shared experts from being hidden
# from torch.compile.
# Disable shared expert overlap if:
# - we are using eplb, because of correctness issues
# - we are using flashinfer with DP, since there is nothing to gain
# - we are using marlin kernels
self
.
use_overlapped
=
(
use_overlapped
and
not
(
# TODO(wentao): find the root cause and remove this condition
self
.
enable_eplb
or
(
self
.
moe_config
.
use_flashinfer_cutlass_kernels
and
self
.
dp_size
>
1
)
or
self
.
use_marlin_kernels
)
and
self
.
_shared_experts
is
not
None
)
...
...
vllm/model_executor/layers/quantization/awq_marlin.py
View file @
e5f599d4
...
...
@@ -424,6 +424,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
if
self
.
quant_config
.
weight_bits
!=
4
:
raise
ValueError
(
"AWQMoEMethod only supports 4bit now."
)
self
.
quant_type
=
scalar_types
.
uint4
self
.
use_marlin
=
True
def
create_weights
(
self
,
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
View file @
e5f599d4
...
...
@@ -1342,6 +1342,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
f
"
{
WNA16_SUPPORTED_BITS
}
"
,
)
self
.
quant_type
=
WNA16_SUPPORTED_TYPES_MAP
[
self
.
num_bits
]
self
.
use_marlin
=
True
def
create_weights
(
self
,
...
...
vllm/model_executor/layers/quantization/gptq_marlin.py
View file @
e5f599d4
...
...
@@ -482,6 +482,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
self
.
quant_type
=
scalar_types
.
uint8b128
else
:
raise
ValueError
(
"GPTQMarlinMoEMethod only supports int4 and int8 now."
)
self
.
use_marlin
=
True
def
create_weights
(
self
,
...
...
vllm/model_executor/layers/quantization/mxfp4.py
View file @
e5f599d4
...
...
@@ -216,6 +216,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
def
__init__
(
self
,
moe
:
FusedMoEConfig
):
super
().
__init__
(
moe
)
self
.
mxfp4_backend
=
get_mxfp4_backend
(
moe
.
is_lora_enabled
)
self
.
use_marlin
=
self
.
mxfp4_backend
==
Mxfp4Backend
.
MARLIN
self
.
max_capture_size
=
(
get_current_vllm_config
().
compilation_config
.
max_cudagraph_capture_size
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment