Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6a9cceb2
Unverified
Commit
6a9cceb2
authored
Mar 19, 2026
by
Duyi-Wang
Committed by
GitHub
Mar 19, 2026
Browse files
[Bugfix][ROCm] Fix MoRI + AITER FP8 dispatch compatibility for defer_input_quant (#37418)
Signed-off-by:
Duyi-Wang
<
duyi.wang@amd.com
>
parent
199f9141
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
9 additions
and
7 deletions
+9
-7
vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py
.../model_executor/layers/fused_moe/mori_prepare_finalize.py
+3
-6
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+6
-1
No files found.
vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py
View file @
6a9cceb2
...
@@ -70,16 +70,13 @@ class MoriPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
...
@@ -70,16 +70,13 @@ class MoriPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular):
- Optional dispatched expert topk IDs
- Optional dispatched expert topk IDs
- Optional dispatched expert topk weight
- Optional dispatched expert topk weight
"""
"""
if
defer_input_quant
:
raise
NotImplementedError
(
f
"
{
self
.
__class__
.
__name__
}
does not support defer_input_quant=True. "
"Please select an MoE kernel that accepts quantized inputs."
)
assert
not
apply_router_weight_on_input
,
(
assert
not
apply_router_weight_on_input
,
(
"mori does not support apply_router_weight_on_input=True now."
"mori does not support apply_router_weight_on_input=True now."
)
)
scale
=
None
scale
=
None
if
self
.
use_fp8_dispatch
:
# When defer_input_quant is True, the expert kernel handles
# quantization internally, so skip FP8 dispatch quantization.
if
self
.
use_fp8_dispatch
and
not
defer_input_quant
:
from
aiter
import
QuantType
,
get_hip_quant
from
aiter
import
QuantType
,
get_hip_quant
if
quant_config
.
is_block_quantized
:
if
quant_config
.
is_block_quantized
:
...
...
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
View file @
6a9cceb2
...
@@ -295,7 +295,12 @@ def rocm_aiter_fused_experts(
...
@@ -295,7 +295,12 @@ def rocm_aiter_fused_experts(
class
AiterExperts
(
mk
.
FusedMoEExpertsModular
):
class
AiterExperts
(
mk
.
FusedMoEExpertsModular
):
@
property
@
property
def
expects_unquantized_inputs
(
self
)
->
bool
:
def
expects_unquantized_inputs
(
self
)
->
bool
:
return
True
# When paired with MoRI, the prepare/finalize handles FP8
# quantization during dispatch to reduce network traffic,
# so we should not defer input quantization.
# Otherwise, AITER fused MoE kernels handle input quantization
# internally via a single fused kernel.
return
not
self
.
moe_config
.
use_mori_kernels
@
staticmethod
@
staticmethod
def
activation_format
()
->
mk
.
FusedMoEActivationFormat
:
def
activation_format
()
->
mk
.
FusedMoEActivationFormat
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment