Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bc387d5a
Commit
bc387d5a
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1 (fused_moe)
parent
899a2db4
Changes
23
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
16 additions
and
133 deletions
+16
-133
vllm/model_executor/layers/fused_moe/prepare_finalize.py
vllm/model_executor/layers/fused_moe/prepare_finalize.py
+6
-126
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+9
-6
vllm/model_executor/layers/fused_moe/router/base_router.py
vllm/model_executor/layers/fused_moe/router/base_router.py
+1
-1
No files found.
vllm/model_executor/layers/fused_moe/prepare_finalize.py
View file @
bc387d5a
...
@@ -4,133 +4,19 @@
...
@@ -4,133 +4,19 @@
import
torch
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.distributed
import
get_ep_group
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
from
vllm.model_executor.layers.fused_moe.topk_weight_and_reduce
import
(
TopKWeightAndReduceContiguous
,
TopKWeightAndReduceContiguous
,
TopKWeightAndReduceDelegate
,
TopKWeightAndReduceDelegate
,
)
)
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.model_executor.layers.fused_moe.utils
import
moe_kernel_quantize_input
from
vllm.utils.flashinfer
import
nvfp4_block_scale_interleave
class
MoEPrepareAndFinalizeNaiveEP
(
mk
.
FusedMoEPrepareAndFinalize
):
class
MoEPrepareAndFinalizeNoEP
(
mk
.
FusedMoEPrepareAndFinalize
):
def
__init__
(
def
__init__
(
self
,
defer_input_quant
:
bool
=
False
)
->
None
:
self
,
is_sequence_parallel
:
bool
=
False
,
num_dispatchers
:
int
=
1
,
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
is_sequence_parallel
=
is_sequence_parallel
self
.
defer_input_quant
=
defer_input_quant
self
.
_num_dispatchers
=
num_dispatchers
@
property
def
activation_format
(
self
)
->
mk
.
FusedMoEActivationFormat
:
return
mk
.
FusedMoEActivationFormat
.
Standard
def
max_num_tokens_per_rank
(
self
)
->
int
|
None
:
return
None
def
topk_indices_dtype
(
self
)
->
torch
.
dtype
|
None
:
return
None
def
num_dispatchers
(
self
)
->
int
:
return
self
.
_num_dispatchers
def
output_is_reduced
(
self
)
->
bool
:
return
False
def
prepare
(
self
,
a1
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
num_experts
:
int
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
if
apply_router_weight_on_input
:
topk
=
topk_ids
.
size
(
1
)
assert
topk
==
1
,
(
"apply_router_weight_on_input is only implemented for topk=1"
)
# Note: do not use inplace for shared experts overlap
a1
=
a1
*
topk_weights
.
to
(
a1
.
dtype
)
# Defer input quantization to the MoE kernel.
use_nvfp4
=
quant_config
.
use_nvfp4_w4a4
if
defer_input_quant
:
a1q
=
a1
a1q_scale
=
None
else
:
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
quant_config
.
a1_gscale
if
use_nvfp4
else
quant_config
.
a1_scale
,
quant_config
.
quant_dtype
,
quant_config
.
per_act_token_quant
,
quant_config
.
block_shape
,
# NOTE: swizzling pads the scales to multiple of 128
# which makes the scales tensor different shape than
# the hidden states, breaking the A2A kernel. So, we
# delay the swizzling until after the A2A.
is_fp4_scale_swizzled
=
False
,
)
# Skip gathering scales if we have static quantization
# (the scale is a scalar, replicated on all ranks) or
# if quantization is deferred.
skip_gather_scales
=
a1q_scale
is
None
or
a1q_scale
.
ndim
==
0
scales
=
None
if
skip_gather_scales
else
[
a1q_scale
]
res
=
get_ep_group
().
dispatch
(
a1q
,
topk_weights
,
topk_ids
,
is_sequence_parallel
=
self
.
is_sequence_parallel
,
extra_tensors
=
scales
,
)
if
skip_gather_scales
:
a1q
,
topk_weights
,
topk_ids
=
res
else
:
a1q
,
topk_weights
,
topk_ids
,
scales
=
res
assert
scales
is
not
None
and
len
(
scales
)
==
1
a1q_scale
=
scales
[
0
]
if
quant_config
.
quant_dtype
==
"nvfp4"
:
assert
a1q_scale
is
not
None
if
a1q_scale
.
element_size
()
==
1
:
a1q_scale
=
a1q_scale
.
view
(
torch
.
uint8
)
a1q_scale
=
nvfp4_block_scale_interleave
(
a1q_scale
)
return
a1q
,
a1q_scale
,
None
,
topk_ids
,
topk_weights
def
finalize
(
self
,
output
:
torch
.
Tensor
,
fused_expert_output
:
torch
.
Tensor
,
topk_weights
:
torch
.
Tensor
,
topk_ids
:
torch
.
Tensor
,
apply_router_weight_on_input
:
bool
,
weight_and_reduce_impl
:
mk
.
TopKWeightAndReduce
,
)
->
None
:
if
isinstance
(
weight_and_reduce_impl
,
TopKWeightAndReduceDelegate
):
weight_and_reduce_impl
=
TopKWeightAndReduceContiguous
()
out
=
weight_and_reduce_impl
.
apply
(
output
=
None
,
fused_expert_output
=
fused_expert_output
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
)
output
.
copy_
(
get_ep_group
().
combine
(
out
,
is_sequence_parallel
=
self
.
is_sequence_parallel
)
)
class
MoEPrepareAndFinalizeNoEP
(
mk
.
FusedMoEPrepareAndFinalize
):
@
property
@
property
def
activation_format
(
self
)
->
mk
.
FusedMoEActivationFormat
:
def
activation_format
(
self
)
->
mk
.
FusedMoEActivationFormat
:
return
mk
.
FusedMoEActivationFormat
.
Standard
return
mk
.
FusedMoEActivationFormat
.
Standard
...
@@ -156,7 +42,6 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
...
@@ -156,7 +42,6 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
expert_map
:
torch
.
Tensor
|
None
,
expert_map
:
torch
.
Tensor
|
None
,
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
quant_config
:
FusedMoEQuantConfig
,
quant_config
:
FusedMoEQuantConfig
,
defer_input_quant
:
bool
=
False
,
)
->
mk
.
PrepareResultType
:
)
->
mk
.
PrepareResultType
:
if
apply_router_weight_on_input
:
if
apply_router_weight_on_input
:
topk
=
topk_ids
.
size
(
1
)
topk
=
topk_ids
.
size
(
1
)
...
@@ -169,17 +54,12 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
...
@@ -169,17 +54,12 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
# Defer input quant to moe kernel for backends (e.g. AITER, FI)
# Defer input quant to moe kernel for backends (e.g. AITER, FI)
# which use a single kernel call for quant + experts.
# which use a single kernel call for quant + experts.
if
defer_input_quant
:
if
self
.
defer_input_quant
:
return
a1
,
None
,
None
,
None
,
None
return
a1
,
None
,
None
,
None
,
None
input_sf
=
(
quant_config
.
a1_gscale
if
quant_config
.
use_nvfp4_w4a4
else
quant_config
.
a1_scale
)
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1q
,
a1q_scale
=
moe_kernel_quantize_input
(
a1
,
a1
,
input_sf
,
quant_config
.
a1_scale
,
quant_config
.
quant_dtype
,
quant_config
.
quant_dtype
,
quant_config
.
per_act_token_quant
,
quant_config
.
per_act_token_quant
,
quant_config
.
block_shape
,
quant_config
.
block_shape
,
...
...
vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
View file @
bc387d5a
...
@@ -287,14 +287,17 @@ def rocm_aiter_fused_experts(
...
@@ -287,14 +287,17 @@ def rocm_aiter_fused_experts(
class
AiterExperts
(
mk
.
FusedMoEPermuteExpertsUnpermute
):
class
AiterExperts
(
mk
.
FusedMoEPermuteExpertsUnpermute
):
@
property
def
expects_unquantized_inputs
(
self
)
->
bool
:
return
True
@
staticmethod
@
staticmethod
def
activation_format
()
->
mk
.
FusedMoEActivationFormat
:
def
activation_format
()
->
mk
.
FusedMoEActivationFormat
:
return
mk
.
FusedMoEActivationFormat
.
Standard
return
mk
.
FusedMoEActivationFormat
.
Standard
@
staticmethod
def
expects_unquantized_inputs
(
fused_moe_config
:
mk
.
FusedMoEConfig
,
quant_config
:
FusedMoEQuantConfig
)
->
bool
:
# AITER fused MoE kernels handle input quantization internally.
return
True
@
staticmethod
@
staticmethod
def
_supports_current_device
()
->
bool
:
def
_supports_current_device
()
->
bool
:
return
rocm_aiter_ops
.
is_fused_moe_enabled
()
return
rocm_aiter_ops
.
is_fused_moe_enabled
()
...
@@ -326,7 +329,7 @@ class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -326,7 +329,7 @@ class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute):
@
staticmethod
@
staticmethod
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
def
_supports_parallel_config
(
moe_parallel_config
:
FusedMoEParallelConfig
)
->
bool
:
return
not
moe_parallel_config
.
use_fi_all2allv_kernels
return
True
def
supports_expert_map
(
self
):
def
supports_expert_map
(
self
):
return
True
return
True
...
...
vllm/model_executor/layers/fused_moe/router/base_router.py
View file @
bc387d5a
...
@@ -229,7 +229,7 @@ class BaseRouter(FusedMoERouter):
...
@@ -229,7 +229,7 @@ class BaseRouter(FusedMoERouter):
# Step 3: Compute routing (delegated to subclass)
# Step 3: Compute routing (delegated to subclass)
topk_weights
,
topk_ids
=
self
.
_compute_routing
(
topk_weights
,
topk_ids
=
self
.
_compute_routing
(
hidden_states
,
router_logits
,
indices_type
,
hidden_states
,
router_logits
,
indices_type
)
)
# Step 4: Apply EPLB mapping
# Step 4: Apply EPLB mapping
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment