Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f9c069c8
Unverified
Commit
f9c069c8
authored
May 14, 2025
by
bnellnm
Committed by
GitHub
May 14, 2025
Browse files
Modularize fused experts and integrate PPLX kernels (#15956)
parent
418d2f8b
Changes
42
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1406 additions
and
124 deletions
+1406
-124
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+499
-52
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
+364
-0
vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
.../model_executor/layers/fused_moe/moe_permute_unpermute.py
+79
-11
vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
.../model_executor/layers/fused_moe/pplx_prepare_finalize.py
+147
-0
vllm/model_executor/layers/fused_moe/prepare_finalize.py
vllm/model_executor/layers/fused_moe/prepare_finalize.py
+60
-0
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+112
-0
vllm/model_executor/layers/fused_moe/utils.py
vllm/model_executor/layers/fused_moe/utils.py
+54
-5
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+56
-28
vllm/model_executor/models/dbrx.py
vllm/model_executor/models/dbrx.py
+0
-1
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+8
-6
vllm/model_executor/models/llama4.py
vllm/model_executor/models/llama4.py
+4
-4
vllm/model_executor/models/qwen2_moe.py
vllm/model_executor/models/qwen2_moe.py
+4
-5
vllm/model_executor/models/qwen3_moe.py
vllm/model_executor/models/qwen3_moe.py
+2
-4
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+1
-0
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+4
-2
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+2
-1
vllm/v1/worker/tpu_worker.py
vllm/v1/worker/tpu_worker.py
+2
-1
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+2
-1
vllm/worker/hpu_worker.py
vllm/worker/hpu_worker.py
+4
-2
vllm/worker/tpu_worker.py
vllm/worker/tpu_worker.py
+2
-1
No files found.
vllm/model_executor/layers/fused_moe/layer.py
View file @
f9c069c8
# SPDX-License-Identifier: Apache-2.0
import
importlib
import
threading
from
abc
import
abstractmethod
from
dataclasses
import
dataclass
from
enum
import
Enum
from
typing
import
Callable
,
Optional
from
weakref
import
WeakValueDictionary
import
torch
import
torch.nn.functional
as
F
from
torch.nn.parameter
import
UninitializedParameter
import
vllm.envs
as
envs
from
vllm.config
import
get_current_vllm_config
from
vllm.config
import
ParallelConfig
,
get_current_vllm_config
from
vllm.distributed
import
(
get_dp_group
,
get_ep_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
...
...
@@ -26,8 +30,17 @@ from vllm.platforms import current_platform
from
vllm.platforms.interface
import
CpuArchEnum
from
vllm.utils
import
direct_register_custom_op
has_pplx
=
importlib
.
util
.
find_spec
(
"pplx_kernels"
)
is
not
None
if
current_platform
.
is_cuda_alike
():
from
.fused_moe
import
fused_experts
from
.fused_batched_moe
import
(
BatchedPrepareAndFinalize
,
BatchedTritonExperts
)
from
.fused_moe
import
TritonExperts
,
fused_experts
from
.modular_kernel
import
(
FusedMoEModularKernel
,
FusedMoEPermuteExpertsUnpermute
,
FusedMoEPrepareAndFinalize
)
if
has_pplx
:
from
.pplx_prepare_finalize
import
PplxPrepareAndFinalize
else
:
fused_experts
=
None
# type: ignore
if
is_rocm_aiter_moe_enabled
():
...
...
@@ -42,6 +55,179 @@ else:
fused_moe_pallas
=
None
# type: ignore
logger
=
init_logger
(
__name__
)
# Note: this limit is somewhat arbitrary and might be changed later.
# The size of the activations will be E x MOE_DP_CHUNK_SIZE x hidden_dim.
MOE_DP_CHUNK_SIZE
=
256
@dataclass
class FusedMoEParallelConfig:
    """Sizes and ranks of the TP / DP / EP groups as seen by a fused MoE layer.

    Instances are normally produced by :meth:`make`.
    """
    tp_size: int
    dp_size: int
    ep_size: int
    tp_rank: int
    dp_rank: int
    ep_rank: int

    use_ep: bool  # whether to use EP or not

    @property
    def use_pplx_kernels(self):
        """Truthy only when dp_size > 1, EP is enabled and ``has_pplx`` is set."""
        has_multiple_dp_ranks = self.dp_size > 1
        return has_multiple_dp_ranks and self.use_ep and has_pplx

    @staticmethod
    def make(tp_size_: int, dp_size_: int,
             vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig":
        """
        Determine the MoE parallel configuration.

        Based on the input tp_size_, dp_size_ and vllm's parallel config,
        decide which levels of parallelism the fused MoE layer uses.

        Args:
            tp_size_ (int): tp_size passed into the FusedMoE constructor.
            dp_size_ (int): dp_size passed into the FusedMoE constructor.
            vllm_parallel_config (ParallelConfig): vllm's parallel config
                object; only ``enable_expert_parallel`` is read here.

        Expert parallelism is considered only when dp_size_ * tp_size_ > 1
        and ``enable_expert_parallel`` is set. Without EP, the tensor
        parallel group is flattened across all dp_size_ * tp_size_ devices.
        With EP, that flattened size/rank becomes the EP size/rank and TP
        collapses to {1, 0}.

        Examples (legend: {size, rank}):

        TP = 2, DP = 1, EP = False:
            - device 0: TP = {2, 0} DP = {1, 0} EP = {1, 0}
            - device 1: TP = {2, 1} DP = {1, 0} EP = {1, 0}
            - Comment: Tensors are sharded across 2 devices.

        TP = 1, DP = 2, EP = False:
            - device 0: TP = {2, 0} DP = {2, 0} EP = {1, 0}
            - device 1: TP = {2, 1} DP = {2, 1} EP = {1, 0}
            - Comment: There are 2 engine instances and the tensors are
              sharded across 2 devices.

        TP = 2, DP = 2, EP = False:
            - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
            - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
            - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
            - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
            - Comment: There are 2 engine instances and the tensors are
              sharded across 4 devices.

        TP = 2, DP = 1, EP = True:
            - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
            - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
            - Comment: The experts are split between the 2 devices.

        TP = 1, DP = 2, EP = True:
            - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
            - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
            - Comment: There are 2 engine instances and the experts are
              split between the 2 devices.

        TP = 2, DP = 2, EP = True:
            - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
            - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
            - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
            - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
            - Comment: There are 2 engine instances and the experts are
              split between the 4 devices.
        """
        # EP needs more than one device in total plus the global opt-in.
        wants_ep = (dp_size_ * tp_size_ > 1
                    and vllm_parallel_config.enable_expert_parallel)

        # Only touch the distributed groups when the dimension is non-trivial.
        rank_in_dp = 0
        if dp_size_ > 1:
            rank_in_dp = get_dp_group().rank_in_group

        rank_in_tp = 0
        if tp_size_ != 1:
            rank_in_tp = get_tensor_model_parallel_rank()

        # There are actually dp_size_ * tp_size_ devices; flatten TP across
        # DP so that sharding covers all of them.
        flat_size = dp_size_ * tp_size_
        flat_rank = rank_in_dp * tp_size_ + rank_in_tp

        if wants_ep:
            # DP + EP / TP + EP / DP + TP + EP: each device owns whole
            # experts, so there is no tensor parallelism inside the layer.
            return FusedMoEParallelConfig(tp_size=1,
                                          tp_rank=0,
                                          dp_size=dp_size_,
                                          dp_rank=rank_in_dp,
                                          ep_size=flat_size,
                                          ep_rank=flat_rank,
                                          use_ep=True)

        return FusedMoEParallelConfig(tp_size=flat_size,
                                      tp_rank=flat_rank,
                                      dp_size=dp_size_,
                                      dp_rank=rank_in_dp,
                                      ep_size=1,
                                      ep_rank=0,
                                      use_ep=False)
# Adapted from pplx-kernels tests/all_to_all_utils.py
@dataclass
class MoEConfig:
    """Static description of one MoE layer plus its parallel layout.

    The size/rank/flag properties below are read-only views onto
    ``moe_parallel_config`` so callers need not reach through it.
    """
    num_experts: int
    experts_per_token: int
    hidden_dim: int

    num_local_experts: int
    moe_parallel_config: FusedMoEParallelConfig

    in_dtype: torch.dtype  # The activation type.

    # TODO: add more quantization params, blocked, per-token, etc.
    block_size: int = 128

    @property
    def tp_size(self):
        """Tensor-parallel size taken from the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.tp_size

    @property
    def dp_size(self):
        """Data-parallel size taken from the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.dp_size

    @property
    def ep_size(self):
        """Expert-parallel size taken from the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.ep_size

    @property
    def tp_rank(self):
        """Tensor-parallel rank taken from the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.tp_rank

    @property
    def dp_rank(self):
        """Data-parallel rank taken from the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.dp_rank

    @property
    def ep_rank(self):
        """Expert-parallel rank taken from the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.ep_rank

    @property
    def use_ep(self):
        """Whether expert parallelism is in use, per the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.use_ep

    @property
    def use_pplx_kernels(self):
        """Whether the pplx kernels are selected, per the parallel config."""
        parallel = self.moe_parallel_config
        return parallel.use_pplx_kernels
class
FusedMoeWeightScaleSupported
(
Enum
):
TENSOR
=
"tensor"
...
...
@@ -58,6 +244,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
params_dtype
:
torch
.
dtype
,
**
extra_weight_attrs
):
raise
NotImplementedError
def set_prepare_finalize(
    self,
    dp_size: int,
    world_size: int,
    prepare_finalize: FusedMoEPrepareAndFinalize,
) -> bool:
    """Default hook for installing a prepare/finalize object on this method.

    The base implementation ignores all arguments and returns False.
    The FusedMoE constructor logs "DP+EP not supported" for this method
    type when it gets False back; subclasses that can use
    ``prepare_finalize`` override this and return True.
    """
    return False
@
abstractmethod
def
apply
(
self
,
...
...
@@ -80,12 +274,54 @@ class FusedMoEMethodBase(QuantizeMethodBase):
raise
NotImplementedError
class AllToAllCache:
    """Cache of pplx ``AllToAll`` objects keyed by their constructor kwargs.

    Values are held in a ``WeakValueDictionary``, so an entry disappears on
    its own once nothing else references the cached object. All access to
    the dictionary goes through a reentrant lock.
    """

    def __init__(self):
        self._cache: WeakValueDictionary = WeakValueDictionary()
        self._lock = threading.RLock()  # Reentrant lock for thread safety

    def destroy(self):
        """Call ``destroy()`` on every live cached object and empty the cache.

        The cache is cleared afterwards so that a later ``get_or_create``
        with the same kwargs builds a fresh object instead of handing back
        one that has already been destroyed.
        """
        with self._lock:
            # Snapshot first: holding strong references keeps the weak
            # dictionary stable while the objects are being torn down.
            live_instances = list(self._cache.values())
            for a2a in live_instances:
                a2a.destroy()
            self._cache.clear()

    def get_or_create(self, **kwargs):
        """Return the cached ``AllToAll`` for ``kwargs``, creating it if needed.

        ``kwargs`` are forwarded unchanged to ``pplx.AllToAll.internode``;
        every value must be hashable because the sorted items form the
        cache key.
        """
        assert has_pplx
        import pplx_kernels as pplx

        # Create a hashable key from the kwargs (keys are unique, so the
        # sort never has to compare values).
        key = tuple(sorted(kwargs.items()))

        with self._lock:
            instance = self._cache.get(key)
            if instance is None:
                # TODO (varun): Add support to switch to intranode
                # when all communications are within the same
                # node.
                logger.debug("Create AllToAll %s", kwargs)
                instance = pplx.AllToAll.internode(**kwargs)
                self._cache[key] = instance
            return instance
# Global singleton
_all_to_all_cache
=
AllToAllCache
()
# Factory function as a cleaner interface
def get_all_to_all(**kwargs):
    """Fetch (or lazily create) an AllToAll from the module-level cache.

    All keyword arguments are forwarded unchanged to
    ``_all_to_all_cache.get_or_create``, which also uses them as the
    cache key.
    """
    return _all_to_all_cache.get_or_create(**kwargs)
@
CustomOp
.
register
(
"unquantized_fused_moe"
)
class
UnquantizedFusedMoEMethod
(
FusedMoEMethodBase
,
CustomOp
):
"""MoE method without quantization."""
def
__init__
(
self
):
def
__init__
(
self
,
moe
:
MoEConfig
):
super
().
__init__
()
self
.
fused_experts
=
fused_experts
self
.
moe
=
moe
self
.
rocm_aiter_moe_enabled
=
is_rocm_aiter_moe_enabled
()
if
self
.
rocm_aiter_moe_enabled
:
...
...
@@ -193,6 +429,47 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
activation
=
activation
,
apply_router_weight_on_input
=
apply_router_weight_on_input
)
def set_prepare_finalize(
    self,
    dp_size: int,
    world_size: int,
    prepare_finalize: FusedMoEPrepareAndFinalize,
) -> bool:
    """Swap ``self.fused_experts`` for a modular kernel built around
    ``prepare_finalize``.

    Batched/pplx prepare-finalize objects are paired with
    ``BatchedTritonExperts``; anything else gets ``TritonExperts``. All
    quantization options are disabled because this method is unquantized.
    Always returns True.
    """
    # Must still be the plain module-level function, i.e. this hook has
    # not already replaced it.
    assert self.fused_experts == fused_experts

    # Quantization switches shared by both expert implementations.
    unquantized = dict(
        use_fp8_w8a8=False,
        use_int8_w8a8=False,
        use_int8_w8a16=False,
        use_int4_w4a16=False,
        block_shape=None,
    )

    wants_batched = isinstance(
        prepare_finalize,
        (BatchedPrepareAndFinalize, PplxPrepareAndFinalize))

    experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
    if not wants_batched:
        logger.debug("TritonExperts %s", self.moe)
        experts = TritonExperts(per_channel_quant=False, **unquantized)
    else:
        logger.debug("BatchedTritonExperts %s", self.moe)
        experts = BatchedTritonExperts(
            max_num_tokens=MOE_DP_CHUNK_SIZE,
            world_size=world_size,
            dp_size=dp_size,
            **unquantized,
        )

    self.fused_experts = FusedMoEModularKernel(prepare_finalize, experts)
    return True
def
forward_cuda
(
self
,
layer
:
torch
.
nn
.
Module
,
...
...
@@ -221,9 +498,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
num_expert_group
=
num_expert_group
,
custom_routing_function
=
custom_routing_function
,
scoring_func
=
scoring_func
,
e_score_correction_bias
=
e_score_correction_bias
)
e_score_correction_bias
=
e_score_correction_bias
,
indices_type
=
torch
.
uint32
if
self
.
moe
.
use_pplx_kernels
else
None
)
if
self
.
rocm_aiter_moe_enabled
:
assert
not
apply_router_weight_on_input
assert
expert_map
is
None
return
self
.
rocm_aiter_fused_experts
(
hidden_states
=
x
,
w1
=
layer
.
w13_weight
,
...
...
@@ -232,18 +512,19 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
topk_ids
=
topk_ids
,
activation
=
activation
,
apply_router_weight_on_input
=
apply_router_weight_on_input
)
return
fused_experts
(
hidden_states
=
x
,
w1
=
layer
.
w13_weight
,
w2
=
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
inplace
=
True
,
activation
=
activation
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
)
else
:
return
self
.
fused_experts
(
hidden_states
=
x
,
w1
=
layer
.
w13_weight
,
w2
=
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
inplace
=
True
,
activation
=
activation
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
,
)
def
forward_cpu
(
self
,
...
...
@@ -399,6 +680,45 @@ def determine_expert_map(
return
(
local_num_experts
,
expert_map
)
def _construct_prepare_finalize(
    moe: MoEConfig, quant_config: Optional[QuantizationConfig]
) -> Optional[FusedMoEPrepareAndFinalize]:
    """Build the prepare/finalize object for ``moe``, if one applies.

    Returns a ``PplxPrepareAndFinalize`` backed by a cached pplx AllToAll
    when ``moe.use_pplx_kernels`` is truthy, otherwise None.
    ``quant_config`` is currently not consulted.
    """
    tokens_per_chunk = MOE_DP_CHUNK_SIZE
    ep_world_size = moe.ep_size
    # dp_size actually means TP.
    ranks_per_dp_group = moe.ep_size // moe.dp_size
    ep_rank = moe.ep_rank

    if not moe.use_pplx_kernels:
        return None

    logger.debug("using PplxPrepareAndFinalize")

    # For blocked per token: set to
    #   ceil_div(hidden_dim, block_size) * sizeof(float32)
    # For per-token: set to sizeof(float32)
    # Only 1-byte activation dtypes carry scales; everything else uses 0.
    if moe.in_dtype.itemsize != 1:
        scale_bytes = 0
    else:
        num_blocks = (moe.hidden_dim + moe.block_size - 1) // moe.block_size
        scale_bytes = num_blocks * torch.float32.itemsize

    all_to_all = get_all_to_all(
        max_num_tokens=tokens_per_chunk,
        num_experts=moe.num_experts,
        experts_per_token=moe.experts_per_token,  # topk
        rank=ep_rank,
        world_size=ep_world_size,
        dp_size=ranks_per_dp_group,
        hidden_dim=moe.hidden_dim,
        hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
        hidden_dim_scale_bytes=scale_bytes,
    )

    return PplxPrepareAndFinalize(
        all_to_all,
        max_num_tokens=tokens_per_chunk,
        world_size=ep_world_size,
        rank=ep_rank,
        dp_size=ranks_per_dp_group,
        quant_dtype=moe.in_dtype,
    )
class
FusedMoE
(
torch
.
nn
.
Module
):
"""FusedMoE layer for MoE models.
...
...
@@ -449,21 +769,16 @@ class FusedMoE(torch.nn.Module):
params_dtype
=
torch
.
get_default_dtype
()
self
.
params_dtype
=
params_dtype
# Note: here we guard against accessing the TP and DP groups when
# uninitialized (this happens when testing)
self
.
tp_size
=
(
tp_size
if
tp_size
is
not
None
else
get_tensor_model_parallel_world_size
())
tp_rank
=
0
if
self
.
tp_size
==
1
else
get_tensor_model_parallel_rank
()
self
.
dp_size
=
(
dp_size
if
dp_size
is
not
None
else
get_dp_group
().
world_size
)
self
.
dp_rank
=
(
0
if
self
.
dp_size
==
1
else
get_dp_group
().
rank_in_group
)
self
.
global_num_experts
=
num_experts
# Use expert parallelism instead of tensor parallelism?
vllm_config
=
get_current_vllm_config
()
use_ep
=
(
vllm_config
.
parallel_config
.
enable_expert_parallel
and
self
.
tp_size
*
self
.
dp_size
>
1
)
self
.
moe_parallel_config
:
FusedMoEParallelConfig
=
(
FusedMoEParallelConfig
.
make
(
tp_size_
=
(
tp_size
if
tp_size
is
not
None
else
get_tensor_model_parallel_world_size
()),
dp_size_
=
(
dp_size
if
dp_size
is
not
None
else
get_dp_group
().
world_size
),
vllm_parallel_config
=
vllm_config
.
parallel_config
))
self
.
global_num_experts
=
num_experts
# For smuggling this layer into the fused moe custom op
self
.
use_direct_call
=
self
.
dp_size
==
1
...
...
@@ -474,28 +789,17 @@ class FusedMoE(torch.nn.Module):
compilation_config
.
static_forward_context
[
prefix
]
=
self
self
.
layer_name
=
prefix
if
use_ep
:
# Set TP size to 1 to adjust for EP and adjust EP size and rank
# for DP attention.
self
.
ep_rank
=
tp_rank
+
self
.
tp_size
*
self
.
dp_rank
self
.
tp_rank
=
0
self
.
ep_size
=
self
.
tp_size
*
self
.
dp_size
self
.
tp_size
=
1
# Determine expert maps
if
self
.
use_ep
:
self
.
local_num_experts
,
self
.
expert_map
=
determine_expert_map
(
ep_size
=
self
.
ep_size
,
ep_rank
=
self
.
ep_rank
,
global_num_experts
=
self
.
global_num_experts
)
else
:
# Adjust TP size for DP attention
self
.
tp_rank
=
tp_rank
+
self
.
tp_size
*
self
.
dp_rank
self
.
ep_rank
=
0
self
.
tp_size
=
self
.
tp_size
*
self
.
dp_size
self
.
ep_size
=
1
self
.
local_num_experts
=
self
.
global_num_experts
self
.
expert_map
=
None
self
.
local_num_experts
,
self
.
expert_map
=
(
self
.
global_num_experts
,
None
)
self
.
top_k
=
top_k
self
.
global_num_experts
=
num_experts
assert
intermediate_size
%
self
.
tp_size
==
0
self
.
hidden_size
=
hidden_size
...
...
@@ -520,14 +824,40 @@ class FusedMoE(torch.nn.Module):
from
vllm_hpu_extension.ops
import
DynamicFusedMOE
self
.
hpu_fused_moe
=
DynamicFusedMOE
(
self
.
global_num_experts
)
moe
=
MoEConfig
(
num_experts
=
self
.
global_num_experts
,
experts_per_token
=
top_k
,
hidden_dim
=
hidden_size
,
num_local_experts
=
self
.
local_num_experts
,
moe_parallel_config
=
self
.
moe_parallel_config
,
# TODO (bnell): this needs to be fixed for quantized types.
in_dtype
=
params_dtype
,
)
# Note: get_quant_method will look at the layer's local_num_experts
# for heuristic purposes, so it must be initialized first.
quant_method
:
Optional
[
QuantizeMethodBase
]
=
None
if
quant_config
is
None
:
self
.
quant_method
:
Optional
[
QuantizeMethodBase
]
=
(
UnquantizedFusedMoEMethod
()
)
quant_method
=
UnquantizedFusedMoEMethod
(
moe
)
prepare_finalize
=
_construct_prepare_finalize
(
moe
,
quant_config
)
else
:
self
.
quant_method
=
quant_config
.
get_quant_method
(
self
,
prefix
)
assert
self
.
quant_method
is
not
None
quant_method
=
quant_config
.
get_quant_method
(
self
,
prefix
)
# No pplx for quantized types yet.
prepare_finalize
=
None
assert
quant_method
is
not
None
assert
isinstance
(
quant_method
,
FusedMoEMethodBase
)
self
.
quant_method
=
quant_method
if
prepare_finalize
is
not
None
:
world_size
=
moe
.
ep_size
dp_size
=
int
(
moe
.
ep_size
//
moe
.
dp_size
)
success
=
self
.
quant_method
.
set_prepare_finalize
(
dp_size
,
world_size
,
prepare_finalize
)
if
not
success
:
logger
.
warning
(
"DP+EP not supported for %s."
,
type
(
self
.
quant_method
))
moe_quant_params
=
{
"num_experts"
:
self
.
local_num_experts
,
...
...
@@ -546,6 +876,38 @@ class FusedMoE(torch.nn.Module):
self
.
quant_method
.
create_weights
(
layer
=
self
,
**
moe_quant_params
)
@property
def tp_size(self):
    """Tensor-parallel size, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.tp_size
@property
def dp_size(self):
    """Data-parallel size, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.dp_size
@property
def ep_size(self):
    """Expert-parallel size, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.ep_size
@property
def tp_rank(self):
    """Tensor-parallel rank, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.tp_rank
@property
def dp_rank(self):
    """Data-parallel rank, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.dp_rank
@property
def ep_rank(self):
    """Expert-parallel rank, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.ep_rank
@property
def use_ep(self):
    """Whether expert parallelism is on, read from ``self.moe_parallel_config``."""
    return self.moe_parallel_config.use_ep
@property
def use_pplx_kernels(self):
    """Whether the pplx kernels are selected, read from
    ``self.moe_parallel_config``."""
    return self.moe_parallel_config.use_pplx_kernels
def
_load_per_tensor_weight_scale
(
self
,
shard_id
:
str
,
param
:
torch
.
nn
.
Parameter
,
loaded_weight
:
torch
.
Tensor
,
...
...
@@ -830,7 +1192,8 @@ class FusedMoE(torch.nn.Module):
num_expert_group
:
Optional
[
int
]
=
None
,
custom_routing_function
:
Optional
[
Callable
]
=
None
,
scoring_func
:
str
=
"softmax"
,
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
):
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
indices_type
:
Optional
[
torch
.
dtype
]
=
None
):
from
vllm.model_executor.layers.fused_moe.fused_moe
import
fused_topk
# DeekSeekv2 uses grouped_top_k
...
...
@@ -846,21 +1209,52 @@ class FusedMoE(torch.nn.Module):
topk_group
=
topk_group
,
scoring_func
=
scoring_func
,
e_score_correction_bias
=
e_score_correction_bias
)
if
indices_type
is
not
None
:
topk_ids
=
topk_ids
.
to
(
dtype
=
indices_type
)
elif
custom_routing_function
is
None
:
topk_weights
,
topk_ids
,
token_expert_indices
=
fused_topk
(
hidden_states
=
hidden_states
,
gating_output
=
router_logits
,
topk
=
top_k
,
renormalize
=
renormalize
)
renormalize
=
renormalize
,
indices_type
=
indices_type
,
)
else
:
topk_weights
,
topk_ids
=
custom_routing_function
(
hidden_states
=
hidden_states
,
gating_output
=
router_logits
,
topk
=
top_k
,
renormalize
=
renormalize
)
if
indices_type
is
not
None
:
topk_ids
=
topk_ids
.
to
(
dtype
=
indices_type
)
return
topk_weights
,
topk_ids
def must_reduce_shared_expert_outputs(self) -> bool:
    """
    Tell the caller whether shared-expert outputs must be reduced early.

    The shared_experts are typically computed using the RowParallelLinear
    layer. The result of this function is typically used as
    the reduce_results argument to the module.
    When just tensor-parallel is used, it is not required to reduce
    the shared_experts results immediately. Instead we reduce once at
    the end of the MoE op. (Refer to DeepSeekV2MoE module)
    With EP and the pplx kernels this is no longer viable, as all
    GPU ranks in DP produce the complete set of hidden_states.
    Therefore it is required that we reduce the shared_experts output
    early.

    Returns:
        The value of ``self.use_pplx_kernels``.
    """
    return self.use_pplx_kernels
def maybe_all_reduce_tensor_model_parallel(
        self, final_hidden_states: torch.Tensor):
    """
    All-reduce ``final_hidden_states`` across the tensor-parallel group
    unless the pplx kernels are in use.

    The pplx combine kernel reduces across GPU ranks by default, so in
    that case the tensor is handed back untouched.
    """
    if not self.use_pplx_kernels:
        return tensor_model_parallel_all_reduce(final_hidden_states)
    return final_hidden_states
def
forward
(
self
,
hidden_states
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
):
if
self
.
use_direct_call
:
...
...
@@ -869,9 +1263,62 @@ class FusedMoE(torch.nn.Module):
return
torch
.
ops
.
vllm
.
moe_forward
(
hidden_states
,
router_logits
,
self
.
layer_name
)
def forward_impl_chunked(self, full_hidden_states: torch.Tensor,
                         full_router_logits: torch.Tensor):
    """Run the MoE layer over the input in chunks of MOE_DP_CHUNK_SIZE rows.

    The number of chunks is driven by ``max_tokens_across_dp_cpu`` from the
    forward context's DP metadata, not by this rank's own token count.
    Chunks that lie past the local token count are still pushed through
    ``quant_method.apply`` but their result is discarded.
    NOTE(review): presumably this keeps every DP rank issuing the same
    number of dispatch/combine calls — confirm against the pplx kernels.

    Returns a tensor shaped like ``full_hidden_states`` holding the
    per-chunk outputs.
    """

    # Output buffer; each processed chunk copies its result into its slice.
    full_final_hidden_states = torch.empty_like(full_hidden_states)

    def process_chunk(chunk_start, chunk_end, skip_result_store=False):
        # Row-slices of the full inputs for this chunk.
        hidden_states = full_hidden_states[chunk_start:chunk_end, :]
        router_logits = full_router_logits[chunk_start:chunk_end, :]

        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
            layer=self,
            x=hidden_states,
            router_logits=router_logits,
            top_k=self.top_k,
            renormalize=self.renormalize,
            use_grouped_topk=self.use_grouped_topk,
            global_num_experts=self.global_num_experts,
            expert_map=self.expert_map,
            topk_group=self.topk_group,
            num_expert_group=self.num_expert_group,
            custom_routing_function=self.custom_routing_function,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
            activation=self.activation,
        )

        # Padding chunks (past the local token count) are computed but
        # never written back.
        if not skip_result_store:
            full_final_hidden_states[chunk_start:chunk_end, :].copy_(
                final_hidden_states)

    ctx = get_forward_context()
    max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
    moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE

    num_tokens = full_hidden_states.size(0)
    for chunk_start_ in range(0, max_tokens_across_dp,
                              moe_dp_chunk_size_per_rank):
        chunk_start = chunk_start_
        chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank,
                        max_tokens_across_dp)
        # clamp start and end
        # Once chunk_start_ is past the local tokens, the clamped range is
        # [num_tokens - 1, num_tokens): the last local row is re-fed and
        # its result is skipped via skip_result_store below.
        chunk_start = min(chunk_start, num_tokens - 1)
        chunk_end = min(chunk_end, num_tokens)

        process_chunk(chunk_start,
                      chunk_end,
                      skip_result_store=chunk_start_ >= num_tokens)

    return full_final_hidden_states
def
forward_impl
(
self
,
hidden_states
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
):
assert
self
.
quant_method
is
not
None
if
self
.
moe_parallel_config
.
use_pplx_kernels
:
return
self
.
forward_impl_chunked
(
hidden_states
,
router_logits
)
if
self
.
dp_size
>
1
:
hidden_states
,
router_logits
=
get_ep_group
().
dispatch
(
...
...
vllm/model_executor/layers/fused_moe/modular_kernel.py
0 → 100644
View file @
f9c069c8
# SPDX-License-Identifier: Apache-2.0
from
abc
import
ABC
,
abstractmethod
from
typing
import
Optional
import
torch
#
# This file defines a set of base classes used to make MoE kernels more modular.
# The goal is to be able to utilize different communication mechanisms with
# any fused MoE kernel without needing to have combinatoric implementations.
#
# The fused moe kernels are broken down into the following components:
#
# [Router] → [Quantize-Dispatch] → [Permute-Experts-Unpermute] → [Combine]
#
# Each component will be independent of the others except for
# [Quantize-Dispatch] and [Combine] (see below). The components can then be
# mixed and matched so that DP+EP can be supported easily for multiple
# MoE kernel implementations.
#
# The following main classes are defined:
# * FusedMoEPrepareAndFinalize - an abstract base class for preparation of MoE
# inputs (e.g. quantization, distribution) and finalization of Moe outputs.
# The prepare method must take care of any needed quantization and the
# finalize method must apply weights and do the final reduction of the output.
# * FusedMoEPermuteExpertsUnpermute - an abstract base class for the main fused
# MoE operation. One important feature to note is that this class does not
# apply topk weights or reduce the final output.
# * FusedMoEModularKernel - an interface class that combines a
# FusedMoEPrepareAndFinalize and a FusedMoEPermuteExpertsUnpermute to
# provide the standard fused MoE kernel interface.
#
# The [Quantize-Dispatch] and [Combine] steps (the "prepare" and "finalize"
# halves) are bundled into a single class `FusedMoEPrepareAndFinalize` since
# they could use collective communication mechanisms that need to be
# consistent.
#
def _moe_problem_size(
    a1: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_ids: torch.Tensor,
) -> tuple[int, int, int, int, int]:
    """
    Extract the MoE problem size from the given tensor arguments:
    - a1: The hidden states, input to the MoE layer. Either 2-D
      (one row per token) or 3-D (leading dimension equal to the number
      of experts in w1).
    - w1: The first set of expert weights (3-D).
    - w2: The second set of expert weights (3-D).
    - topk_ids: The topk ids (2-D).

    Returns the tuple (E, M, N, K, topk) where E and N are the first two
    dimensions of w1, K is dimension 1 of w2, M is a1.size(0) for 2-D a1
    or a1.size(1) for 3-D a1, and topk is topk_ids.size(1).

    Raises AssertionError when any of the rank/shape checks fails.

    Note: extracting the problem shape from the weight and activation tensors is
    not obvious.  It needs to be done this way specifically due to subtle issues
    with particular kernels, e.g. the int4 kernels divide the trailing dimension
    by two, so it's not "correct" to extract N or K from the trailing dimension
    of w1 or w2.  Similarly, some kernels transpose the weights, so this needs
    to be kept in mind.
    """
    assert w1.dim() == 3 and w2.dim() == 3
    E, N, _ = w1.size()
    K = w2.size(1)

    if a1.dim() == 2:
        # Make sure we are using the correct a1 (pre-permute).
        assert topk_ids.size(0) == a1.size(0), \
            f"{topk_ids.size(0)} != {a1.size(0)}"
        M = a1.size(0)
    else:
        assert a1.dim() == 3
        # The message reports the mismatch, so it reads "!=" like the
        # 2-D branch above.
        assert a1.size(0) == E, f"{a1.size(0)} != {E}"
        M = a1.size(1)  # This is max_num_tokens

    assert topk_ids.dim() == 2
    topk = topk_ids.size(1)

    return E, M, N, K, topk
class FusedMoEPrepareAndFinalize(ABC):
    """
    An abstract base class for the [Quantize-Prepare] and [Finalize] steps
    described above.

    Subclasses must implement both ``prepare`` and ``finalize``.
    """

    @abstractmethod
    def prepare(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
        apply_router_weight_on_input: bool,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform any quantization (and/or) dispatching needed
        for this kernel.
        - a1: The (unquantized) input to the MoE layer.
        - a1_scale: Optional scales for a1
        - a2_scale: Optional scales for the second MoE gemm.  Required to make
          sure the quantization is consistent for both gemms.
        - topk_ids: The topk ids.
        - topk_weights: The topk weights.
        - num_experts: The total number of experts in the global expert space.
        - expert_map: A tensor mapping expert indices from the global expert
          space to the local expert space of the expert parallel shard.
        - apply_router_weight_on_input: When True, apply the weights to the
          activations, before quantization + dispatching.

        Returns a tuple of:
        - quantized + dispatched a.
        - quantized + dispatched a1_scales.
        - NOTE(review): the annotated return type has a third optional
          tensor that is not described here; presumably the per-expert
          token counts consumed as `expert_num_tokens` by
          FusedMoEPermuteExpertsUnpermute.apply — confirm.
        """
        raise NotImplementedError

    @abstractmethod
    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Perform any combine plus apply weights and perform a reduction on the
        fused experts output.
        - output: The output tensor, written in place.  Must be (M, K) shape.
        - fused_expert_output: The unweighted, unreduced output of the fused
          experts, it will have (M, topk, K) shape.
        - topk_weights: The weights to be applied to the fused_experts_output.
        - topk_ids: The topk_ids.
        - apply_router_weight_on_input: When False, apply the weights to
          fused_expert_output.

        Returns nothing; the result is delivered through ``output``.
        """
        raise NotImplementedError
class FusedMoEPermuteExpertsUnpermute(ABC):
    """
    Abstract interface for the [Permute-Experts-Unpermute] step of the
    modular MoE pipeline described at the top of this file.
    """

    @abstractmethod
    def workspace_shapes(
        self,
        a: torch.Tensor,
        M: int,
        N: int,
        K: int,
        topk: int,
        num_experts: int,
    ) -> tuple[int, int, torch.dtype]:
        """
        Report how much scratch space the fused expert function needs for
        the outputs of its two gemms and its activation. Because the gemms
        are independent, the first and last gemm can share one workspace.

        Returns a tuple of:
        - Number of workspace13 elements: must be large enough to hold the
          result of either expert gemm.
        - Number of workspace2 elements: must be large enough to hold the
          result of the activation function.
        - Workspace type: The dtype to use for the workspace tensors.
        """
        raise NotImplementedError

    def activation(self, activation: str, output: torch.Tensor,
                   input: torch.Tensor) -> None:
        """
        Apply the named gated activation ("silu" or "gelu") to ``input``,
        writing into ``output``.

        The last dimension of ``input`` must be exactly twice that of
        ``output``. Raises ValueError for any other activation name.
        """
        out_width = output.size(-1)
        in_width = input.size(-1)
        assert out_width * 2 == in_width

        if activation == "silu":
            torch.ops._C.silu_and_mul(output, input)
            return
        if activation == "gelu":
            torch.ops._C.gelu_and_mul(output, input)
            return
        raise ValueError(f"Unsupported FusedMoe activation: {activation}")

    @abstractmethod
    def apply(
        self,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: Optional[torch.Tensor],
        w1_scale: Optional[torch.Tensor],
        w2_scale: Optional[torch.Tensor],
        w1_zp: Optional[torch.Tensor],
        w2_zp: Optional[torch.Tensor],
        a1q_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_num_tokens: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """
        Compute the intermediate result of a Mixture of Experts (MoE) layer
        from two sets of expert weights, w1 and w2.

        Parameters:
        - hidden_states: (torch.Tensor): The (quantized) input tensor to the
          MoE layer.
        - w1 (torch.Tensor): The first set of expert weights.
        - w2 (torch.Tensor): The second set of expert weights.
        - topk_ids (torch.Tensor): A map of row to expert id.
        - activation (str): The activation function to apply after the first
          MoE layer.
        - global_num_experts (int): The total number of experts in the
          global expert space.
        - expert_map (Optional[torch.Tensor]): A tensor mapping expert
          indices from the global expert space to the local expert space of
          the expert parallel shard.
        - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
          w1.
        - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
          w2.
        - w1_zp (Optional[torch.Tensor]): Optional zero points to be used
          for w1.
        - w2_zp (Optional[torch.Tensor]): Optional zero points to be used
          for w2.
        - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be
          used for a1.
        - a2_scale (Optional[torch.Tensor]): Optional scale to be used for
          a2.
        - workspace13 (torch.Tensor): A scratch tensor used for gemm
          outputs; must be large enough to hold the output of either MoE
          gemm.
        - workspace2 (torch.Tensor): A scratch tensor used for the
          activation function.
        - expert_num_tokens: An optional tensor containing the number of
          tokens assigned to each expert when using batched experts format
          input.

        Returns:
        - torch.Tensor: The unweighted, unreduced output tensor
        """
        raise NotImplementedError
class FusedMoEModularKernel(torch.nn.Module):
    """
    Glue module pairing a FusedMoEPrepareAndFinalize instance with a
    FusedMoEPermuteExpertsUnpermute instance behind an interface compatible
    with the `fused_experts` function in fused_moe.py. Scratch space needed
    by the experts implementation is allocated here.

    Note: use one instance per model layer only, because the component
    objects may carry layer specific state.
    """

    def __init__(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalize,
        fused_experts: FusedMoEPermuteExpertsUnpermute,
    ):
        super().__init__()
        self.prepare_finalize = prepare_finalize
        self.fused_experts = fused_experts

    def forward(
        self,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        inplace: bool = False,
        activation: str = "silu",
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        w1_scale: Optional[torch.Tensor] = None,
        w2_scale: Optional[torch.Tensor] = None,
        w1_zp: Optional[torch.Tensor] = None,
        w2_zp: Optional[torch.Tensor] = None,
        a1_scale: Optional[torch.Tensor] = None,
        a2_scale: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
    ) -> torch.Tensor:
        """
        Run a full Mixture of Experts layer: prepare the input, apply the
        experts using w1 and w2, then finalize into the output tensor.

        Parameters:
        - hidden_states (torch.Tensor): Input tensor to the MoE layer.
        - w1 (torch.Tensor): First set of expert weights.
        - w2 (torch.Tensor): Second set of expert weights.
        - topk_weights (torch.Tensor): Top-k weights applied at the end of
          the layer.
        - topk_ids (torch.Tensor): Map of row to expert id.
        - inplace (bool): When True the result is written into
          `hidden_states`; otherwise a zero-initialised tensor of the same
          shape is used. Defaults to False.
        - activation (str): Activation applied after the first MoE gemm.
        - global_num_experts (int): Total number of experts in the global
          expert space; -1 means "use the expert count derived from the
          problem size".
        - expert_map (Optional[torch.Tensor]): Mapping of expert indices
          from the global expert space to the local expert space of the
          expert parallel shard.
        - w1_scale (Optional[torch.Tensor]): Optional scale for w1.
        - w2_scale (Optional[torch.Tensor]): Optional scale for w2.
        - w1_zp (Optional[torch.Tensor]): Optional zero points for w1.
        - w2_zp (Optional[torch.Tensor]): Optional zero points for w2.
        - a1_scale (Optional[torch.Tensor]): Optional scale for a1.
        - a2_scale (Optional[torch.Tensor]): Optional scale for a2.
        - apply_router_weight_on_input (bool): When true, the top-k weights
          are applied directly on the inputs. Only applicable when topk
          is 1.

        Returns:
        - torch.Tensor: The output tensor after applying the MoE layer.
        """
        num_experts, num_rows, n_dim, k_dim, topk = _moe_problem_size(
            hidden_states, w1, w2, topk_ids)

        # -1 is the "not specified" sentinel for the global expert count.
        if global_num_experts == -1:
            global_num_experts = num_experts

        if inplace:
            output = hidden_states
        else:
            output = torch.zeros_like(hidden_states)

        ws13_shape, ws2_shape, ws_dtype = self.fused_experts.workspace_shapes(
            hidden_states, num_rows, n_dim, k_dim, topk, global_num_experts)

        # workspace13 backs both the first and the last gemm output: the
        # first result is no longer needed by the time the last is written.
        target_device = hidden_states.device
        workspace13 = torch.zeros(ws13_shape,
                                  device=target_device,
                                  dtype=ws_dtype)
        workspace2 = torch.zeros(ws2_shape,
                                 device=target_device,
                                 dtype=ws_dtype)

        prepared = self.prepare_finalize.prepare(
            hidden_states,
            a1_scale,
            a2_scale,
            topk_weights,
            topk_ids,
            global_num_experts,
            expert_map,
            apply_router_weight_on_input,
        )
        a1q, a1q_scale, expert_num_tokens = prepared

        experts_out = self.fused_experts.apply(
            a1q,
            w1,
            w2,
            topk_ids,
            activation=activation,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            w1_zp=w1_zp,
            w2_zp=w2_zp,
            a1q_scale=a1q_scale,
            a2_scale=a2_scale,
            workspace13=workspace13,
            workspace2=workspace2,
            expert_num_tokens=expert_num_tokens,
        )

        # finalize writes its result into `output` in place.
        self.prepare_finalize.finalize(
            output,
            experts_out,
            topk_weights,
            topk_ids,
            apply_router_weight_on_input,
        )
        return output
vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
View file @
f9c069c8
...
...
@@ -3,6 +3,74 @@ from typing import Optional
import
torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.fused_moe.moe_align_block_size
import
(
moe_align_block_size
)
from
vllm.model_executor.layers.fused_moe.utils
import
_fp8_perm
def _moe_permute(
    curr_hidden_states: torch.Tensor,
    a1q_scale: Optional[torch.Tensor],
    curr_topk_ids: torch.Tensor,
    global_num_experts: int,
    expert_map: Optional[torch.Tensor],
    block_m: int,
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
           Optional[torch.Tensor]]:
    """
    Determine the sorted_token_ids, expert_ids for the given problem size.
    Permute the hidden states and scales according to `sorted_token_ids`.

    Parameters:
    - curr_hidden_states (torch.Tensor): Activations to permute; dimension 0
      is the token dimension.
    - a1q_scale (Optional[torch.Tensor]): Optional activation scales,
      permuted with the same row indices as the hidden states.
    - curr_topk_ids (torch.Tensor): Expert ids per token; dimension 1 is
      the top-k dimension.
    - global_num_experts (int): Forwarded to `moe_align_block_size`.
    - expert_map (Optional[torch.Tensor]): Forwarded to
      `moe_align_block_size`.
    - block_m (int): Block size used for alignment and for expanding
      `expert_ids` to one entry per row.

    Returns a tuple of:
    - the permuted hidden states,
    - the permuted scales (None when `a1q_scale` is None),
    - the clamped `sorted_token_ids`,
    - `expert_ids` repeated `block_m` times per entry,
    - `inv_perm`, the first `top_k * num_tokens` entries of the argsort of
      `sorted_token_ids`.
    """
    top_k_num = curr_topk_ids.size(1)
    # Fixed: the method is `size`, not `sizze` (raised AttributeError).
    tokens_in_chunk = curr_hidden_states.size(0)

    sorted_token_ids, expert_ids, _ = moe_align_block_size(
        curr_topk_ids,
        block_m,
        global_num_experts,
        expert_map,
        pad_sorted_ids=True)

    num_tokens = top_k_num * tokens_in_chunk
    # Keep every id a valid index into the (token, top-k) pairs; ids at or
    # above num_tokens are presumably padding slots -- TODO confirm.
    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
    expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
    inv_perm: Optional[torch.Tensor] = torch.argsort(
        sorted_token_ids)[:num_tokens]

    # Each (token, top-k) pair id maps back to its source row by integer
    # division with top_k.
    source_rows = sorted_token_ids // top_k_num

    # Permute according to sorted token ids.
    curr_hidden_states = _fp8_perm(curr_hidden_states, source_rows)

    if a1q_scale is not None:
        a1q_scale = a1q_scale[source_rows]

    return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids,
            inv_perm)
def _moe_unpermute_and_reduce(
    out: torch.Tensor,
    curr_hidden: torch.Tensor,
    inv_perm: Optional[torch.Tensor],
    topk_weight: torch.Tensor,
    apply_router_weight_on_input: bool,
) -> None:
    """
    Undo the expert permutation, scale by the top-k weights (unless they
    were already applied on the input) and sum the result into `out`.

    Parameters:
    - out (torch.Tensor): Destination passed to `ops.moe_sum`.
    - curr_hidden (torch.Tensor): Expert outputs; viewed as
      (-1, topk, K) where K is its last dimension.
    - inv_perm (Optional[torch.Tensor]): Row indices used to unpermute
      `curr_hidden`; None means the rows are used as-is.
    - topk_weight (torch.Tensor): 2-D tensor of router weights, (M, topk).
    - apply_router_weight_on_input (bool): When True the weights are not
      applied here.
    """
    num_rows, topk = topk_weight.size()
    hidden_dim = curr_hidden.size(-1)

    unpermuted = curr_hidden
    if inv_perm is not None:
        unpermuted = unpermuted[inv_perm, ...]

    per_expert = unpermuted.view(-1, topk, hidden_dim)

    if not apply_router_weight_on_input:
        # In-place scale; broadcasts one weight across the hidden dimension.
        per_expert.mul_(topk_weight.view(num_rows, -1, 1))

    ops.moe_sum(per_expert, out)
def
moe_permute
(
hidden_states
:
torch
.
Tensor
,
...
...
@@ -17,21 +85,21 @@ def moe_permute(
fill_invalid_expert
:
int
=
-
1
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
,
torch
.
Tensor
]:
"""
This function expands and permutes activation to gather uncontinuous tokens
This function expands and permutes activation to gather uncontinuous tokens
for each expert.
Parameters:
- hidden_states (torch.Tensor): The input tensor to the MoE layer.
- hidden_states (torch.Tensor): The input tensor to the MoE layer.
- topk_weights (torch.Tensor): topk expert route weight for each token.
- topk_ids (torch.Tensor): topk expert route id for each token.
- token_expert_indices (torch.Tensor): indice for expanded hidden.
- topk (int): The number of top-k experts to select.
- n_expert (int): The number of expert.
- n_local_expert (int): The number of expert in current EP rank.
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
from the global expert space to the local expert space of the expert
- expert_map (Optional[torch.Tensor]): A tensor mapping expert indices
from the global expert space to the local expert space of the expert
parallel shard.
- align_block_size (Optional[int]): align group gemm block size for deepgemm
- fill_invalid_expert(int): fill expert id in m_indices for invalid expert
- fill_invalid_expert(int): fill expert id in m_indices for invalid expert
to workaround DeepGemm unsupported -1 in m_indices
Returns:
- permuted_hidden_states (torch.Tensor): permuted activation.
...
...
@@ -39,10 +107,10 @@ def moe_permute(
of each expert for standard grouped gemm. if enable 'align_block_size'
expert_first_token_offset will align up to 'align_block_size'.
- src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute.
- m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
- m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records
the group which the j-th row of the LHS belong to.`
"""
n_token
,
n_hidden
=
hidden_states
.
s
hape
n_token
,
n_hidden
=
hidden_states
.
s
ize
()
assert
(
n_hidden
*
hidden_states
.
element_size
()
)
%
16
==
0
,
"permue kernel need hidden dim align to 16B"
permuted_row_size
=
n_token
*
topk
...
...
@@ -87,7 +155,7 @@ def moe_unpermute(
n_local_expert
:
int
,
)
->
torch
.
Tensor
:
"""
This function expands and permutes activation to gathering uncontinuous
This function expands and permutes activation to gathering uncontinuous
tokens for each expert.
Parameters:
- permuted_hidden_states (torch.Tensor): permuted activation.
...
...
@@ -99,10 +167,10 @@ def moe_unpermute(
- n_expert (int): The number of expert.
- n_local_expert (int): The number of expert in current EP rank.
Returns:
- hidden_states (torch.Tensor): The reduced and unpermuted activation
tensor.
- hidden_states (torch.Tensor): The reduced and unpermuted activation
tensor.
"""
n_token
,
n_hidden
=
topk_weights
.
s
hape
[
0
]
,
permuted_hidden_states
.
s
hape
[
-
1
]
n_token
,
n_hidden
=
topk_weights
.
s
ize
(
0
)
,
permuted_hidden_states
.
s
ize
(
-
1
)
assert
(
n_hidden
*
permuted_hidden_states
.
element_size
()
)
%
16
==
0
,
"unpermue kernel need hidden dim align to 16B"
hidden_states
=
torch
.
empty
((
n_token
,
n_hidden
),
...
...
vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
0 → 100644
View file @
f9c069c8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
pplx_kernels
as
pplx
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.model_executor.layers.fused_moe.utils
import
(
moe_kernel_quantize_input
)
# Note: use layer.get_all_to_all() to get an AllToAll instance.
# The max_num_tokens, world_size and dp_size must be the same
# as the ones used to create the AllToAll.
class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
    """
    Prepare/finalize stage built on a pplx AllToAll object: `prepare`
    quantizes the activations and calls `a2a.dispatch`, `finalize` calls
    `a2a.combine`.
    """

    def __init__(self,
                 a2a: pplx.AllToAll,
                 max_num_tokens: int,
                 world_size: int,
                 rank: int,
                 dp_size: int,
                 quant_dtype: Optional[torch.dtype] = None,
                 block_shape: Optional[list[int]] = None):
        super().__init__()
        assert max_num_tokens > 0
        self.a2a = a2a
        self.max_num_tokens = max_num_tokens
        self.world_size = world_size
        self.rank = rank
        self.dp_size = dp_size
        self.quant_dtype = quant_dtype
        self.block_shape = block_shape

    def prepare(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        rank_topk_weights: torch.Tensor,
        rank_topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
        apply_router_weight_on_input: bool,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Quantize `a1` and dispatch it into freshly allocated per-expert
        buffers.

        `expert_map` is accepted but not consulted by this implementation.

        Returns a tuple of (expert_x, expert_x_scale, expert_num_tokens):
        the dispatched activations, their scales (None unless the quantized
        dtype is one byte wide) and the per-local-expert token counts
        buffer filled by `dispatch`.
        """
        num_tokens = a1.size(0)  # M
        hidden_dim = a1.size(-1)  # K
        assert rank_topk_ids.size(0) == num_tokens

        # All buffers are created on the input's device.
        device = a1.device

        if apply_router_weight_on_input:
            topk = rank_topk_ids.size(1)
            # TODO: this only works for topK=1, will need to update for topK>1
            assert topk == 1, (
                "apply_router_weight_on_input is only implemented for topk=1")
            # Out-of-place multiply: the caller's tensor is left untouched.
            a1 = a1 * rank_topk_weights.to(a1.dtype)

        # A scale with more than one element is treated as per-token;
        # a1_scale takes precedence over a2_scale when both are given.
        if a1_scale is not None:
            per_act_token = a1_scale.numel() != 1
        elif a2_scale is not None:
            per_act_token = a2_scale.numel() != 1
        else:
            per_act_token = False

        a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale,
                                                   self.quant_dtype,
                                                   per_act_token,
                                                   self.block_shape)

        # rem_experts need to be 0 for pplx to work properly.
        rem_experts = num_experts % self.world_size
        assert rem_experts == 0
        extra_expert = 1 if self.rank < rem_experts else 0
        num_local_experts = (num_experts // self.world_size) + extra_expert

        expert_num_tokens = torch.empty(
            num_local_experts,
            dtype=torch.int32,
            device=device,
        )

        num_dp = self.world_size // self.dp_size
        expert_x = torch.empty(
            (num_local_experts, self.max_num_tokens * num_dp, hidden_dim),
            dtype=a1q.dtype,
            device=device,
        )

        expert_x_scale: Optional[torch.Tensor] = None
        # Only one-byte quantized dtypes get a scale buffer.
        if a1q.dtype.itemsize == 1:
            float32_size = torch.float32.itemsize
            if self.block_shape is not None:
                block_elems = self.block_shape[0]
            else:
                block_elems = 1
            block_size = block_elems * float32_size
            scale_cols = (expert_x.size(2) + block_size - 1) // block_size
            expert_x_scale = torch.empty(
                (num_experts, expert_x.size(1), scale_cols),
                dtype=torch.float32,
                device=device,
            )

        # This argument is optional, defaults to indices.size(0)
        # There's not much point setting this unless it is != indices.size(0)
        bound_m: Optional[torch.Tensor] = None

        self.a2a.dispatch(
            out_expert_num_tokens=expert_num_tokens,
            out_expert_x=expert_x,
            out_expert_x_scale=expert_x_scale,
            dp_x=a1q,
            dp_x_scale=a1q_scale,
            indices=rank_topk_ids,
            bound_m=bound_m,
        )

        return expert_x, expert_x_scale, expert_num_tokens

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Combine the expert outputs into `output` via `a2a.combine`.

        When the router weights were already applied on the input, weights
        of one are passed to `combine` instead of `topk_weights`.
        """
        num_tokens = output.size(0)  # M

        # This argument is optional
        # There's not much point setting this unless it is != topk_ids.size(0)
        bound_m: Optional[torch.Tensor] = None

        assert topk_ids.size(0) == num_tokens, (
            f"{topk_ids.size(0)} == {num_tokens}")
        assert output.size(0) <= self.max_num_tokens, (
            f"{output.size(0)} <= {self.max_num_tokens}")
        assert output.size(1) == fused_expert_output.size(-1)

        # Set weights to 1 if we did them in dispatch. This is hacky.
        combine_weights = topk_weights
        if apply_router_weight_on_input:
            combine_weights = torch.ones_like(topk_weights)

        self.a2a.combine(
            out_tokens=output,
            indices=topk_ids,
            weights=combine_weights,
            expert_y=fused_expert_output,
            bound_m=bound_m,
        )
vllm/model_executor/layers/fused_moe/prepare_finalize.py
0 → 100644
View file @
f9c069c8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.model_executor.layers.fused_moe.moe_permute_unpermute
import
(
_moe_unpermute_and_reduce
)
from
vllm.model_executor.layers.fused_moe.utils
import
(
moe_kernel_quantize_input
)
class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
    """
    Prepare/finalize stage that performs no dispatch or combine: `prepare`
    only (optionally) weights and quantizes the input, `finalize`
    reduces the expert outputs into the output tensor.
    """

    def __init__(
        self,
        quant_dtype: Optional[torch.dtype] = None,
        per_channel_quant: bool = False,
        block_shape: Optional[list[int]] = None,
    ):
        super().__init__()
        self.quant_dtype = quant_dtype
        self.per_channel_quant = per_channel_quant
        self.block_shape = block_shape

    def prepare(
        self,
        a1: torch.Tensor,
        a1_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: Optional[torch.Tensor],
        apply_router_weight_on_input: bool = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Quantize `a1` with the configured dtype / granularity.

        When `apply_router_weight_on_input` is set, `a1` is first multiplied
        by `topk_weights` IN PLACE (the caller's tensor is modified).

        Returns (a1q, a1q_scale, None); the third slot is always None here.
        """
        if apply_router_weight_on_input:
            # TODO: this only works for topK=1, will need to update for topK>1
            num_selected = topk_ids.size(1)
            assert num_selected == 1, \
                "apply_router_weight_on_input is only implemented for topk=1"
            a1.mul_(topk_weights.to(a1.dtype))

        quantized, quantized_scale = moe_kernel_quantize_input(
            a1,
            a1_scale,
            self.quant_dtype,
            self.per_channel_quant,
            self.block_shape,
        )
        return quantized, quantized_scale, None

    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        Weight and reduce `fused_expert_output` into `output`; no inverse
        permutation is supplied to the helper.
        """
        _moe_unpermute_and_reduce(
            output,
            fused_expert_output,
            None,  # inv_perm
            topk_weights,
            apply_router_weight_on_input,
        )
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
0 → 100644
View file @
f9c069c8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Optional
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.model_executor.layers.fused_moe.deep_gemm_moe
import
(
DeepGemmExperts
,
_valid_deep_gemm
,
_valid_deep_gemm_shape
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
TritonExperts
class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
    """
    Experts implementation that holds both a TritonExperts and a
    DeepGemmExperts instance and picks one of them per call.
    """

    def __init__(self,
                 use_fp8_w8a8: bool = False,
                 use_int8_w8a8: bool = False,
                 use_int8_w8a16: bool = False,
                 use_int4_w4a16: bool = False,
                 per_channel_quant: bool = False,
                 block_shape: Optional[list[int]] = None,
                 block_m: Optional[int] = None,
                 allow_deep_gemm: bool = False):
        super().__init__()
        self.allow_deep_gemm = allow_deep_gemm
        self.use_fp8_w8a8 = use_fp8_w8a8
        self.triton_expert = TritonExperts(
            use_fp8_w8a8=use_fp8_w8a8,
            use_int8_w8a8=use_int8_w8a8,
            use_int4_w4a16=use_int4_w4a16,
            use_int8_w8a16=use_int8_w8a16,
            per_channel_quant=per_channel_quant,
            block_shape=block_shape,
            block_m=block_m,
        )
        self.deep_gemm_expert = DeepGemmExperts()

    def workspace_shapes(
        self,
        a: torch.Tensor,
        M: int,
        N: int,
        K: int,
        topk: int,
        num_experts: int,
    ) -> tuple[int, int, torch.dtype]:
        """
        Return the workspace requirements of whichever backend may run.
        """
        # Note: the deep gemm workspaces are strictly larger than the triton
        # workspaces so we can be pessimistic here and allocate for DeepGemm
        # even if we fall back to triton later, e.g. if expert maps are set.
        may_use_deep_gemm = (self.allow_deep_gemm
                             and _valid_deep_gemm_shape(M, N, K))
        backend = (self.deep_gemm_expert
                   if may_use_deep_gemm else self.triton_expert)
        return backend.workspace_shapes(a, M, N, K, topk, num_experts)

    def apply(
        self,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: str,
        global_num_experts: int,
        expert_map: Optional[torch.Tensor],
        w1_scale: Optional[torch.Tensor],
        w2_scale: Optional[torch.Tensor],
        w1_zp: Optional[torch.Tensor],
        w2_zp: Optional[torch.Tensor],
        a1q_scale: Optional[torch.Tensor],
        a2_scale: Optional[torch.Tensor],
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_num_tokens: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """
        Forward all arguments to the DeepGemm backend when it is allowed,
        fp8 w8a8 is in use, w1.size(1) exceeds 512 and `_valid_deep_gemm`
        accepts the inputs; otherwise forward them to the Triton backend.
        """
        N = w1.size(1)
        use_deep_gemm = (self.allow_deep_gemm and self.use_fp8_w8a8
                         and N > 512 and _valid_deep_gemm(
                             hidden_states, w1, w2, expert_map))
        backend = (self.deep_gemm_expert
                   if use_deep_gemm else self.triton_expert)
        return backend.apply(
            hidden_states,
            w1,
            w2,
            topk_ids,
            activation,
            global_num_experts,
            expert_map,
            w1_scale,
            w2_scale,
            w1_zp,
            w2_zp,
            a1q_scale,
            a2_scale,
            workspace13,
            workspace2,
            expert_num_tokens,
        )
vllm/model_executor/layers/fused_moe/utils.py
View file @
f9c069c8
...
...
@@ -7,6 +7,8 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
per_token_group_quant_fp8
)
from
vllm.model_executor.layers.quantization.utils.int8_utils
import
(
per_token_group_quant_int8
,
per_token_quant_int8
)
from
vllm.utils
import
cdiv
...
...
@@ -15,34 +17,81 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
Shrink the given tensor and apply the given view to it. This is
used to resize the intermediate fused_moe caches.
"""
assert
prod
(
v
)
<=
x
.
numel
()
assert
prod
(
v
)
<=
x
.
numel
(),
f
"
{
prod
(
v
)
}
<=
{
x
.
numel
()
}
"
# CUDAGRAPH unfriendly?
return
x
.
flatten
()[:
prod
(
v
)].
view
(
*
v
)
def
_fp8_quantize
(
A
:
torch
.
Tensor
,
A_scale
:
Optional
[
torch
.
Tensor
],
block_shape
:
Optional
[
list
[
int
]],
per_act_token
:
bool
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
Perform fp8 quantization on the inputs. If a block_shape
is provided, the output will be blocked.
"""
if
block_shape
is
None
:
A
,
A_scale
=
ops
.
scaled_fp8_quant
(
A
,
A_scale
)
A
,
A_scale
=
ops
.
scaled_fp8_quant
(
A
,
A_scale
,
use_per_token_if_dynamic
=
per_act_token
)
else
:
assert
len
(
block_shape
)
==
2
_
,
block_k
=
block_shape
[
0
],
block_shape
[
1
]
A
,
A_scale
=
per_token_group_quant_fp8
(
A
,
block_k
)
assert
cdiv
(
A
.
shape
[
-
1
],
block_k
)
==
A_scale
.
shape
[
-
1
]
assert
cdiv
(
A
.
size
(
-
1
),
block_k
)
==
A_scale
.
size
(
-
1
)
return
A
,
A_scale
def
_int8_quantize
(
A
:
torch
.
Tensor
,
A_scale
:
Optional
[
torch
.
Tensor
],
per_act_token
:
bool
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
Perform int8 quantization on the inputs. If a block_shape
is provided, the output will be blocked.
"""
# If weights are per-channel (per_channel_quant=True), then
# activations apply per-token quantization. Otherwise, assume
# activation tensor-wise fp8/int8 quantization, dynamic or static
if
block_shape
is
None
:
assert
per_act_token
,
\
"int8 quantization only supports block or channel-wise"
A
,
A_scale
=
per_token_quant_int8
(
A
)
else
:
assert
len
(
block_shape
)
==
2
_
,
block_k
=
block_shape
[
0
],
block_shape
[
1
]
A
,
A_scale
=
per_token_group_quant_int8
(
A
,
block_k
)
assert
cdiv
(
A
.
size
(
-
1
),
block_k
)
==
A_scale
.
size
(
-
1
)
return
A
,
A_scale
def
moe_kernel_quantize_input
(
A
:
torch
.
Tensor
,
A_scale
:
Optional
[
torch
.
Tensor
],
qtype
:
Optional
[
torch
.
dtype
],
per_channel_quant
:
bool
,
block_shape
:
Optional
[
list
[
int
]]
=
None
,
)
->
tuple
[
torch
.
Tensor
,
Optional
[
torch
.
Tensor
]]:
if
qtype
==
torch
.
float8_e4m3fn
:
return
_fp8_quantize
(
A
,
A_scale
,
per_channel_quant
,
block_shape
)
elif
qtype
==
torch
.
int8
:
return
_int8_quantize
(
A
,
A_scale
,
per_channel_quant
,
block_shape
)
else
:
assert
A_scale
is
None
return
A
,
A_scale
def
_fp8_perm
(
m
:
torch
.
Tensor
,
idx
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""
A permutation routine that works on fp8 types.
"""
if
torch
.
is_floating_point
(
m
)
and
torch
.
finfo
(
m
.
dtype
).
bits
==
8
:
if
torch
.
is_floating_point
(
m
)
and
m
.
dtype
.
itemsize
==
1
:
return
m
.
view
(
dtype
=
torch
.
uint8
)[
idx
,
...].
view
(
dtype
=
m
.
dtype
)
else
:
return
m
[
idx
,
...]
vllm/model_executor/layers/quantization/fp8.py
View file @
f9c069c8
# SPDX-License-Identifier: Apache-2.0
import
functools
import
importlib.util
from
typing
import
Any
,
Callable
,
Optional
...
...
@@ -9,6 +10,7 @@ from torch.nn import Module
from
torch.nn.parameter
import
Parameter
import
vllm.envs
as
envs
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm
import
_custom_ops
as
ops
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
...
...
@@ -434,6 +436,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
"""
def
__init__
(
self
,
quant_config
:
Fp8Config
):
from
vllm.model_executor.layers.fused_moe
import
fused_experts
self
.
quant_config
=
quant_config
self
.
block_quant
=
self
.
quant_config
.
weight_block_size
is
not
None
...
...
@@ -458,6 +461,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
logger
.
warning_once
(
"DeepGemm not supported on the current platform."
)
self
.
fused_experts
=
functools
.
partial
(
fused_experts
,
block_shape
=
self
.
quant_config
.
weight_block_size
,
allow_deep_gemm
=
self
.
allow_deep_gemm
)
def
create_weights
(
self
,
layer
:
Module
,
num_experts
:
int
,
hidden_size
:
int
,
intermediate_size_per_partition
:
int
,
params_dtype
:
torch
.
dtype
,
**
extra_weight_attrs
):
...
...
@@ -783,6 +791,31 @@ class Fp8MoEMethod(FusedMoEMethodBase):
del
layer
.
w13_input_scale
del
layer
.
w2_input_scale
def set_prepare_finalize(
    self,
    dp_size: int,
    world_size: int,
    prepare_finalize: mk.FusedMoEPrepareAndFinalize,
) -> bool:
    """
    Replace `self.fused_experts` with a modular kernel built from
    `prepare_finalize` and a TritonOrDeepGemmExperts instance.

    Nothing is changed and False is returned when Marlin or the ROCm AITER
    MoE path is enabled; otherwise True is returned. `dp_size` and
    `world_size` are accepted but not used here.
    """
    from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
        TritonOrDeepGemmExperts)

    modular_supported = not (self.use_marlin or self.rocm_aiter_moe_enabled)
    if modular_supported:
        fp8_experts = TritonOrDeepGemmExperts(
            use_fp8_w8a8=True,
            block_shape=self.quant_config.weight_block_size,
            allow_deep_gemm=self.allow_deep_gemm,
        )
        self.fused_experts = mk.FusedMoEModularKernel(
            prepare_finalize,
            fp8_experts,
        )
    return modular_supported
def
apply
(
self
,
layer
:
torch
.
nn
.
Module
,
...
...
@@ -801,10 +834,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input
:
bool
=
False
,
activation
:
str
=
"silu"
,
)
->
torch
.
Tensor
:
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
rocm_aiter_fused_experts
)
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
hidden_states
=
x
,
router_logits
=
router_logits
,
...
...
@@ -819,6 +848,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
)
if
self
.
rocm_aiter_moe_enabled
:
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
# noqa: E501
rocm_aiter_fused_experts
)
return
rocm_aiter_fused_experts
(
x
,
layer
.
w13_weight
,
...
...
@@ -835,8 +866,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
a1_scale
=
layer
.
w13_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
block_shape
=
self
.
quant_config
.
weight_block_size
)
if
self
.
use_marlin
:
elif
self
.
use_marlin
:
assert
activation
==
"silu"
,
(
f
"
{
activation
}
not supported for Marlin MoE."
)
assert
not
apply_router_weight_on_input
,
(
...
...
@@ -853,28 +883,26 @@ class Fp8MoEMethod(FusedMoEMethodBase):
quant_type_id
=
scalar_types
.
float8_e4m3fn
.
id
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
)
return
fused_experts
(
x
,
layer
.
w13_weight
,
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
inplace
=
True
,
activation
=
activation
,
use_fp8_w8a8
=
True
,
global_num_experts
=
global_num_experts
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
expert_map
=
expert_map
,
w1_scale
=
(
layer
.
w13_weight_scale_inv
if
self
.
block_quant
else
layer
.
w13_weight_scale
),
w2_scale
=
(
layer
.
w2_weight_scale_inv
if
self
.
block_quant
else
layer
.
w2_weight_scale
),
a1_scale
=
layer
.
w13_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
block_shape
=
self
.
quant_config
.
weight_block_size
,
allow_deep_gemm
=
self
.
allow_deep_gemm
,
)
else
:
return
self
.
fused_experts
(
hidden_states
=
x
,
w1
=
layer
.
w13_weight
,
w2
=
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
inplace
=
True
,
activation
=
activation
,
use_fp8_w8a8
=
True
,
global_num_experts
=
global_num_experts
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
expert_map
=
expert_map
,
w1_scale
=
(
layer
.
w13_weight_scale_inv
if
self
.
block_quant
else
layer
.
w13_weight_scale
),
w2_scale
=
(
layer
.
w2_weight_scale_inv
if
self
.
block_quant
else
layer
.
w2_weight_scale
),
a1_scale
=
layer
.
w13_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
)
class
Fp8KVCacheMethod
(
BaseKVCacheMethod
):
...
...
vllm/model_executor/models/dbrx.py
View file @
f9c069c8
...
...
@@ -79,7 +79,6 @@ class DbrxExperts(FusedMoE):
prefix
=
prefix
,
)
self
.
config
=
config
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
self
.
d_model
=
config
.
d_model
self
.
intermediate_size
=
(
self
.
config
.
ffn_config
.
ffn_hidden_size
//
self
.
tp_size
)
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
f9c069c8
...
...
@@ -31,9 +31,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
ModelConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
...
...
@@ -143,7 +141,8 @@ class DeepseekV2MoE(nn.Module):
intermediate_size
=
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
reduce_results
=
False
,
reduce_results
=
self
.
experts
.
must_reduce_shared_expert_outputs
(
),
prefix
=
f
"
{
prefix
}
.shared_experts"
,
)
...
...
@@ -154,6 +153,7 @@ class DeepseekV2MoE(nn.Module):
shared_output
=
self
.
shared_experts
(
hidden_states
)
# router_logits: (num_tokens, n_experts)
router_logits
,
_
=
self
.
gate
(
hidden_states
)
if
hidden_states
.
dtype
!=
torch
.
float16
:
final_hidden_states
=
self
.
experts
(
hidden_states
=
hidden_states
,
...
...
@@ -171,9 +171,11 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details.
final_hidden_states
=
final_hidden_states
+
shared_output
\
*
(
1.
/
self
.
routed_scaling_factor
)
if
self
.
tp_size
>
1
:
final_hidden_states
=
tensor_model_parallel_all_reduce
(
final_hidden_states
)
final_hidden_states
=
(
self
.
experts
.
maybe_all_reduce_tensor_model_parallel
(
final_hidden_states
))
return
final_hidden_states
.
view
(
num_tokens
,
hidden_dim
)
...
...
vllm/model_executor/models/llama4.py
View file @
f9c069c8
...
...
@@ -25,8 +25,7 @@ from transformers import Llama4TextConfig
from
vllm.attention
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
QKVParallelLinear
,
...
...
@@ -89,7 +88,7 @@ class Llama4MoE(nn.Module):
quant_config
=
quant_config
,
bias
=
False
,
prefix
=
f
"
{
prefix
}
.shared_expert"
,
reduce_results
=
False
,
# We need to do scatter before reduce
reduce_results
=
self
.
experts
.
must_reduce_shared_expert_outputs
(),
)
def
forward
(
self
,
hidden_states
):
...
...
@@ -102,7 +101,8 @@ class Llama4MoE(nn.Module):
experts_out
=
routed_out
+
shared_out
if
self
.
tp_size
>
1
:
experts_out
=
tensor_model_parallel_all_reduce
(
experts_out
)
experts_out
=
self
.
experts
.
maybe_all_reduce_tensor_model_parallel
(
experts_out
)
return
experts_out
...
...
vllm/model_executor/models/qwen2_moe.py
View file @
f9c069c8
...
...
@@ -33,9 +33,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
...
...
@@ -129,7 +127,8 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
intermediate_size
=
config
.
shared_expert_intermediate_size
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
reduce_results
=
False
,
reduce_results
=
self
.
experts
.
must_reduce_shared_expert_outputs
(
),
)
else
:
self
.
shared_expert
=
None
...
...
@@ -156,7 +155,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
if
shared_output
is
not
None
:
final_hidden_states
=
final_hidden_states
+
shared_output
if
self
.
tp_size
>
1
:
final_hidden_states
=
tensor_model_parallel
_all_reduce
(
final_hidden_states
=
self
.
experts
.
maybe_all_reduce_
tensor_model_parallel
(
# noqa E501
final_hidden_states
)
return
final_hidden_states
.
view
(
orig_shape
)
...
...
vllm/model_executor/models/qwen3_moe.py
View file @
f9c069c8
...
...
@@ -30,9 +30,7 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
...
...
@@ -137,7 +135,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
router_logits
=
router_logits
)
final_hidden_states
=
final_hidden_states
if
self
.
tp_size
>
1
:
final_hidden_states
=
tensor_model_parallel
_all_reduce
(
final_hidden_states
=
self
.
experts
.
maybe_all_reduce_
tensor_model_parallel
(
# noqa E501
final_hidden_states
)
return
final_hidden_states
.
view
(
orig_shape
)
...
...
vllm/platforms/cuda.py
View file @
f9c069c8
...
...
@@ -158,6 +158,7 @@ class CudaPlatformBase(Platform):
"currently not supported with CUDA Graphs."
)
vllm_config
.
model_config
.
enforce_eager
=
True
compilation_config
.
use_cudagraph
=
False
compilation_config
.
use_inductor
=
False
@
classmethod
def
get_current_memory_usage
(
cls
,
...
...
vllm/v1/attention/backends/mla/common.py
View file @
f9c069c8
...
...
@@ -865,8 +865,10 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
assert
output
is
not
None
,
"Output tensor must be provided."
if
attn_metadata
is
None
:
# Profiling run.
return
output
# The zero fill is required when used with DP + EP
# to ensure all ranks within a DP group compute the
# same expert outputs.
return
output
.
fill_
(
0
)
num_actual_toks
=
attn_metadata
.
num_actual_tokens
...
...
vllm/v1/worker/gpu_worker.py
View file @
f9c069c8
...
...
@@ -341,7 +341,8 @@ def init_worker_distributed_environment(
distributed_init_method
,
local_rank
)
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
ensure_kv_transfer_initialized
(
vllm_config
)
...
...
vllm/v1/worker/tpu_worker.py
View file @
f9c069c8
...
...
@@ -265,4 +265,5 @@ def init_tpu_worker_distributed_environment(
backend
=
"gloo"
,
)
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
vllm/worker/cpu_worker.py
View file @
f9c069c8
...
...
@@ -390,7 +390,8 @@ class CPUWorker(LocalOrDistributedWorkerBase):
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
def
get_cache_block_size_bytes
(
self
)
->
int
:
"""Return the size in bytes of a single KV cache block.
...
...
vllm/worker/hpu_worker.py
View file @
f9c069c8
...
...
@@ -416,7 +416,8 @@ def init_worker_distributed_environment(
backend
=
'hccl'
)
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
if
torch
.
distributed
.
is_initialized
():
torch_world_size
=
torch
.
distributed
.
get_world_size
()
...
...
@@ -442,7 +443,8 @@ def init_worker_distributed_environment(
torch
.
distributed
.
all_reduce
(
dummy_tensor_hpu
)
assert
dummy_tensor_hpu
.
item
()
==
parallel_config
.
world_size
ensure_model_parallel_initialized
(
parallel_config
.
tensor_parallel_size
,
parallel_config
.
pipeline_parallel_size
)
parallel_config
.
pipeline_parallel_size
,
parallel_config
.
enable_expert_parallel
)
def
raise_if_cache_size_invalid
(
num_gpu_blocks
,
block_size
,
max_model_len
,
...
...
vllm/worker/tpu_worker.py
View file @
f9c069c8
...
...
@@ -76,7 +76,8 @@ class TPUWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
)
ensure_model_parallel_initialized
(
self
.
parallel_config
.
tensor_parallel_size
,
self
.
parallel_config
.
pipeline_parallel_size
)
self
.
parallel_config
.
pipeline_parallel_size
,
self
.
parallel_config
.
enable_expert_parallel
)
# Device initialization should happen after initializing the distributed
# runtime.
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment