Unverified commit f02b3269, authored by Hexiang Wang, committed by GitHub
Browse files

[PluggableLayer][3/N] Apply PluggableLayer to moe-related layers. (#33556)


Signed-off-by: whx-sjtu <2952154980@qq.com>
parent e1e318af
...@@ -18,7 +18,7 @@ from vllm.distributed import ( ...@@ -18,7 +18,7 @@ from vllm.distributed import (
) )
from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState from vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import PluggableLayer
from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.activation import MoEActivation
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig, FusedMoEConfig,
...@@ -213,8 +213,8 @@ def get_compressed_expert_map(expert_map: torch.Tensor) -> str: ...@@ -213,8 +213,8 @@ def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
# --8<-- [start:fused_moe] # --8<-- [start:fused_moe]
@CustomOp.register("fused_moe") @PluggableLayer.register("fused_moe")
class FusedMoE(CustomOp): class FusedMoE(PluggableLayer):
"""FusedMoE layer for MoE models. """FusedMoE layer for MoE models.
This layer contains both MergedColumnParallel weights (gate_up_proj / This layer contains both MergedColumnParallel weights (gate_up_proj /
...@@ -1532,7 +1532,7 @@ class FusedMoE(CustomOp): ...@@ -1532,7 +1532,7 @@ class FusedMoE(CustomOp):
""" """
return self.runner.maybe_all_reduce_tensor_model_parallel(final_hidden_states) return self.runner.maybe_all_reduce_tensor_model_parallel(final_hidden_states)
def forward_native( def forward(
self, self,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
router_logits: torch.Tensor, router_logits: torch.Tensor,
...@@ -1548,13 +1548,6 @@ class FusedMoE(CustomOp): ...@@ -1548,13 +1548,6 @@ class FusedMoE(CustomOp):
self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask
) )
def forward_cuda(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
return self.forward_native(hidden_states, router_logits)
@classmethod @classmethod
def make_expert_params_mapping( def make_expert_params_mapping(
cls, cls,
......
...@@ -24,7 +24,7 @@ import torch.nn as nn ...@@ -24,7 +24,7 @@ import torch.nn as nn
from vllm.config.utils import getattr_iter from vllm.config.utils import getattr_iter
from vllm.distributed import get_dp_group, get_ep_group from vllm.distributed import get_dp_group, get_ep_group
from vllm.forward_context import ForwardContext, get_forward_context from vllm.forward_context import ForwardContext, get_forward_context
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import PluggableLayer
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.models.interfaces import MixtureOfExperts from vllm.model_executor.models.interfaces import MixtureOfExperts
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
...@@ -38,7 +38,7 @@ if TYPE_CHECKING: ...@@ -38,7 +38,7 @@ if TYPE_CHECKING:
# --8<-- [start:transformers_fused_moe] # --8<-- [start:transformers_fused_moe]
@CustomOp.register("transformers_fused_moe") @PluggableLayer.register("transformers_fused_moe")
class TransformersFusedMoE(FusedMoE): class TransformersFusedMoE(FusedMoE):
"""Custom FusedMoE for the Transformers modeling backend.""" """Custom FusedMoE for the Transformers modeling backend."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment