Unverified Commit 809d83c2 authored by bnellnm's avatar bnellnm Committed by GitHub
Browse files

[MoE Refactor] Combine MoERunnerBase + DefaultMoERunner (#40560)


Signed-off-by: default avatarBill Nell <bnell@redhat.com>
parent 33ef1941
......@@ -38,8 +38,11 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
from vllm.model_executor.layers.fused_moe.router.router_factory import (
create_fused_moe_router,
)
from vllm.model_executor.layers.fused_moe.runner.moe_runner_factory import (
create_moe_runner,
from vllm.model_executor.layers.fused_moe.runner.moe_runner import (
MoERunner,
)
from vllm.model_executor.layers.fused_moe.runner.moe_runner_interface import (
MoERunnerInterface,
)
from vllm.model_executor.layers.fused_moe.runner.shared_experts import (
SharedExperts,
......@@ -586,7 +589,7 @@ class FusedMoE(PluggableLayer):
# Storing the runner in the FusedMoE is an intermediate state, eventually
# the runner will own the FusedMoE layer and provide the execution interface
# for MoE ops.
self.runner = create_moe_runner(
self.runner: MoERunnerInterface = MoERunner(
layer_name=self.layer_name,
moe_config=self.moe_config,
router=self.router,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.distributed import (
get_ep_group,
get_pcp_group,
)
from vllm.model_executor.layers.fused_moe.runner.moe_runner_base import MoERunnerBase
class DefaultMoERunner(MoERunnerBase):
"""
Standard MoE runner implementation for executing Mixture of Experts layers.
This is the primary concrete implementation of MoE execution logic, providing
comprehensive support for standard MoE operations. It handles:
- Expert routing and token dispatching using various routing strategies
- Shared experts computation with optional parallel execution using CUDA streams
- Tensor model parallel and expert parallel operations
- Multiple quantization methods and optimized kernel selection
- Both monolithic and decomposed expert execution paths
- Integration with various parallel execution modes (TP, EP, DP)
The runner orchestrates the complete MoE forward pass including routing tokens
to experts, executing expert computations in parallel, and combining results.
It supports advanced features like overlapped execution of shared experts,
optimized kernels for different parallel configurations, and seamless
integration with vLLM's distributed execution framework.
This implementation is suitable for most standard MoE use cases. For specialized
scenarios like large batch chunking, alternative runners like ChunkingMoERunner
may be more appropriate.
Eventually, this class may be split into more specialized implementations
for different configurations (e.g., with/without shared experts, gates, etc.).
"""
@property
def do_naive_dispatch_combine(self) -> bool:
return (
self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk
)
def _maybe_dispatch(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
# For naive dispatch/combine Dp/Ep, dispatch the hidden states and
# router logits to all experts.
# NOTE: this will be removed once all kernels are migrated into the
# MoEKernel framework.
if self.do_naive_dispatch_combine:
res = get_ep_group().dispatch_router_logits(
hidden_states,
router_logits,
self.moe_config.is_sequence_parallel,
)
assert len(res) == 2
hidden_states, router_logits = res
# NOTE: Similar with DP, PCP also needs dispatch and combine. For
# simplicity, AgRsAll2All was added separately for PCP here. Maybe
# we should modify All2AllManager abstraction to better support PCP.
if self.moe_config.pcp_size > 1:
hidden_states = get_pcp_group().all_gather(
hidden_states,
dim=0,
)
router_logits = get_pcp_group().all_gather(
router_logits,
dim=0,
)
return hidden_states, router_logits
def _maybe_combine(
self,
shared_output: torch.Tensor | None,
hidden_states: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
if self.do_naive_dispatch_combine:
hidden_states = get_ep_group().combine(
hidden_states, self.moe_config.is_sequence_parallel
)
if self.moe_config.pcp_size > 1:
hidden_states = get_pcp_group().reduce_scatter(
hidden_states,
dim=0,
)
if self.shared_experts is not None:
assert shared_output is not None
return shared_output, hidden_states
else:
return hidden_states
def _forward_impl(
self,
layer: torch.nn.Module,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
shared_experts_input: torch.Tensor | None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
# TODO(bnell): parts of the dispatch/combine steps will go away once
# #32567 lands and the remaining kernels are made MKs. The PCP
# code will probably remain
hidden_states, router_logits = self._maybe_dispatch(
layer,
hidden_states,
router_logits,
)
shared_output, hidden_states = self._apply_quant_method(
layer=layer,
hidden_states=hidden_states,
router_logits=router_logits,
shared_experts_input=shared_experts_input,
)
return self._maybe_combine(
shared_output,
hidden_states,
)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.layers.fused_moe.config import (
FusedMoEConfig,
)
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
FusedMoEMethodBase,
)
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import (
FusedMoERouter,
)
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import (
DefaultMoERunner,
)
from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner
from vllm.model_executor.layers.fused_moe.runner.shared_experts import (
SharedExperts,
)
def create_moe_runner(
layer_name: str,
moe_config: FusedMoEConfig,
router: FusedMoERouter,
routed_input_transform: torch.nn.Module | None,
gate: torch.nn.Module | None,
shared_experts: SharedExperts | None,
quant_method: FusedMoEMethodBase,
enable_dbo: bool,
routed_output_transform: torch.nn.Module | None = None,
routed_scaling_factor: float = 1.0,
) -> MoERunner:
return DefaultMoERunner(
layer_name,
moe_config,
router,
routed_input_transform,
gate,
shared_experts,
quant_method,
enable_dbo,
routed_output_transform=routed_output_transform,
routed_scaling_factor=routed_scaling_factor,
)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
import torch
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import (
FusedMoEMethodBase,
)
from vllm.model_executor.layers.fused_moe.runner.shared_experts import (
SharedExperts,
)
class MoERunnerInterface(ABC):
"""
Abstract base class for Mixture of Experts (MoE) runners.
This class defines the interface that all MoE runner implementations must follow.
MoE runners are responsible for executing the forward pass of MoE layers, handling
expert routing, and managing tensor parallel operations.
"""
@abstractmethod
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor:
raise NotImplementedError
@abstractmethod
def is_internal_router(self) -> bool:
raise NotImplementedError
@property
@abstractmethod
def shared_experts(self) -> SharedExperts | None:
raise NotImplementedError
# TODO(bnell): temporary hack, do not call this method.
@abstractmethod
def _replace_quant_method(self, quant_method: FusedMoEMethodBase):
raise NotImplementedError
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment