Merge branch 'v0.9.2-dev-ds' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev-ds

55f7b089 · zhuwenwen · 5ca1259e · ab485158 · 55f7b089 · 55f7b089
Commit 55f7b089 authored Nov 03, 2025 by zhuwenwen
20 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4755,7 +4755,7 @@ class VllmConfig:
            batch_size_capture_list = []
            if self.model_config is not None and \
                not self.model_config.enforce_eager:
-                if self.model_config.use_mla and self.compilation_config.full_cuda_graph and self.scheduler_config.max_num_seqs<=512:
+                if self.model_config.use_mla and self.scheduler_config.max_num_seqs<=512:
                    cuda_graph_sizes = [self.scheduler_config.max_num_seqs]
                else:
                    cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes 

--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -140,7 +140,7 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):

        # This is the DeepEP default. Stick to it till we can establish
        # reasonable defaults based on profiling.
-        self.num_sms = 20
+        self.num_sms = 24#20

    def get_handle(self, kwargs):
        raise NotImplementedError
@@ -166,13 +166,21 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):

    def _make_all2all_kwargs(self) -> dict[Any, Any]:
        # Defaults for internode and intranode are taken from DeepEP tests.
-        num_nvl_bytes = 1024 * 1024 * 1024
+        num_nvl_bytes = int(2e9/2)#1024 * 1024 * 1024
        num_rdma_bytes = None
        num_qps_per_rank = None

        if self.internode:
-            num_rdma_bytes = 1024 * 1024 * 1024
-            num_qps_per_rank = self.num_sms // 2
+            num_rdma_bytes = int(1e9/2) #1024 * 1024 * 1024
+            num_qps_per_rank = 30 #self.num_sms // 2
+
+            # import deep_ep
+            # num_nvl_bytes, num_rdma_bytes = 0, 0
+            # hidden_size = 7168
+            # hidden_bytes = hidden_size * 2
+            # for config in (deep_ep.Buffer.get_dispatch_config(self.cpu_group.size()), deep_ep.Buffer.get_combine_config(self.cpu_group.size())):
+            #     num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, self.cpu_group.size()), num_nvl_bytes)
+            #     num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, self.cpu_group.size()), num_rdma_bytes)
        else:
            num_rdma_bytes = 0
            num_qps_per_rank = 1
@@ -183,7 +191,9 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
                    num_nvl_bytes=num_nvl_bytes,
                    num_rdma_bytes=num_rdma_bytes,
                    low_latency_mode=False,
-                    num_qps_per_rank=num_qps_per_rank)
+                    num_qps_per_rank=num_qps_per_rank,
+                    explicitly_destroy=False,
+                    use_default_stream_as_comm_stream=False)

    def get_handle(self, kwargs):


--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -87,6 +87,8 @@ class CudaCommunicator(DeviceCommunicatorBase):
                from .all2all import DeepEPLLAll2AllManager
                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
                logger.info("Using DeepEP Low-Latency all2all manager.")
+            elif all2all_backend == "mori":
+                pass
            else:
                raise ValueError(f"Unknown all2all backend: {all2all_backend}")


--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -951,7 +951,7 @@ def init_distributed_environment(

        parallel_config = config.parallel_config
        data_parallel_size = parallel_config.data_parallel_size
-        use_mori_ep = envs.VLLM_USE_MORI_EP and data_parallel_size > 1 and parallel_config.enable_expert_parallel
+        use_mori_ep = envs.VLLM_ALL2ALL_BACKEND == 'mori' and data_parallel_size > 1 and parallel_config.enable_expert_parallel
        if use_mori_ep:
            backend="cpu:gloo,cuda:nccl"
        torch.distributed.init_process_group(

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -173,9 +173,9 @@ if TYPE_CHECKING:
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
    USE_FUSED_RMS_QUANT: bool = False
    USE_FUSED_SILU_MUL_QUANT: bool = False
-    VLLM_USE_MORI_EP: bool = False
    VLLM_P2P_ASYNC: bool = False
    VLLM_P2P_BUF_TOKENS: int = 30000
+    VLLM_ENABLE_MOE_GROUP_GEMM: bool = False

 def get_default_cache_root():
    return os.getenv(
@@ -945,6 +945,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # - "pplx": use pplx kernels
    # - "deepep_high_throughput", use deepep high-throughput kernels
    # - "deepep_low_latency", use deepep low-latency kernels
+    # - "mori", use mori kernels
    "VLLM_ALL2ALL_BACKEND":
    lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),

@@ -1144,11 +1145,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (os.getenv('USE_FUSED_SILU_MUL_QUANT', '0').lower() in
             ("true", "1")),

-    # vLLM will use all_to_all ep mode
-    "VLLM_USE_MORI_EP":
-        lambda: (os.environ.get("VLLM_USE_MORI_EP", "True").lower() in
-                 ("true", "1")),
-
    # vllm pd separation will be used async
    "VLLM_P2P_ASYNC":
    lambda: bool(int(os.getenv("VLLM_P2P_ASYNC", "0"))),
@@ -1156,6 +1152,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # pd separation p2p async buf tokens
    "VLLM_P2P_BUF_TOKENS":
    lambda: int(os.getenv("VLLM_P2P_BUF_TOKENS", "30000")),
+
+    # enable the MoE group GEMM path (disabled by default)
+    "VLLM_ENABLE_MOE_GROUP_GEMM":
+        lambda: (os.environ.get("VLLM_ENABLE_MOE_GROUP_GEMM", "False").lower() in
+                 ("true", "1")),  
 }

 # --8<-- [end:env-vars-definition]

--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -136,8 +136,8 @@ def set_forward_context(
        forward_start_time = time.perf_counter()
    dp_metadata: Optional[DPMetadata] = None
    dp_size = vllm_config.parallel_config.data_parallel_size
-    use_mori_ep = envs.VLLM_USE_MORI_EP and dp_size > 1 and vllm_config.parallel_config.enable_expert_parallel
-    if not use_mori_ep and dp_size > 1 and (
+    use_navie_ep = envs.VLLM_ALL2ALL_BACKEND == 'naive' and dp_size > 1 and vllm_config.parallel_config.enable_expert_parallel
+    if use_navie_ep and dp_size > 1 and (
            attn_metadata is not None or num_tokens is not None) :
        dp_metadata = DPMetadata.make(vllm_config.parallel_config,
                                      attn_metadata, num_tokens or 0,

--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -59,6 +59,8 @@ if HAS_TRITON:
        get_config_file_name, grouped_topk)
    from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
        TritonOrDeepGemmExperts)
+    from vllm.model_executor.layers.fused_moe.triton_group_gemm_moe import (
+        TritonOrGroupGemmExperts)

    __all__ += [
        "fused_moe",
@@ -75,4 +77,5 @@ if HAS_TRITON:
        "BatchedDeepGemmExperts",
        "TritonOrDeepGemmExperts",
        "BatchedTritonOrDeepGemmExperts",
+        "TritonOrGroupGemmExperts",
    ]
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -4,12 +4,15 @@ from typing import Optional

 import deep_ep
 import torch
+import torch.distributed as dist

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.utils import (
    moe_kernel_quantize_input)
+from vllm.distributed.parallel_state import get_ep_group
+    


 class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
@@ -54,6 +57,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        if self.dp_size not in self.available_rank_configs:
            return None
        return deep_ep.Buffer.get_combine_config(self.dp_size)
+    
+    def sync(self):
+        # torch.cuda.synchronize()
+        dist.barrier()

    def _do_dispatch(self, tokens: torch.Tensor,
                     token_scales: Optional[torch.Tensor],
@@ -205,13 +212,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):

    def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
                 topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                 apply_router_weight_on_input: bool) -> None:
+                 apply_router_weight_on_input: bool,
+                 apply_weights_and_reduce: bool = True) -> None:

        assert self.handle is not None

        # fused_expert_output can have 0 tokens - This happens when none of the
        # tokens from the all2all reach this EP rank.
-        if fused_expert_output.numel() != 0:
+        if fused_expert_output.numel() != 0 and apply_weights_and_reduce:
            fused_expert_output = self._apply_weights_and_reduce(
                num_tokens=topk_ids.size(0),
                fused_expert_output=fused_expert_output,
@@ -227,5 +235,6 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            previous_event=None,
            async_finish=False,
            allocate_on_comm_stream=False)
+
        # Respect inplace outputs.
        output.copy_(combined_x, non_blocking=True)
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -162,7 +162,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):

    def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
                 topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                 apply_router_weight_on_input: bool) -> None:
+                 apply_router_weight_on_input: bool,
+                 apply_weights_and_reduce: bool = True) -> None:

        assert self.handle is not None


--- a/vllm/model_executor/layers/fused_moe/ep_moe/token_dispatcher.py
+++ b/vllm/model_executor/layers/fused_moe/ep_moe/token_dispatcher.py
-
-import os
-from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-
-from vllm.distributed.parallel_state import (get_dp_group,
-                                                     get_tp_group,
-                                                     get_ep_group,
-                                                     get_tensor_model_parallel_rank)
-from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import (EPSharedExperts, 
-                                                                      maybe_move_tensor_to_cpu,
-                                                                      maybe_move_tensor_to_cpu_block,
-                                                                      permute,
-                                                                      sort_chunks_by_idxs,
-                                                                      unpermute,
-                                                                      all_to_all,
-                                                                      EpMoeConfig)
-from vllm.distributed import (tensor_model_parallel_all_gather,
-                              tensor_model_parallel_gather,
-                              expert_parallel_all_gather,
-                              expert_parallel_gather)
-from vllm.platforms import current_platform
-from vllm.forward_context import ForwardContext, get_forward_context
-from vllm.utils import direct_register_custom_op
-from vllm.config import get_current_vllm_config
-from lightop import groupgemm_permute, groupgemm_unpermute
-
-
-cuda_dtoh_stream = torch.cuda.Stream()
-cuda_dtoh_sync_event = torch.cuda.Event(enable_timing=False)
-
-
-class MoETokenDispatcher(nn.Module):
-    """
-    MoE Token Dispatcher
-    """
-
-    def __init__(self, config: EpMoeConfig) -> None:
-        """
-        Initialize the MoE Token Dispatcher.
-        """
-        super().__init__()
-        self.config = config
-
-        self.tp_size = 1
-        self.ep_size = config.ep_size
-
-    @property
-    def ep_group(self):
-        """Get expert model parallel group."""
-        return get_ep_group()
-
-    @property
-    def tp_group(self):
-        """Get expert tensor parallel group."""
-        return get_tp_group()
-
-    @property
-    def tp_rank(self):
-        """Get expert tensor parallel rank."""
-        return 0#get_tensor_model_parallel_rank()
-
-    @property
-    def tp_ep_group(self):
-        """Get expert tensor and model parallel group."""
-        return get_ep_group()
-
-    @abstractmethod
-    def token_permutation(
-        self, tokens: torch.Tensor, probs: torch.Tensor, routing_map: torch.Tensor
-    ):
-        """Dispatch tokens to experts.
-
-        Args:
-            tokens (torch.Tensor): Input tokens.
-            probs (torch.Tensor): The routing probability tensor [num_tokens, num_experts].
-            routing_map (torch.Tensor): Token to expert mapping tensor.
-
-        Returns:
-            torch.Tensor: Tokens tensor.
-        """
-        raise NotImplementedError("Dispatch function not implemented.")
-
-    @abstractmethod
-    def token_unpermutation(self, expert_output: torch.Tensor, bias: torch.Tensor = None):
-        """Restores the expert output to its original ordering.
-
-        Args:
-            expert_output (torch.Tensor): The output tensor from the expert models.
-            bias (torch.Tensor): The bias tensor.
-
-        Returns:
-            (torch.Tensor, torch.Tensor): Unpermuted activation and optional bias.
-        """
-        raise NotImplementedError("Restore function not implemented.")
-
-    def set_shared_experts(self, shared_experts):
-        """Set shared expert to the dispatcher."""
-        assert self.config.moe_shared_expert_overlap
-        self.shared_experts = shared_experts
-
-
-class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
-    """
-    AlltoAll-based token dispatcher.
-
-    The workflow of AlltoAll token dispatcher is as follows:
-    (1) preprocess(): calculate necessary metadata for communication and permute
-    (2) token_permutation(): permute->A2A(EP)->AG(TP)->sort_chunk(if num_local_experts>1)
-    (3) token_unpermutation(): sort_chunk(if num_local_experts>1)->RS(TP)->A2A(EP)->unpermute
-    """
-
-    def __init__(
-        self, num_local_experts: int, local_expert_indices: List[int], config: EpMoeConfig, layer_name: str=""
-    ) -> None:
-        """
-        Initialize the AlltoAll token dispatcher.
-
-        Args:
-            num_local_experts (int): Number of local experts on the current device.
-            local_expert_indices (List[int]): Indices of local experts on the current device.
-            config (TransformerConfig): Configuration for the transformer model.
-        """
-        super().__init__(config=config)
-        self.num_local_experts = num_local_experts
-        assert config.num_moe_experts is not None
-        self.num_experts = config.num_moe_experts
-        assert self.num_local_experts > 0, "Expected at least one expert"
-        self.local_expert_indices = local_expert_indices
-        assert (
-            len(self.local_expert_indices) == self.num_local_experts
-        ), "Invalid local expert indices"
-        for i in range(len(self.local_expert_indices) - 1):
-            assert (
-                self.local_expert_indices[i] == self.local_expert_indices[i + 1] - 1
-            ), "local_expert_indices must be continous"
-
-        self.layer_name = layer_name
-        # [ep_size]. Represents the number of tokens sent by the current rank to other
-        # EP ranks.
-        self.input_splits = None
-        # [ep_size]. Represents the number of tokens received by the current rank from
-        # other EP ranks.
-        self.output_splits = None
-        # [tp_size]. Represents the number of tokens received by the current rank from
-        # other TP ranks.
-        #self.output_splits_tp = None
-        self.permute_idx_device = torch.device("cuda") if self.config.moe_permute_fusion else None
-        input_chunk_idxs = torch.arange(
-            self.num_experts * self.tp_size, device=self.permute_idx_device
-        )
-        # [num_local_experts, tp_size * ep_size]. Sort the input chunks by local experts.
-        self.sort_input_by_local_experts = input_chunk_idxs.reshape(
-            -1, self.num_local_experts
-        ).T.ravel()
-        # [tp_size * ep_size, num_local_experts]. Restore the output chunks by local experts.
-        self.restore_output_by_local_experts = input_chunk_idxs.reshape(
-            self.num_local_experts, -1
-        ).T.ravel()
-
-        # A cuda stream synchronization is needed in self.token_permutation() in some cases,
-        # because there are several non-blocking DtoH data transfers called at
-        # `self.cuda_dtoh_point`. The synchronization happens at `self.cuda_sync_point`, which is
-        # decided based on the MoE and parallel settings. Valid points are "before_permutation_1",
-        # "before_ep_alltoall", "before_permutation_2", "before_finish", and "no_sync".
-        self.cuda_sync_point = "no_sync"
-        self.cuda_sync_point_priority = {
-            "before_permutation_1": 0,
-            "before_ep_alltoall": 1,
-            "before_permutation_2": 2,
-            "before_finish": 3,
-            "no_sync": 4,
-        }
-        self.cuda_dtoh_point = "before_permutation_1"
-        #self.cuda_dtoh_stream = torch.cuda.Stream()
-        
-        # Whether to use gather or all-gather to gather the logits.
-        self.use_all_gather = current_platform.use_all_gather()
-
-        self.probs = None
-
-        # For smuggling this layer into the fused moe custom op
-        vllm_config = get_current_vllm_config()
-        compilation_config = vllm_config.compilation_config
-        if layer_name in compilation_config.static_forward_context:
-            raise ValueError("Duplicate layer name: {}".format(layer_name))
-        compilation_config.static_forward_context[layer_name] = self
-
-    def preprocess(self, routing_map: torch.Tensor) -> torch.Tensor:
-        """
-        Preprocess token routing map for AlltoAll communication and token permutation.
-
-        This method computes the number of tokens assigned to each expert based on the routing_map.
-        It also initializes the necessary data structures for AlltoAll communication, such as input
-        and output splits, and the mapping between global tokens and local experts. This method
-        should not call any DtoH data copying due to performance consideration. The necessary DtoH
-        copies are made on the `self.cuda_dtoh_stream` at `self.cuda_dtoh_point`.
-
-        Args:
-            routing_map (torch.Tensor): The mapping of tokens to experts, with shape
-                [num_tokens, num_experts].
-
-        Returns:
-            torch.Tensor: Tensor containing the number of tokens assigned to local expert.
-        """
-        # [num_experts], number of tokens assigned to each expert from the current rank's input.
-        num_local_tokens_per_expert = routing_map.sum(dim=0).long()
-
-        self.num_out_tokens = routing_map.size(0) * self.config.moe_router_topk
-
-        # ===================================================
-        # Calculate input_splits, output_splits for alltoall/allgather in variable size.
-        # ===================================================
-        # [ep_size]. Represents the number of tokens sent by the current rank to other
-        # EP ranks.
-        self.input_splits = num_local_tokens_per_expert.reshape(
-            self.ep_size, self.num_local_experts
-        ).sum(axis=1)
-        # Gather the global distribution of tokens across ranks.
-        # num_global_tokens_per_expert represents the number of tokens sent to each
-        # expert by all ranks.
-        # [tp_size, ep_size, num_experts]
-        if self.use_all_gather:
-            # Gather is not supported for some devices such as TPUs.
-            # Use all-gather instead.
-            num_global_tokens_per_expert = expert_parallel_all_gather(num_local_tokens_per_expert) \
-                .reshape(self.ep_size, self.tp_size, self.num_experts) \
-                .transpose(0, 1)
-        else:
-            # None may be returned for rank > 0
-            num_global_tokens_per_expert = expert_parallel_gather(num_local_tokens_per_expert) \
-                .reshape(self.ep_size, self.tp_size, self.num_experts) \
-                .transpose(0, 1)
-
-        # [tp_size, ep_size, num_experts] -> [tp_size, ep_size, num_local_experts]
-        num_global_tokens_per_local_expert = num_global_tokens_per_expert[
-            :, :, self.local_expert_indices[0] : self.local_expert_indices[-1] + 1
-        ].contiguous()
-        # [tp_size, ep_size, num_local_experts] -> [tp_size, ep_size]
-        num_global_tokens_per_rank = num_global_tokens_per_local_expert.sum(axis=2)
-        # [tp_size, ep_size] -> [ep_size]
-        # self.output_splits represents the number of tokens received by the current rank
-        # from other EP rank.
-        self.output_splits = num_global_tokens_per_rank[self.tp_rank]
-        # [tp_size, ep_size] -> [tp_size]
-        # self.output_splits_tp represents the number of tokens received by the current
-        # rank from other TP rank.
-        #self.output_splits_tp = num_global_tokens_per_rank.sum(axis=1)
-        # [tp_size, ep_size, num_local_experts] -> [num_local_experts]
-        num_tokens_per_local_expert = num_global_tokens_per_local_expert.sum(dim=(0, 1))
-
-        # A synchronization is needed before expert parallel AlltoAll communication
-        # to get the `input_splits` and `output_splits` CPU values.
-        #self._maybe_update_cuda_sync_point("before_ep_alltoall")
-
-        if self.num_local_experts > 1:
-            # [tp_size * ep_size, num_local_experts]. Represents the number of tokens sent
-            # to each local expert by all ranks.
-            self.num_global_tokens_per_local_expert = num_global_tokens_per_local_expert.view(
-                -1, self.num_local_experts
-            )
-            # if not self.config.moe_permute_fusion:
-            #     # A synchronization is needed before permutation 2
-            #     # to get the `num_global_tokens_per_local_expert` CPU value.
-            #     self._maybe_update_cuda_sync_point("before_permutation_2")
-
-        # assert (
-        #     self.cuda_sync_point_priority[self.cuda_dtoh_point]
-        #     <= self.cuda_sync_point_priority[self.cuda_sync_point]
-        # ), "cuda_sync_point must be after cuda_dtoh_point."
-        return num_tokens_per_local_expert
-
-    def token_permutation(
-        self, hidden_states: torch.Tensor,
-        probs: torch.Tensor,
-        routing_map: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        self.routing_map = routing_map
-        assert routing_map.dim() == 2, "Expected 2D tensor for token2expert mask"
-        assert routing_map.dtype == torch.bool, "Expected bool tensor for mask"
-        tokens_per_expert = self.preprocess(self.routing_map)
-
-        if self.config.moe_shared_expert_overlap and self.shared_experts is not None:
-            self.shared_experts.pre_forward_comm(hidden_states.view(self.hidden_shape))
-        
-        global_input_tokens = torch.ops.vllm.token_permutation_forward(tokens_per_expert, hidden_states, 
-                                                                       probs, routing_map, self.layer_name)
-        return global_input_tokens, tokens_per_expert
-    
-    def token_permutation_impl(
-        self, 
-        tokens_per_expert: torch.Tensor,
-        hidden_states: torch.Tensor,
-        probs: torch.Tensor,
-        routing_map: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Dispatch tokens to local experts using AlltoAll communication.
-
-        This method performs the following steps:
-        1. Preprocess the routing map to get metadata for communication and permutation.
-        2. Permute input tokens for AlltoAll communication.
-        3. Perform expert parallel AlltoAll communication.
-        4. Sort tokens by local expert (if multiple local experts exist).
-
-        Args:
-            hidden_states (torch.Tensor): Input token embeddings.
-            probs (torch.Tensor): The probabilities of token to experts assignment.
-            routing_map (torch.Tensor): The mapping of token to experts assignment.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]:
-                - Permuted token embeddings for local experts.
-                - Number of tokens per expert.
-        """
-        # Preprocess: Get the metadata for communication, permutation and computation operations.
-        # Permutation 1: input to AlltoAll input
-        tokens_per_expert = self._maybe_dtoh_and_synchronize(
-            "before_permutation_1", tokens_per_expert
-        )
-        self.hidden_shape = hidden_states.shape
-        if self.config.apply_router_weight_on_input:
-            self.probs = probs
-            assert probs.dim() == 2, "Expected 2D tensor for probs"
-        
-        hidden_states = hidden_states.view(-1, self.hidden_shape[-1])
-
-        self.hidden_shape_before_permute = hidden_states.shape
-
-        if False:
-            permutated_local_input_tokens, self.reversed_local_input_permutation_mapping = permute(
-                hidden_states,
-                routing_map,
-                num_out_tokens=self.num_out_tokens,
-                fused=self.config.moe_permute_fusion
-            )
-        else:
-            cuda_permute_result = groupgemm_permute(hidden_states, routing_map)
-            
-            permutated_local_input_tokens, self.reversed_local_input_permutation_mapping, \
-                self.expert_m_count = cuda_permute_result
-
-        # Perform expert parallel AlltoAll communication
-        # tokens_per_expert = self._maybe_dtoh_and_synchronize(
-        #     "before_ep_alltoall", tokens_per_expert
-        # )
-
-        ###test##############
-        #cuda_dtoh_stream.synchronize()
-        cuda_dtoh_sync_event.synchronize()
-        ###test##############
-
-        global_input_tokens = all_to_all(
-            self.ep_group.device_group, permutated_local_input_tokens, self.output_splits, self.input_splits
-        )
-
-        if self.config.moe_shared_expert_overlap and self.shared_experts is not None:
-            self.shared_experts.linear_fc1_forward_and_act(global_input_tokens)
-
-        # Permutation 2: Sort tokens by local expert.
-        # tokens_per_expert = self._maybe_dtoh_and_synchronize(
-        #     "before_permutation_2", tokens_per_expert
-        # )
-        if self.num_local_experts > 1:
-            global_input_tokens = sort_chunks_by_idxs(
-                    global_input_tokens,
-                    self.num_global_tokens_per_local_expert.ravel(),
-                    self.sort_input_by_local_experts,
-                    fused=self.config.moe_permute_fusion,
-                )
-
-        #tokens_per_expert = self._maybe_dtoh_and_synchronize("before_finish", tokens_per_expert)
-
-        return global_input_tokens
-
-    def token_unpermutation(
-        self, hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-        return torch.ops.vllm.token_unpermutation_forward(hidden_states, self.layer_name)
-    
-    def token_unpermutation_impl(
-        self, hidden_states: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Reverse the token permutation to restore the original order.
-
-        This method performs the following steps:
-        1. Unsort tokens by local expert (if multiple local experts exist).
-        2. Perform expert parallel AlltoAll communication to restore the original order.
-        3. Unpermute tokens to restore the original order.
-
-        Args:
-            hidden_states (torch.Tensor): Output from local experts.
-            bias (torch.Tensor, optional): Bias tensor (not supported).
-
-        Returns:
-            Tuple[torch.Tensor, Optional[torch.Tensor]]:
-                - Unpermuted token embeddings in the original order.
-                - None (bias is not supported).
-        """
-        # Unpermutation 2: Unsort tokens by local expert.
-        if self.num_local_experts > 1:
-            hidden_states = sort_chunks_by_idxs(
-                hidden_states,
-                self.num_global_tokens_per_local_expert.T.ravel(),
-                self.restore_output_by_local_experts,
-                fused=self.config.moe_permute_fusion,
-            )
-
-        # Perform expert parallel AlltoAll communication
-        # hidden_states: [SEQL, H] -> [SEQL, H/TP]
-        permutated_local_input_tokens = all_to_all(
-            self.ep_group.device_group, hidden_states, self.input_splits, self.output_splits
-        )
-
-        if self.config.moe_shared_expert_overlap and self.shared_experts is not None:
-            self.shared_experts.linear_fc2_forward(permutated_local_input_tokens)
-            self.shared_experts.post_forward_comm()
-
-        # Unpermutation 1: AlltoAll output to output
-        if False:
-            output = unpermute(
-                permutated_local_input_tokens,
-                self.reversed_local_input_permutation_mapping,
-                restore_shape=self.hidden_shape_before_permute,
-                probs=self.probs,
-                routing_map=self.routing_map,
-                fused=self.config.moe_permute_fusion,
-            )
-        else:
-            output = groupgemm_unpermute(permutated_local_input_tokens,
-                                self.reversed_local_input_permutation_mapping,
-                                list(self.hidden_shape_before_permute),
-                                self.probs,
-                                self.routing_map,
-                                self.expert_m_count)
-
-        # Reshape the output tensor
-        output = output.view(self.hidden_shape)
-
-        # Add shared experts output
-        if self.config.moe_shared_expert_overlap and self.shared_experts is not None:
-            shared_output = self.shared_experts.get_output()
-
-            if hidden_states.dtype != torch.float16:
-                output = output + shared_output
-            else:
-                # Fix FP16 overflow
-                # See DeepseekV2DecoderLayer for more details.
-                output = output + shared_output \
-                    * (1. / self.config.routed_scaling_factor)
-        return output
-
-    def _maybe_update_cuda_sync_point(self, point: str):
-        """
-        Update the CUDA sync point if the priority of the new point is higher than the current
-        sync point, which means the new point is reached earlier than the current sync point.
-        """
-        if (
-            self.cuda_sync_point_priority[point]
-            < self.cuda_sync_point_priority[self.cuda_sync_point]
-        ):
-            self.cuda_sync_point = point
-
-    def _maybe_dtoh_and_synchronize(
-        self, point: str, tokens_per_expert: torch.Tensor = None
-    ) -> torch.Tensor:
-        """
-        Move all possible GPU tensors to CPU and make a synchronization at the expected point.
-        """
-        if point == self.cuda_dtoh_point:
-            # Move all possible GPU tensors to CPU at self.cuda_dtoh_point.
-            on_side_stream = torch.cuda.current_stream() != cuda_dtoh_stream
-            if on_side_stream:
-                cuda_dtoh_stream.wait_stream(torch.cuda.current_stream())
-            with torch.cuda.stream(cuda_dtoh_stream):
-                # TODO: use MemcpyBatchAsync instead.
-                # tokens_per_expert = maybe_move_tensor_to_cpu(
-                #     tokens_per_expert, record_stream=on_side_stream
-                # )
-                self.input_splits = maybe_move_tensor_to_cpu(
-                    self.input_splits, as_numpy=True, record_stream=on_side_stream
-                )
-                self.output_splits = maybe_move_tensor_to_cpu(
-                    self.output_splits, as_numpy=True, record_stream=on_side_stream
-                )
-                # self.output_splits_tp = maybe_move_tensor_to_cpu(
-                #     self.output_splits_tp, as_numpy=True, record_stream=on_side_stream
-                # )
-                self.num_out_tokens = maybe_move_tensor_to_cpu(
-                    self.num_out_tokens, record_stream=on_side_stream
-                )
-                if self.num_local_experts > 1 and not self.config.moe_permute_fusion:
-                    self.num_global_tokens_per_local_expert = maybe_move_tensor_to_cpu(
-                        self.num_global_tokens_per_local_expert, record_stream=on_side_stream
-                    )
-
-                cuda_dtoh_sync_event.record()
-
-            # if point == self.cuda_sync_point:
-            #     # Synchronize with the dtoh stream at self.cuda_sync_point.
-            #     cuda_dtoh_stream.synchronize()
-
-        return tokens_per_expert
-    
-
-def token_permutation_forward(tokens_per_expert: torch.Tensor,
-                              hidden_states: torch.Tensor,
-                              probs: torch.Tensor,
-                              routing_map: torch.Tensor,
-                              layer_name: str) -> torch.Tensor:
-    forward_context: ForwardContext = get_forward_context()
-    self = forward_context.no_compile_layers[layer_name]
-
-    return self.token_permutation_impl(tokens_per_expert, hidden_states, probs, routing_map)
-
-
-def token_permutation_forward_fake(tokens_per_expert: torch.Tensor,
-                                    hidden_states: torch.Tensor,
-                                    probs: torch.Tensor,
-                                    routing_map: torch.Tensor,
-                                    layer_name: str) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-direct_register_custom_op(
-    op_name="token_permutation_forward",
-    op_func=token_permutation_forward,
-    mutates_args=["tokens_per_expert", "hidden_states", "probs", "routing_map"],
-    fake_impl=token_permutation_forward_fake,
-    dispatch_key=current_platform.dispatch_key,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
-)
-
-
-def token_unpermutation_forward(hidden_states: torch.Tensor,
-                              layer_name: str) -> torch.Tensor:
-    forward_context: ForwardContext = get_forward_context()
-    self = forward_context.no_compile_layers[layer_name]
-
-    return self.token_unpermutation_impl(hidden_states)
-
-
-def token_unpermutation_forward_fake(hidden_states: torch.Tensor,
-                              layer_name: str) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-direct_register_custom_op(
-    op_name="token_unpermutation_forward",
-    op_func=token_unpermutation_forward,
-    mutates_args=["hidden_states"],
-    fake_impl=token_unpermutation_forward_fake,
-    dispatch_key=current_platform.dispatch_key,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
-)
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -596,6 +596,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
+        apply_weights_and_reduce: bool = True
    ) -> None:
        num_tokens = topk_ids.size(0)
        num_local_experts = fused_expert_output.size(0)

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -28,8 +28,9 @@ from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig, FusedMoEParallelConfig)
 # yapf: enable
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEActivationFormat, FusedMoEModularKernel,
-    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize)
+    FusedMoEActivationFormat, FusedMoEModularKernel, 
+    DeepGemmBannedFusedMoEModularKernel, FusedMoEPermuteExpertsUnpermute, 
+    FusedMoEPrepareAndFinalize)
 # from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
 #     is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -40,7 +41,7 @@ from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum

-from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx
+from vllm.utils import direct_register_custom_op, has_deep_ep, has_pplx, has_deep_gemm
 from vllm import _custom_ops as ops


@@ -184,10 +185,17 @@ class FusedMoEMethodBase(QuantizeMethodBase):
            logger.debug("%s", prepare_finalize.__class__.__name__)
            self.topk_indices_dtype = prepare_finalize.topk_indices_dtype()
            experts = self.select_gemm_impl(prepare_finalize, moe)
-            self.fused_experts = FusedMoEModularKernel(
-                prepare_finalize,
-                experts,
-            )
+
+            if has_deep_gemm():
+                self.fused_experts = FusedMoEModularKernel(
+                    prepare_finalize,
+                    experts,
+                )
+            else:
+                self.fused_experts = DeepGemmBannedFusedMoEModularKernel(
+                    prepare_finalize,
+                    experts,
+                )

    def select_gemm_impl(
        self,

--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -149,6 +149,7 @@ class FusedMoEPrepareAndFinalize(ABC):
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
+        apply_weights_and_reduce: bool = True
    ) -> None:
        """
        Perform any combine plus apply weights and perform a reduction on the
@@ -355,6 +356,168 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
          assigned to each expert when using batched experts format input.
        """
        raise NotImplementedError
+    
+
+class CustomizedFusedMoEPermuteExpertsUnpermute(ABC):
+    """
+    Abstract base for the [Permute-Experts-Unpermute] step whose 'apply' takes
+    the extra arguments passed by DeepGemmBannedFusedMoEModularKernel.
+    """
+
+    def __init__(
+        self,
+        quant_config: Optional[FusedMoEQuantConfig],
+    ):
+        if quant_config is not None:
+            self.quant_config = quant_config
+        else:
+            self.quant_config = FusedMoEQuantConfig()
+
+    @property
+    @abstractmethod
+    def activation_formats(
+            self) -> tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]:
+        """
+        A property which is a tuple of the input and output activation formats
+        for the 'apply' method.
+        """
+        raise NotImplementedError
+
+    @property
+    def quant_dtype(self) -> Optional[torch.dtype]:
+        return self.quant_config.quant_dtype
+
+    @property
+    def block_shape(self) -> Optional[list[int]]:
+        return self.quant_config.block_shape
+
+    @property
+    def per_act_token_quant(self) -> bool:
+        return self.quant_config.per_act_token_quant
+
+    @property
+    def per_out_ch_quant(self) -> bool:
+        return self.quant_config.per_out_ch_quant
+
+    # TODO (bnell): make this return a CHUNK_SIZE or None instead?
+    @abstractmethod
+    def supports_chunking(self) -> bool:
+        """
+        A flag indicating whether or not this class supports activation
+        chunking.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def supports_expert_map(self) -> bool:
+        """
+        A flag indicating whether or not this class supports expert maps
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        """
+        Compute the shapes for the temporary and final outputs of the two gemms
+        and activation in the fused expert function.  Since the gemms are
+        independent, the workspace for the first gemm can be shared with the
+        workspace for the last gemm.
+
+        Returns a tuple of:
+        - workspace13 shape tuple: must be large enough to hold the
+          result of either expert gemm.
+        - workspace2 shape tuple: must be large enough to hold the
+          result of the activation function.
+        - output shape tuple: must be exact size of the final gemm output.
+        - Workspace type: The dtype to use for the workspace tensors.
+        - Note: in order for activation chunking to work, the first dimension
+          of each tuple must be the number of tokens.
+        """
+        raise NotImplementedError
+
+    def activation(self, activation: str, output: torch.Tensor,
+                   input: torch.Tensor) -> None:
+        assert output.size(-1) * 2 == input.size(-1)
+        if activation == "silu":
+            torch.ops._C.silu_and_mul(output, input)
+        elif activation == "gelu":
+            torch.ops._C.gelu_and_mul(output, input)
+        else:
+            raise ValueError(f"Unsupported FusedMoe activation: {activation}")
+
+    def enable_chunking(self):
+        return envs.VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING and \
+          self.supports_chunking()
+
+    @abstractmethod
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        topk_weights: Optional[torch.Tensor] = None,
+        expert_num_tokens: Optional[torch.Tensor] = None,
+        use_nn_moe: Optional[bool] = False,
+        shared_output: Optional[torch.Tensor] = None,
+        routed_scaling_factor: Optional[float] = None,
+    ):
+        """
+        This function computes the intermediate result of a Mixture of Experts
+        (MoE) layer using two sets of weights, w1 and w2.
+
+        Parameters:
+        - output: destination for the unweighted, unreduced result; may be
+          None (DeepGemmBannedFusedMoEModularKernel passes None).
+        - hidden_states: the (quantized) input tensor to the MoE layer.
+        - w1: the first set of expert weights.
+        - w2: the second set of expert weights.
+        - topk_ids: a map of row to expert id.
+        - activation: name of the activation applied after the first gemm.
+        - global_num_experts: number of experts in the global expert space.
+        - expert_map: optional map from global to local expert indices for
+          the expert parallel shard.
+        - w1_scale, w2_scale: optional scales for w1 and w2.
+        - w1_zp, w2_zp: optional zero points for w1 and w2.
+        - a1q_scale: optional quantized scale to be used for a1.
+        - a2_scale: optional scale to be used for a2.
+        - workspace13: scratch tensor for gemm outputs; must be large enough
+          to hold the output of either MoE gemm (may be None, see output).
+        - workspace2: scratch tensor for the activation function (may be
+          None, see output).
+        - topk_weights: optional router weights; keyword argument supplied
+          by DeepGemmBannedFusedMoEModularKernel.forward.
+        - expert_num_tokens: optional tensor with the number of tokens
+          assigned to each expert when using batched experts format input.
+        - use_nn_moe, shared_output, routed_scaling_factor: extra optional
+          arguments supplied by the caller; their meaning is defined by the
+          concrete subclass.
+
+        Returns whatever the concrete subclass produces; this base raises.
+        """
+        raise NotImplementedError


 def _chunk_scales(scales: Optional[torch.Tensor], start: int,
@@ -596,3 +759,145 @@ class FusedMoEModularKernel(torch.nn.Module):
                                       topk_ids, apply_router_weight_on_input)

        return output
+    
+
+@final
+class DeepGemmBannedFusedMoEModularKernel(torch.nn.Module):
+    """
+    Alternative to FusedMoEModularKernel that pairs a FusedMoEPrepareAndFinalize
+    instance with a CustomizedFusedMoEPermuteExpertsUnpermute.  It allocates
+    no workspaces: None is passed for the experts' output and scratch
+    tensors, and finalize is called with apply_weights_and_reduce=False.
+    FusedMoEMethodBase selects this class when has_deep_gemm() is false.
+
+    Note: Instances of this class should only be used for a single model
+    layer due to any layer specific state that may be used by the component
+    objects.
+    """
+
+    def __init__(
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        fused_experts: CustomizedFusedMoEPermuteExpertsUnpermute,
+    ):
+        super().__init__()
+        self.prepare_finalize = prepare_finalize
+        self.fused_experts = fused_experts
+        assert prepare_finalize.activation_format == \
+            fused_experts.activation_formats[0], (
+                f"{prepare_finalize.__class__.__name__}."
+                f"{prepare_finalize.activation_format} == "
+                f"{fused_experts.__class__.__name__}."
+                f"{fused_experts.activation_formats[0]}")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        inplace: bool = False,
+        activation: str = "silu",
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        w1_scale: Optional[torch.Tensor] = None,
+        w2_scale: Optional[torch.Tensor] = None,
+        w1_zp: Optional[torch.Tensor] = None,
+        w2_zp: Optional[torch.Tensor] = None,
+        a1_scale: Optional[torch.Tensor] = None,
+        a2_scale: Optional[torch.Tensor] = None,
+        use_nn_moe: Optional[bool] = False,
+        apply_router_weight_on_input: bool = False,
+        shared_output: Optional[torch.Tensor] = None,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+        """
+        This function computes a Mixture of Experts (MoE) layer using two sets
+        of weights, w1 and w2, and top-k gating mechanism.
+
+        Parameters:
+        - hidden_states: (torch.Tensor): The input tensor to the MoE layer.
+        - w1 (torch.Tensor): The first set of expert weights.
+        - w2 (torch.Tensor): The second set of expert weights.
+        - topk_weights (torch.Tensor): The topk weights; forwarded to both
+          the experts' apply() and finalize().
+        - topk_ids (torch.Tensor): A map of row to expert id.
+        - inplace (bool): If True, hidden_states itself is used as the
+          output buffer. Defaults to False.
+        - activation (str): The activation function to apply after the first
+          MoE layer.
+        - global_num_experts (int): The total number of experts in the global
+          expert space; -1 means use w1.size(0).
+        - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
+          from the global expert space to the local expert space of the expert
+          parallel shard.
+        - w1_scale, w2_scale: Optional scales to be used for w1 and w2.
+        - w1_zp, w2_zp: Optional zero points to be used for w1 and w2.
+        - a1_scale, a2_scale: Optional scales to be used for a1 and a2.
+        - use_nn_moe, shared_output, routed_scaling_factor: forwarded
+          unchanged to the experts' apply().
+        - apply_router_weight_on_input (bool): When true, the topk weights are
+          applied directly on the inputs. This is only applicable when topk is
+          1.
+
+        Returns:
+        - torch.Tensor: The output tensor after applying the MoE layer.
+        """
+
+        a1 = hidden_states
+        output = a1 if inplace else torch.zeros_like(a1)
+
+        local_num_experts = w1.size(0)
+        if global_num_experts == -1:
+            global_num_experts = local_num_experts
+
+        (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids,
+         _expert_topk_weights) = self.prepare_finalize.prepare(
+             a1,
+             a1_scale,
+             a2_scale,
+             topk_weights,
+             topk_ids,
+             global_num_experts,
+             expert_map,
+             apply_router_weight_on_input,
+             self.fused_experts.quant_config,
+         )
+
+        # Maybe prepare gathered topk_ids and topk_weights from other EP ranks.
+        topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids
+        topk_weights = (topk_weights if _expert_topk_weights is None else
+                        _expert_topk_weights)
+
+        # No workspace is allocated: output and both workspaces are None.
+        fused_out = self.fused_experts.apply(
+                        None,
+                        a1q,
+                        w1,
+                        w2,
+                        topk_ids,
+                        topk_weights=topk_weights,
+                        activation=activation,
+                        global_num_experts=global_num_experts,
+                        expert_map=expert_map,
+                        w1_scale=w1_scale,
+                        w2_scale=w2_scale,
+                        w1_zp=w1_zp,
+                        w2_zp=w2_zp,
+                        a1q_scale=a1q_scale,
+                        a2_scale=a2_scale,
+                        workspace13=None,
+                        workspace2=None,
+                        use_nn_moe=use_nn_moe,
+                        expert_num_tokens=expert_num_tokens,
+                        shared_output=shared_output,
+                        routed_scaling_factor=routed_scaling_factor,
+                    )
+
+        # NOTE(review): apply_weights_and_reduce=False presumably relies on
+        # apply() having consumed topk_weights already - confirm in finalize.
+        self.prepare_finalize.finalize(output, fused_out, topk_weights,
+                                       topk_ids, apply_router_weight_on_input, apply_weights_and_reduce=False)
+
+        return output
--- a/vllm/model_executor/layers/fused_moe/ep_moe/ep_moe_utlis.py
+++ b/vllm/model_executor/layers/fused_moe/ep_moe/ep_moe_utlis.py
--- a/vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/ep_moe/layer.py
-import os
-import logging
-from typing import Callable, List, Optional, Tuple
-from dataclasses import dataclass
+from typing import Callable, Optional
 from collections.abc import Iterable

 import torch
-import torch.nn.functional as F
+import torch.distributed as dist

 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -18,10 +15,8 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, UnquantizedFusedMoEMethod
-from vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher import MoEAlltoAllTokenDispatcher
-from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import EpMoeConfig
+from vllm.model_executor.layers.fused_moe.mori_moe.ep_moe_utlis import EpMoeConfig
 from vllm.utils import direct_register_custom_op
-import torch.distributed as dist

 try:
    import mori
@@ -35,8 +30,8 @@ logger = init_logger(__name__)
 _MORI_OP = None


-@CustomOp.register("unquantized_ep_moe")
-class UnquantizedEPGroupedGemmMethod(UnquantizedFusedMoEMethod):
+@CustomOp.register("unquantized_mori_moe")
+class UnquantizedMoriMoeMethod(UnquantizedFusedMoEMethod):
    """MoE method without quantization."""

    def __init__(self, moe: FusedMoEConfig):
@@ -44,9 +39,9 @@ class UnquantizedEPGroupedGemmMethod(UnquantizedFusedMoEMethod):
        self.topk_indices_dtype = None
        self.moe = moe

-        self.rocm_aiter_moe_enabled = False  # is_rocm_aiter_moe_enabled()
+        self.rocm_aiter_moe_enabled = False

-    def apply_ep(
+    def apply_mori_ep(
            self,
            layer: torch.nn.Module,
            hidden_states: torch.Tensor,
@@ -162,7 +157,7 @@ class UnquantizedEPGroupedGemmMethod(UnquantizedFusedMoEMethod):
        forward_native = forward_cuda


-class EPMoE(FusedMoE):
+class MoriMoE(FusedMoE):
    """
    dp+ep MoE Expert Parallel Impl

@@ -194,7 +189,6 @@ class EPMoE(FusedMoE):
            enable_eplb: bool = False,
            num_redundant_experts: int = 0,
            moe_permute_fusion: bool = False,
-            moe_shared_expert_overlap: bool = False
    ):
        super().__init__(num_experts, top_k, hidden_size,
                         intermediate_size, params_dtype,
@@ -215,7 +209,6 @@ class EPMoE(FusedMoE):
            moe_router_topk=self.top_k,
            # TODO: support fusion permute
            moe_permute_fusion=moe_permute_fusion,
-            moe_shared_expert_overlap=moe_shared_expert_overlap,
            ep_size=self.ep_size,
            num_moe_experts=self.global_num_experts,
            routed_scaling_factor=self.routed_scaling_factor,
@@ -228,23 +221,15 @@ class EPMoE(FusedMoE):
        self.local_expert_indices = [
            local_expert_indices_offset + i for i in range(self.local_num_experts)
        ]
-
-        self.use_shared_expert = False
-        self.token_dispatcher = MoEAlltoAllTokenDispatcher(
-            self.local_num_experts, self.local_expert_indices,
-            config=self.ep_moe_config, layer_name=f"{self.layer_name}.token_dispatcher",
-        )
-
-        self.shared_expert_overlap = moe_shared_expert_overlap
+        
        self.shared_experts = None

        self.scales = None
        self.use_int8_dispatch = True

        vllm_config = get_current_vllm_config()
-        self.max_num_inp_token_per_rank = vllm_config.scheduler_config.max_num_seqs
+        self.max_num_inp_token_per_rank = 1024 #vllm_config.scheduler_config.max_num_seqs
        self.mori_op = self.get_mori_op()
-        self.first = True

    def get_mori_op(self):
        global _MORI_OP
@@ -253,10 +238,6 @@ class EPMoE(FusedMoE):
            assert world_group is not None
            torch._C._distributed_c10d._register_process_group("mori_ep", get_ep_group().device_group)
            mori.shmem.shmem_torch_process_group_init("mori_ep")
-            # world_group = torch.distributed.group.WORLD
-            # assert world_group is not None
-            # torch._C._distributed_c10d._register_process_group("default", world_group)
-            # mori.shmem.shmem_torch_process_group_init("default")

            vllm_config = get_current_vllm_config()
            multi_node = self.ep_size / 8 > 1
@@ -278,8 +259,7 @@ class EPMoE(FusedMoE):
                num_experts_per_token=self.top_k,
                max_token_type_size=2,
                block_num=80,
-                warp_num_per_block=16,
-                # kernel_type=mori.ops.EpDispatchCombineKernelType.InterNode
+                warp_num_per_block=4,
                kernel_type=mori.ops.EpDispatchCombineKernelType.InterNode if multi_node else \
                    mori.ops.EpDispatchCombineKernelType.IntraNode
            )
@@ -291,14 +271,11 @@ class EPMoE(FusedMoE):
        if self.shared_experts is None:
            self.shared_experts = shared_experts

-            if self.shared_expert_overlap:
-                self.token_dispatcher.set_shared_experts(self.shared_experts)
-
    def create_quant_method(self, moe, quant_config, prefix):
        # Note: get_quant_method will look at the layer's local_num_experts
        # for heuristic purposes, so it must be initialized first.
        quant_method: Optional[QuantizeMethodBase] = None
-        quant_method = (UnquantizedEPGroupedGemmMethod(moe) if quant_config is None
+        quant_method = (UnquantizedMoriMoeMethod(moe) if quant_config is None
                        else quant_config.get_quant_method(self, prefix))

        assert quant_method is not None
@@ -311,7 +288,7 @@ class EPMoE(FusedMoE):

    def forward(self, hidden_states: torch.Tensor,
                router_logits: torch.Tensor):
-        return torch.ops.vllm.ep_moe_forward(hidden_states, router_logits,
+        return torch.ops.vllm.mori_moe_forward(hidden_states, router_logits,
                                             self.layer_name)

    def get_expert_weights(self) -> Iterable[torch.Tensor]:
@@ -351,7 +328,7 @@ class EPMoE(FusedMoE):
            routed_scaling_factor=self.routed_scaling_factor,
            use_fused_gate=self.use_fused_gate)

-        if not self.ep_moe_config.moe_shared_expert_overlap and self.shared_experts is not None:
+        if self.shared_experts is not None:
            shared_output = self.shared_experts(hidden_states)

        if self.use_int8_dispatch:
@@ -378,33 +355,10 @@ class EPMoE(FusedMoE):
            hidden_states,
            topk_weights,
            scales,
-            topk_ids,
+            topk_ids
        )
-        # self.sync()

-        # expect_m = topk_ids.shape[0] * self.ep_size
-        # dispatch_output_clip = dispatch_output[:expect_m]
-        # dispatch_weights_clip  = dispatch_weights[:expect_m]
-        # dispatch_indices_clip  = dispatch_indices[:expect_m]
-        # dispatch_scales_clip  = dispatch_scales[:expect_m]
-        # expert_output = self.quant_method.apply_ep(
-        #     layer=self,
-        #     x=dispatch_output_clip,
-        #     topk_weights=dispatch_weights_clip,
-        #     topk_ids=dispatch_indices_clip,
-        #     global_num_experts=self.global_num_experts,
-        #     expert_map=self.expert_map,
-        #     activation=self.activation,
-        #     apply_router_weight_on_input=self.apply_router_weight_on_input,
-        #     use_nn_moe=self.use_nn_moe,
-        #     num_local_tokens=dispatch_recv_num_token,
-        #     config_select_bs=hidden_states.shape[0],
-        #     scales=dispatch_scales_clip if self.use_int8_dispatch else None
-        #     #routed_scaling_factor=self.routed_scaling_factor,
-        # )
-
-
-        expert_output = self.quant_method.apply_ep(
+        expert_output = self.quant_method.apply_mori_ep(
            layer=self,
            x=dispatch_output,
            topk_weights=dispatch_weights,
@@ -415,10 +369,10 @@ class EPMoE(FusedMoE):
            apply_router_weight_on_input=self.apply_router_weight_on_input,
            use_nn_moe=self.use_nn_moe,
            num_local_tokens=dispatch_recv_num_token,
-            config_select_bs=hidden_states.shape[0],
+            expect_m=hidden_states.shape[0],
            scales=dispatch_scales if self.use_int8_dispatch else None
-            # routed_scaling_factor=self.routed_scaling_factor,
        )
+        
        # self.sync()

        combine_output, _ = self.mori_op.combine(expert_output, dispatch_weights, topk_ids)
@@ -426,13 +380,7 @@ class EPMoE(FusedMoE):

        # self.sync()

-        if not self.ep_moe_config.moe_shared_expert_overlap and self.shared_experts is not None:
-            # if shared_expert_overlap is True, the expert calculation happens in
-            # the token_dispatcher to overlap communications and computations
-            # shared_output = (
-            #         self.maybe_all_reduce_tensor_model_parallel(
-            #             shared_output))
-
+        if self.shared_experts is not None:
            if hidden_states.dtype != torch.float16:
                final_hidden_states = final_hidden_states + shared_output
            else:
@@ -444,7 +392,7 @@ class EPMoE(FusedMoE):
        return final_hidden_states


-def ep_moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+def mori_moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
                   layer_name: str) -> torch.Tensor:
    forward_context: ForwardContext = get_forward_context()
    self = forward_context.no_compile_layers[layer_name]
@@ -453,16 +401,16 @@ def ep_moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
    return self.forward_impl(hidden_states, router_logits)


-def ep_moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
+def mori_moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor,
                        layer_name: str) -> torch.Tensor:
    return torch.empty_like(hidden_states)


 direct_register_custom_op(
-    op_name="ep_moe_forward",
-    op_func=ep_moe_forward,
+    op_name="mori_moe_forward",
+    op_func=mori_moe_forward,
    mutates_args=["hidden_states", "router_logits"],
-    fake_impl=ep_moe_forward_fake,
+    fake_impl=mori_moe_forward_fake,
    dispatch_key=current_platform.dispatch_key,
    tags=(torch.Tag.needs_fixed_stride_order,),
 )
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -207,6 +207,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
+        apply_weights_and_reduce: bool = True
    ) -> None:
        # This argument is optional
        # There's not much point setting this unless it is != topk_ids.size(0)

--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -61,6 +61,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
+        apply_weights_and_reduce: bool = True
    ) -> None:
        _moe_unpermute_and_reduce(output, fused_expert_output, None,
                                  topk_weights, apply_router_weight_on_input)
--- a/vllm/model_executor/layers/fused_moe/triton_group_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_group_gemm_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+    DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape)
+
+
+class TritonOrGroupGemmExperts(mk.CustomizedFusedMoEPermuteExpertsUnpermute):
+    """
+    Experts implementation that hands the whole computation to a
+    caller-supplied `fused_experts` callable.
+    """
+
+    def __init__(
+        self,
+        use_fp8_w8a8: bool = False,
+        use_int8_w8a8: bool = False,
+        use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
+        per_act_token_quant: bool = False,
+        block_shape: Optional[list[int]] = None,
+        allow_group_gemm: bool = False,
+        fused_experts = None
+    ):
+        """
+        Build the quant config from the quantization flags and remember the
+        callable that `apply` will invoke. `allow_group_gemm` is accepted but
+        not read by this class.
+        """
+        quant_config = FusedMoEQuantConfig.make(
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape,
+        )
+        super().__init__(quant_config)
+        self.fused_experts = fused_experts
+
+    @property
+    def activation_formats(
+        self
+    ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        """Input and output activations both use the Standard format."""
+        standard_format = mk.FusedMoEActivationFormat.Standard
+        return (standard_format, standard_format)
+
+    def supports_chunking(self) -> bool:
+        """Always reports True."""
+        return True
+
+    def supports_expert_map(self) -> bool:
+        """Always reports True."""
+        return True
+
+    def workspace_shapes(
+        self,
+        a: torch.Tensor,
+        aq: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
+        """Not provided by this class; always raises NotImplementedError."""
+        raise NotImplementedError
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: str,
+        global_num_experts: int,
+        expert_map: Optional[torch.Tensor],
+        w1_scale: Optional[torch.Tensor],
+        w2_scale: Optional[torch.Tensor],
+        w1_zp: Optional[torch.Tensor],
+        w2_zp: Optional[torch.Tensor],
+        a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        topk_weights: Optional[torch.Tensor] = None,
+        expert_num_tokens: Optional[torch.Tensor] = None,
+        use_nn_moe: Optional[bool] = False,
+        shared_output: Optional[torch.Tensor] = None,
+        routed_scaling_factor: Optional[float] = None,
+    ):
+        """
+        Call the stored `fused_experts` callable and return its result.
+
+        `output`, `w1_zp`, `w2_zp`, `workspace13` and `workspace2` are not
+        forwarded, and `apply_router_weight_on_input` is always passed as
+        False. `hidden_states` is passed as `x` and `a1q_scale` as
+        `a1_scale`.
+        """
+        experts_fn = self.fused_experts
+        assert experts_fn is not None
+
+        moe_result = experts_fn(
+            x=hidden_states,
+            w1=w1,
+            w2=w2,
+            topk_ids=topk_ids,
+            topk_weights=topk_weights,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            apply_router_weight_on_input=False,
+            activation=activation,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1q_scale,
+            a2_scale=a2_scale,
+            expert_num_tokens=expert_num_tokens,
+            use_nn_moe=use_nn_moe,
+            shared_output=shared_output,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+        return moe_result
--- a/vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
@@ -4,10 +4,12 @@ import os
 import torch
 from torch.nn.parameter import Parameter

+import vllm.envs as envs
 from vllm import _custom_ops as ops
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import get_tensor_model_parallel_world_size, get_dp_group
 from vllm.logger import init_logger
+from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.linear import (LinearBase,LinearMethodBase)
 from vllm.model_executor.layers.quantization.base_config import (QuantizationConfig,
@@ -125,6 +127,10 @@ class SlimQuantW4A8Int8MarlinConfig(QuantizationConfig):

    def get_scaled_act_names(self) -> List[str]:
        return []
+    
+    @property
+    def weight_block_size(self):
+        """Return the fixed block size reported by this quantization config.
+
+        The value is consumed elsewhere in this file as the ``block_shape``
+        argument of the experts implementation and in debug logging. A new
+        list is built on every access, so callers may mutate the result
+        without affecting later reads.
+        """
+        return [128, 128]


 class SlimQuantW4A8Int8MarlinMoEMethod:
@@ -154,6 +160,15 @@ class SlimQuantW4A8Int8MarlinMoEMethod:

    def __init__(self, quant_config):
        self.quant_config = quant_config
+        self.fused_experts = self.w4a8_marlin_forward
+        vllm_config = get_current_vllm_config()
+        parallel_config = vllm_config.parallel_config
+        self.use_deepep = parallel_config.enable_expert_parallel and \
+            (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" or \
+             envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
+        
+        self.enable_moe_group_gemm = parallel_config.enable_expert_parallel and envs.VLLM_ENABLE_MOE_GROUP_GEMM
+

    def create_weights(
        self,
@@ -218,7 +233,55 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
        layer.w13_weight = Parameter(w4a8_weight_repack_impl(layer.w13_weight), requires_grad=False)
        layer.w2_weight = Parameter(w4a8_weight_repack_impl(layer.w2_weight), requires_grad=False)

-    def apply_ep(  #dp+ep
+    def w4a8_marlin_forward(
+        self,
+        x: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+        w1_scale: Optional[torch.Tensor] = None,
+        w2_scale: Optional[torch.Tensor] = None,
+        a1_scale: Optional[torch.Tensor] = None,
+        a2_scale: Optional[torch.Tensor] = None,
+        expert_num_tokens: Optional[torch.Tensor] = None,
+        use_nn_moe: Optional[bool] = False,
+        routed_scaling_factor: Optional[float] = None,
+        shared_output: Optional[torch.Tensor] = None,
+        **_
+    ):
+        """Run the W4A8 Marlin fused-experts kernel.
+
+        Fetches the workspace and global reduce buffer for ``x.device`` from
+        ``MarlinMoeWorkspace`` and forwards everything to
+        ``fused_experts_impl_w4a8_marlin`` with ``inplace=True``,
+        ``use_int4_w4a8=True`` and ``per_channel_quant=True``.
+
+        Args:
+            x: Input activations passed as the first positional argument of
+                the kernel.
+            w1, w2: Expert weights passed positionally to the kernel.
+            topk_ids, topk_weights: Routing results forwarded unchanged.
+            global_num_experts, expert_map, apply_router_weight_on_input,
+            activation, w1_scale, w2_scale, a1_scale, a2_scale, use_nn_moe,
+            routed_scaling_factor, shared_output: Forwarded unchanged as
+                keyword arguments of the same name.
+            expert_num_tokens: Accepted for signature compatibility with
+                other callers; not forwarded to the kernel.
+            **_: Any extra keyword arguments are accepted and ignored.
+
+        Returns:
+            Whatever ``fused_experts_impl_w4a8_marlin`` returns.
+            NOTE(review): with ``inplace=True`` this is presumably ``x``
+            itself -- confirm against the kernel implementation.
+
+        Raises:
+            NotImplementedError: If ``self.enable_moe_group_gemm`` is truthy.
+                The grouped-GEMM path has no implementation yet; previously
+                this case silently returned ``None``.
+        """
+        if self.enable_moe_group_gemm:
+            # Fail loudly instead of handing ``None`` back as the layer
+            # output, which would only surface later as an unrelated error.
+            raise NotImplementedError(
+                "W4A8 Marlin MoE does not implement the group GEMM path yet; "
+                "disable VLLM_ENABLE_MOE_GROUP_GEMM or expert parallelism.")
+
+        workspace, global_reduce_buffer = MarlinMoeWorkspace(
+            x.device).get_buffers()
+        return fused_experts_impl_w4a8_marlin(
+            x,
+            w1,
+            w2,
+            topk_ids=topk_ids,
+            topk_weights=topk_weights,
+            workspace=workspace,
+            global_reduce_buffer=global_reduce_buffer,
+            inplace=True,
+            use_int4_w4a8=True,
+            per_channel_quant=True,
+            activation=activation,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            global_num_experts=global_num_experts,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            use_nn_moe=use_nn_moe,
+            shared_output=shared_output,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+    def apply_mori_ep(
                self,
                layer: torch.nn.Module,
                x: torch.Tensor,
@@ -230,7 +293,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
                activation: str = "silu",
                use_nn_moe: Optional[bool] = False,
                num_local_tokens: Optional[torch.Tensor] = None,
-                config_select_bs: Optional[int] = None,
+                expect_m: Optional[int] = None,
                routed_scaling_factor: Optional[float] = None,
                scales: Optional[torch.Tensor] = None,
                **_
@@ -253,12 +316,11 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
                global_num_experts=global_num_experts,
                w1_scale=(layer.w13_weight_scale),
                w2_scale=(layer.w2_weight_scale),
-                a1_scale=layer.w13_input_scale,
+                a1_scale=scales,
                a2_scale=layer.w2_input_scale,
                use_nn_moe=use_nn_moe,
                num_local_tokens=num_local_tokens,
-                config_select_bs=config_select_bs,
-                q_scales=scales
+                expect_m=expect_m,
            )

    def apply(
@@ -301,29 +363,25 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
+            indices_type=torch.int64 if self.use_deepep else None,
            routed_scaling_factor=routed_scaling_factor,
            use_fused_gate=use_fused_gate
        )
-        workspace, global_reduce_buffer = MarlinMoeWorkspace(x.device).get_buffers()
-        return fused_experts_impl_w4a8_marlin(
+        return self.fused_experts(
            x,
            layer.w13_weight,
            layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
-            workspace=workspace,
-            global_reduce_buffer=global_reduce_buffer,
            inplace=True,
-            use_int4_w4a8=True,
-            per_channel_quant=True,
            activation=activation,
-            expert_map=expert_map,
-            apply_router_weight_on_input=apply_router_weight_on_input,
            global_num_experts=global_num_experts,
+            expert_map=expert_map,
            w1_scale=(layer.w13_weight_scale),
            w2_scale=(layer.w2_weight_scale),
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
+            apply_router_weight_on_input=apply_router_weight_on_input,
            use_nn_moe=use_nn_moe,
            shared_output=shared_output,
            routed_scaling_factor=routed_scaling_factor,
@@ -335,10 +393,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
        moe: FusedMoEConfig,
    ) -> FusedMoEPermuteExpertsUnpermute:
        from vllm.model_executor.layers.fused_moe import (
-            BatchedGroupedGemmExperts, GroupedGemmGemmExperts)
-
-        assert not self.rocm_aiter_moe_enabled, (
-            "ROCm AITER are not supported with all2all yet.")
+            TritonOrGroupGemmExperts)

        if (prepare_finalize.activation_format ==
                FusedMoEActivationFormat.BatchedExperts):
@@ -350,21 +405,16 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
                "max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
                self.__class__.__name__, max_num_tokens_per_rank,
                self.quant_config.weight_block_size, False)
-            return BatchedGroupedGemmExperts(
-                max_num_tokens=max_num_tokens_per_rank,
-                num_dispatchers=prepare_finalize.num_dispatchers(),
-                use_fp8_w8a8=False,
-                block_shape=self.quant_config.weight_block_size,
-                per_act_token_quant=True,
-                allow_deep_gemm=False,
-            )
+            return None
        else:
            logger.debug(
-                "GroupedGemmGemmExperts(%s): block_size=%s, per_act_token=%s",
+                "TritonOrGroupGemmExperts(%s): block_size=%s, per_act_token=%s",
                self.__class__.__name__, self.quant_config.weight_block_size,
                False)
-            return GroupedGemmGemmExperts(
+            
+            return TritonOrGroupGemmExperts(
                use_fp8_w8a8=False,
                block_shape=self.quant_config.weight_block_size,
-                allow_deep_gemm=False,
+                allow_group_gemm=False,
+                fused_experts=self.w4a8_marlin_forward
            )
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -178,7 +178,7 @@ class DeepSeekMTP(nn.Module, SupportsPP):

        parallel_config = vllm_config.parallel_config
        dp_size = get_dp_group().world_size
-        self.use_mori_ep = envs.VLLM_USE_MORI_EP and dp_size > 1 and parallel_config.enable_expert_parallel
+        self.use_mori_ep = envs.VLLM_ALL2ALL_BACKEND == 'mori' and dp_size > 1 and parallel_config.enable_expert_parallel


    def forward(