Merge branch 'v0.9.2-dev-ds' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev-ds

2eb579dd · zhuwenwen · 7b8a9e18 · 0e92caa0 · 2eb579dd · 2eb579dd
Commit 2eb579dd authored Oct 12, 2025 by zhuwenwen
5 changed files
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -81,7 +81,7 @@ class Attention(nn.Module):
            calculate_kv_scales = cache_config.calculate_kv_scales
        else:
            kv_cache_dtype = "auto"
-            block_size = 64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16
+            block_size = 64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16
            is_attention_free = False
            calculate_kv_scales = False
        if num_kv_heads is None:
@@ -312,7 +312,7 @@ class MultiHeadAttention(nn.Module):
        attn_backend = get_attn_backend(head_size,
                                        dtype,
                                        kv_cache_dtype=None,
-                                        block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16,
+                                        block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16,
                                        is_attention_free=False)
        backend = backend_name_to_enum(attn_backend.get_name())
        if current_platform.is_rocm():

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
    """Configuration for the KV cache."""

-    block_size: BlockSize = 64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16  # type: ignore
+    block_size: BlockSize = 64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16  # type: ignore
    """Size of a contiguous cache block in number of tokens. This is ignored on
    neuron devices and set to `--max-model-len`. On CUDA devices, only block
    sizes up to 32 are supported. On HPU devices, block size defaults to 128.

--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -949,6 +949,7 @@ def init_distributed_environment(
            backend = "gloo"
        # this backend is used for WORLD

+        parallel_config = config.parallel_config
        data_parallel_size = parallel_config.data_parallel_size
        use_mori_ep = envs.VLLM_USE_MORI_EP and data_parallel_size > 1 and parallel_config.enable_expert_parallel
        if use_mori_ep:

--- a/vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/ep_moe/layer.py
@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
 from vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher import MoEAlltoAllTokenDispatcher
 from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import EpMoeConfig
 from vllm.utils import direct_register_custom_op
-import mori
 import torch.distributed as dist

-from lmslim.layers.gemm.int8_utils import (
+try:
+    import mori
+    from lmslim.layers.gemm.int8_utils import (
        per_token_quant_int8)
+except ImportError:
+    is_mori_available = False


 logger = init_logger(__name__)
@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
        self.scales = None
        self.use_int8_dispatch = True

+        vllm_config = get_current_vllm_config()
+        self.max_num_inp_token_per_rank = vllm_config.scheduler_config.max_num_seqs
        self.mori_op = self.get_mori_op()
        self.first = True
        
@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
                hidden_dim=self.hidden_size,
                scale_dim=1 if self.use_int8_dispatch else 0,
                scale_type_size=mori_scale_type_size,
-                max_num_inp_token_per_rank=512,
+                max_num_inp_token_per_rank=self.max_num_inp_token_per_rank,
                num_experts_per_rank=self.local_num_experts,
                num_experts_per_token=self.top_k,
                max_token_type_size=2,
@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
        )
        #self.sync()
        
-        expect_m = hidden_states.shape[0] * self.ep_size
-        dispatch_output_clip = dispatch_output[:expect_m]
-        dispatch_weights_clip  = dispatch_weights[:expect_m]
-        dispatch_indices_clip  = dispatch_indices[:expect_m]
-        dispatch_scales_clip  = dispatch_scales[:expect_m]
+        # expect_m = topk_ids.shape[0] * self.ep_size
+        # dispatch_output_clip = dispatch_output[:expect_m]
+        # dispatch_weights_clip  = dispatch_weights[:expect_m]
+        # dispatch_indices_clip  = dispatch_indices[:expect_m]
+        # dispatch_scales_clip  = dispatch_scales[:expect_m]
+        # expert_output = self.quant_method.apply_ep(
+        #     layer=self,
+        #     x=dispatch_output_clip,
+        #     topk_weights=dispatch_weights_clip,
+        #     topk_ids=dispatch_indices_clip,
+        #     global_num_experts=self.global_num_experts,
+        #     expert_map=self.expert_map,
+        #     activation=self.activation,
+        #     apply_router_weight_on_input=self.apply_router_weight_on_input,
+        #     use_nn_moe=self.use_nn_moe,
+        #     num_local_tokens=dispatch_recv_num_token,
+        #     config_select_bs=hidden_states.shape[0],
+        #     scales=dispatch_scales_clip if self.use_int8_dispatch else None
+        #     #routed_scaling_factor=self.routed_scaling_factor,
+        # )
+
+
        expert_output = self.quant_method.apply_ep(
            layer=self,
-            x=dispatch_output_clip,
-            topk_weights=dispatch_weights_clip,
-            topk_ids=dispatch_indices_clip,
+            x=dispatch_output,
+            topk_weights=dispatch_weights,
+            topk_ids=dispatch_indices,
            global_num_experts=self.global_num_experts,
            expert_map=self.expert_map,
            activation=self.activation,
@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
            use_nn_moe=self.use_nn_moe,
            num_local_tokens=dispatch_recv_num_token,
            config_select_bs=hidden_states.shape[0],
-            scales=dispatch_scales_clip if self.use_int8_dispatch else None
+            scales=dispatch_scales if self.use_int8_dispatch else None
            #routed_scaling_factor=self.routed_scaling_factor,
        )
-
-        # expert_output = self.quant_method.apply_ep(
-        #     layer=self,
-        #     x=dispatch_output,
-        #     topk_weights=dispatch_weights,
-        #     topk_ids=dispatch_indices,
-        #     global_num_experts=self.global_num_experts,
-        #     expert_map=self.expert_map,
-        #     activation=self.activation,
-        #     apply_router_weight_on_input=self.apply_router_weight_on_input,
-        #     use_nn_moe=self.use_nn_moe,
-        #     num_local_tokens=dispatch_recv_num_token,
-        #     config_select_bs=hidden_states.shape[0]*2,
-        #     scales=dispatch_scales if self.use_int8_dispatch else None
-        #     #routed_scaling_factor=self.routed_scaling_factor,
-        # )
        #self.sync()

        combine_output, _ = self.mori_op.combine(expert_output, dispatch_weights, topk_ids)

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -242,11 +242,6 @@ class DeepseekV2MoE(nn.Module):
                    # See DeepseekV2DecoderLayer for more details.
                    final_hidden_states = self.experts(hidden_states=hidden_states,
                                                    router_logits=router_logits)
-        else:        
-            final_hidden_states = self.experts(hidden_states=hidden_states,
-                                                router_logits=router_logits)
-
-        if not self.use_mori_ep:
                if shared_output is not None:
                    if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
                        final_hidden_states = final_hidden_states + shared_output
@@ -255,7 +250,11 @@ class DeepseekV2MoE(nn.Module):
                        # See DeepseekV2DecoderLayer for more details.
                        final_hidden_states = final_hidden_states + shared_output \
                            * (1. / self.routed_scaling_factor)
+        else:        
+            final_hidden_states = self.experts(hidden_states=hidden_states,
+                                                router_logits=router_logits)

+        if not self.use_mori_ep:
            if self.tp_size > 1:
                if envs.VLLM_ENABLE_TBO:
                    final_hidden_states = self.tbo_all_reduce(final_hidden_states)