Merge branch 'v0.9.2-dev-ds-wm-1215' into 'v0.9.2-dev-ds'

优化高吞吐模式num_sms See merge request dcutoolkit/deeplearing/vllm!297

Merge branch 'v0.9.2-dev-ds-wm-1215' into 'v0.9.2-dev-ds'
优化高吞吐模式num_sms See merge request dcutoolkit/deeplearing/vllm!297
ab1acdce · zhuwenwen · b2db7ca2 · 10400c58 · ab1acdce · ab1acdce
Commit ab1acdce authored Dec 16, 2025 by zhuwenwen
4 changed files
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -173,6 +173,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
        if self.internode:
            num_rdma_bytes = int(1e9/2) #1024 * 1024 * 1024
            num_qps_per_rank = 30 #self.num_sms // 2
+            self.num_sms = 30
            # import deep_ep
            # num_nvl_bytes, num_rdma_bytes = 0, 0
@@ -184,6 +185,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
        else:
            num_rdma_bytes = 0
            num_qps_per_rank = 1
+            self.num_sms = 60
        assert num_rdma_bytes is not None
        assert num_qps_per_rank is not None

--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -12,7 +12,7 @@ import torch
 import vllm.envs as envs
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
-from vllm.utils import cdiv
+from vllm.utils import cdiv, async_tensor_h2d
 #
 # This file defines a set of base classes used to make MoE kernels more modular.
@@ -97,6 +97,8 @@ class FusedMoEActivationFormat(Enum):
    BatchedExperts = "batched_experts",
 @dataclass
 class ExpertTokensMetadata:
    """
@@ -110,11 +112,16 @@ class ExpertTokensMetadata:
    def make_from_list(
        expert_num_tokens_list: list[int], device: str
    ) -> "ExpertTokensMetadata":
+        # expert_num_tokens_cpu = torch.tensor(
+        #     expert_num_tokens_list, device="cpu", dtype=torch.int32
+        # )
        expert_num_tokens_cpu = torch.tensor(
-            expert_num_tokens_list, device="cpu", dtype=torch.int32
+            expert_num_tokens_list, device="cpu", dtype=torch.int32, pin_memory=True
        )
+        expert_num_tokens = expert_num_tokens_cpu.to(device=device, non_blocking=True)
        return ExpertTokensMetadata(
-            expert_num_tokens=expert_num_tokens_cpu.to(device, non_blocking=True),
+            expert_num_tokens=expert_num_tokens,
            expert_num_tokens_cpu=expert_num_tokens_cpu,
        )

--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -547,41 +547,6 @@ def _fwd_kernel_ep_scatter_2(
    SCALE_HIDDEN_SIZE: tl.constexpr,
    SCALE_HIDDEN_SIZE_PAD: tl.constexpr,
 ):
-    # start_token_id = tl.program_id(0)
-    # grid_num = tl.num_programs(0)
-    # offset_in = tl.arange(0, HIDDEN_SIZE_PAD)
-    # mask = offset_in < HIDDEN_SIZE
-    # offset_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
-    # mask_s = offset_in_s < SCALE_HIDDEN_SIZE
-    # for token_id in range(start_token_id, total_token_num, grid_num):
-    #     to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
-    #     to_copy_s = tl.load(
-    #         recv_x_scale + token_id * recv_x_scale_stride0 + offset_in_s, mask=mask_s
-    #     )
-    #     for topk_index in tl.range(0, topk_num, 1, num_stages=4):
-    #         expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 + topk_index)
-    #         if HAS_EXPERT_MAP:
-    #             expert_id = apply_expert_map(expert_id, expert_map)
-    #         if expert_id >= 0:
-    #             dest_token_index = tl.atomic_add(expert_start_loc + expert_id, 1)
-    #             tl.store(
-    #                 output_index + token_id * output_index_stride0 + topk_index,
-    #                 dest_token_index,
-    #             )
-    #             output_tensor_ptr = (
-    #                 output_tensor + dest_token_index * output_tensor_stride0
-    #             )
-    #             output_tensor_scale_ptr = (
-    #                 output_tensor_scale + dest_token_index * output_tensor_scale_stride0
-    #             )
-    #             tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
-    #             tl.store(output_tensor_scale_ptr + offset_in_s, to_copy_s, mask=mask_s)
    start_token_id = tl.program_id(0)
    grid_num = tl.num_programs(0)
@@ -720,43 +685,6 @@ def _fwd_kernel_ep_gather(
    HAS_EXPERT_MAP: tl.constexpr,
    BLOCK_D: tl.constexpr,
 ):
-    # cur_block = tl.program_id(0)
-    # start_cur_token = tl.program_id(1)
-    # grid_num = tl.num_programs(1)
-    # for cur_token in range(start_cur_token, total_token_num, grid_num):
-    #     off_d = tl.arange(0, BLOCK_D)
-    #     accumulator = tl.zeros([BLOCK_D], dtype=tl.float32)
-    #     for topk_index in range(0, topk_num):
-    #         expert_id = tl.load(
-    #             recv_topk_ids + cur_token * recv_topk_ids_stride0 + topk_index
-    #         )
-    #         if HAS_EXPERT_MAP:
-    #             expert_id = apply_expert_map(expert_id, expert_map)
-    #         if expert_id >= 0:
-    #             source_token_index = tl.load(
-    #                 input_index + cur_token * input_index_stride0 + topk_index
-    #             )
-    #             acc_weight = tl.load(
-    #                 recv_topk_weight + cur_token * recv_topk_weight_stride0 + topk_index
-    #             )
-    #             tmp = tl.load(
-    #                 input_tensor
-    #                 + source_token_index * input_tensor_stride0
-    #                 + cur_block * BLOCK_D
-    #                 + off_d
-    #             )
-    #             accumulator += tmp.to(tl.float32) * acc_weight
-    #     tl.store(
-    #         output_tensor
-    #         + cur_token * output_tensor_stride0
-    #         + cur_block * BLOCK_D
-    #         + off_d,
-    #         accumulator.to(output_tensor.dtype.element_ty),
-    #     )
    cur_block_int32 = tl.program_id(0)
    cur_block = cur_block_int32.to(tl.int64)
@@ -856,6 +784,7 @@ def deepgemm_moe_permute(
    expert_num_tokens: Optional[torch.Tensor] = None,
    expert_num_tokens_cpu: Optional[torch.Tensor] = None,
    aq_out: torch.Tensor | None = None,
+    M_sum: int | None = None,
 ):
    assert aq.ndim == 2
    assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids"
@@ -864,13 +793,14 @@ def deepgemm_moe_permute(
    block_m = block_shape[0]
-    M_sum = compute_aligned_M(
+    if M_sum is None:
-        M=topk_ids.size(0),
+        M_sum = compute_aligned_M(
-        num_topk=topk_ids.size(1),
+            M=topk_ids.size(0),
-        local_num_experts=local_num_experts,
+            num_topk=topk_ids.size(1),
-        alignment=block_m,
+            local_num_experts=local_num_experts,
-        expert_num_tokens_cpu=expert_num_tokens_cpu,
+            alignment=block_m,
-    )
+            expert_num_tokens_cpu=expert_num_tokens_cpu,
+        )
    expert_start_loc = torch.empty(
        (local_num_experts), device=device, dtype=torch.int32

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
@@ -346,6 +346,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
                expert_num_tokens=expert_num_tokens,
                expert_num_tokens_cpu=expert_num_tokens_cpu,
                aq_out=a1q_perm,
+                M_sum=M_sum
            )
            # if expert_map is not None: