fix dense run error

5b0a1c93 · zhuwenwen · 8f3d67b5 · 5b0a1c93 · 5b0a1c93 · 5b0a1c93
Commit 5b0a1c93 authored Aug 19, 2025 by zhuwenwen
5 changed files
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -28,6 +28,42 @@ batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL
 batchsize_forward_time: defaultdict = defaultdict(list)


+class BatchDescriptor(NamedTuple):
+    """
+    Key describing a padded batch for cudagraph dispatching.
+
+    Keep the number of fields minimal: just enough to properly and uniquely
+    describe the padded batch for cudagraph.
+    """
+    num_tokens: int
+    # False may also be used for a uniform decode batch, to dispatch it to the
+    # cudagraph supporting non-uniform batches.
+    uniform_decode: bool = False
+
+    @property
+    def non_uniform(self) -> "BatchDescriptor":
+        """
+        Return a copy of this descriptor with `uniform_decode` set to False.
+        """
+        return BatchDescriptor(num_tokens=self.num_tokens,
+                               uniform_decode=False)
+
+
+def _compute_chunked_local_num_tokens(num_tokens_across_dp_cpu: list[int],
+                                      max_num_tokens: int,
+                                      chunk_idx: int) -> list[int]:
+    """Return one chunk size per entry of `num_tokens_across_dp_cpu`.
+
+    Each size is what remains of that entry after `chunk_idx` chunks of
+    `max_num_tokens`, capped at `max_num_tokens`; a non-positive result is
+    replaced by 1 to ensure lockstep even if that entry is already done.
+    """
+    offset = max_num_tokens * chunk_idx
+    sizes = (min(max_num_tokens, total - offset)
+             for total in num_tokens_across_dp_cpu)
+    return [1 if size <= 0 else size for size in sizes]
+
+
 @dataclass
 class DPMetadata:
    max_tokens_across_dp_cpu: torch.Tensor

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1313,10 +1313,10 @@ def inplace_fused_experts(
    fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True,
                       activation, is_act_and_mul,
                       apply_router_weight_on_input, use_fp8_w8a8,
-                       use_int8_w8a8, use_int8_w8a16, use_int4_w4a16,
+                       use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_int4_w4a8,
                       use_mxfp4_w4a4, per_channel_quant, global_num_experts,
                       expert_map, w1_scale, w2_scale, w1_zp, w2_zp, a1_scale,
-                       a2_scale, block_shape, w1_bias, w2_bias)
+                       a2_scale, block_shape, w1_bias, w2_bias, use_nn_moe)


 def inplace_fused_experts_fake(hidden_states: torch.Tensor,
@@ -1331,6 +1331,7 @@ def inplace_fused_experts_fake(hidden_states: torch.Tensor,
                               use_int8_w8a8: bool = False,
                               use_int8_w8a16: bool = False,
                               use_int4_w4a16: bool = False,
+                               use_int4_w4a8: bool =False,
                               use_mxfp4_w4a4: bool = False,
                               per_channel_quant: bool = False,
                               global_num_experts: int = -1,
@@ -1343,7 +1344,8 @@ def inplace_fused_experts_fake(hidden_states: torch.Tensor,
                               a2_scale: Optional[torch.Tensor] = None,
                               block_shape: Optional[list[int]] = None,
                               w1_bias: Optional[torch.Tensor] = None,
-                               w2_bias: Optional[torch.Tensor] = None) -> None:
+                               w2_bias: Optional[torch.Tensor] = None,
+                               use_nn_moe: Optional[bool] = False) -> None:
    pass


@@ -1540,6 +1542,7 @@ def outplace_fused_experts(
    use_int8_w8a8: bool = False,
    use_int8_w8a16: bool = False,
    use_int4_w4a16: bool = False,
+    use_int4_w4a8: bool =False,
    use_mxfp4_w4a4: bool = False,
    per_channel_quant: bool = False,
    global_num_experts: int = -1,
@@ -1553,13 +1556,14 @@ def outplace_fused_experts(
    block_shape: Optional[List[int]] = None,  #noqa: UP006
    w1_bias: Optional[torch.Tensor] = None,
    w2_bias: Optional[torch.Tensor] = None,
+    use_nn_moe: Optional[bool] = False,
 ) -> torch.Tensor:
    return fused_experts_impl(
        hidden_states, w1, w2, topk_weights, topk_ids, False, activation,
        is_act_and_mul, apply_router_weight_on_input, use_fp8_w8a8,
-        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_mxfp4_w4a4,
+        use_int8_w8a8, use_int8_w8a16, use_int4_w4a16, use_int4_w4a8, use_mxfp4_w4a4,
        per_channel_quant, global_num_experts, expert_map, w1_scale, w2_scale,
-        w1_zp, w2_zp, a1_scale, a2_scale, block_shape, w1_bias, w2_bias)
+        w1_zp, w2_zp, a1_scale, a2_scale, block_shape, w1_bias, w2_bias, use_nn_moe)


 def outplace_fused_experts_fake(
@@ -1634,6 +1638,7 @@ def fused_experts(hidden_states: torch.Tensor,
                  use_int8_w8a8: bool = False,
                  use_int8_w8a16: bool = False,
                  use_int4_w4a16: bool = False,
+                  use_int4_w4a8: bool =False,
                  use_mxfp4_w4a4: bool = False,
                  per_channel_quant: bool = False,
                  global_num_experts: int = -1,

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -547,6 +547,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                apply_router_weight_on_input=apply_router_weight_on_input,
                global_num_experts=global_num_experts,
                expert_map=expert_map,
+                use_nn_moe=use_nn_moe,
            )

    def forward_cpu(

--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -3,7 +3,7 @@

 import ast
 from dataclasses import replace
-from typing import Optional
+from typing import Optional, Any

 import numpy as np


--- a/vllm/zero_overhead/v1/gpu_model_runner.py
+++ b/vllm/zero_overhead/v1/gpu_model_runner.py
@@ -254,7 +254,7 @@ class V1ZeroModelRunner(GPUModelRunner):
                                                            True)
                last_draft_token_ids = self.last_draft_token_ids.flatten().to(torch.int)
                input_ids[input_ids_indices_tensor] = last_draft_token_ids[update_req_indices_tensor]
-    else:
+
        update_req_indices = []
        input_ids_indices = []
        token_idx = 0