[Refactor] Remove unused dead code (#40640)

Signed-off-by: yewentao256 <zhyanwentao@126.com>

[Refactor] Remove unused dead code (#40640)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
a474da28 · Wentao Ye · GitHub · ce6a199e · a474da28 · a474da28
Unverified commit a474da28, authored Apr 24, 2026 by Wentao Ye; committed by GitHub on Apr 25, 2026.
4 changed files
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -39,7 +39,7 @@ def _matmul_launch_metadata(


 @triton.jit
-def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
+def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M):
    group_id = tile_id // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
@@ -85,9 +85,7 @@ def matmul_kernel_persistent(
    num_pid_in_group = GROUP_SIZE_M * num_pid_n

    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
-        pid_m, pid_n = _compute_pid(
-            tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
-        )
+        pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M)
        start_m = pid_m * BLOCK_SIZE_M
        start_n = pid_n * BLOCK_SIZE_N
        offs_am = start_m + tl.arange(0, BLOCK_SIZE_M)
@@ -124,7 +122,7 @@ def matmul_kernel_persistent(

        tile_id_c += NUM_SMS
        pid_m, pid_n = _compute_pid(
-            tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
+            tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M
        )
        offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
        offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-
-import numpy as np
 import torch
 import torch.distributed as dist

@@ -167,7 +165,6 @@ def coordinate_batch_across_dp(
    parallel_config: ParallelConfig,
    num_tokens_padded: int | None = None,
    uniform_decode: bool | None = None,
-    num_scheduled_tokens_per_request: np.ndarray | None = None,
    cudagraph_mode: int = 0,
 ) -> tuple[bool, torch.Tensor | None, int]:
    """
@@ -182,8 +179,6 @@ def coordinate_batch_across_dp(
            TP, etc)
        uniform_decode: Only used if allow_microbatching is True. True if the batch
            only contains single token decodes
-        num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
-            number of tokens per request.
        cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL).
            DP padding is enabled when synced cudagraph mode across ranks is not NONE.


--- a/vllm/v1/worker/gpu/dp_utils.py
+++ b/vllm/v1/worker/gpu/dp_utils.py
@@ -13,12 +13,6 @@ from vllm.v1.worker.gpu.cudagraph_utils import (
 )


-def make_num_tokens_across_dp(dp_size: int, num_tokens: int) -> torch.Tensor | None:
-    if dp_size == 1:
-        return None
-    return torch.full((dp_size,), num_tokens, dtype=torch.int32, device="cpu")
-
-
 def sync_cudagraph_and_dp_padding(
    cudagraph_manager: CudaGraphManager | None,
    desired_batch_desc: BatchExecutionDescriptor,

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3362,7 +3362,6 @@ class GPUModelRunner(
        logits: torch.Tensor | None,
        hidden_states: torch.Tensor,
        num_scheduled_tokens: int,
-        spec_decode_metadata: SpecDecodeMetadata | None,
    ) -> tuple[
        dict[str, int],
        LogprobsLists | None,
@@ -3630,7 +3629,6 @@ class GPUModelRunner(
                    allow_microbatching=allow_microbatching,
                    num_tokens_padded=num_tokens_padded,
                    uniform_decode=uniform_decode,
-                    num_scheduled_tokens_per_request=num_scheduled_tokens_np,
                    cudagraph_mode=cudagraph_mode.value,
                )
            )
@@ -4308,7 +4306,6 @@ class GPUModelRunner(
                logits,
                hidden_states,
                scheduler_output.total_num_scheduled_tokens,
-                spec_decode_metadata,
            )

        if propose_drafts_after_bookkeeping:
@@ -6540,7 +6537,6 @@ class GPUModelRunner(

    def _reshape_kv_cache_tensors(
        self,
-        kv_cache_config: KVCacheConfig,
        kv_cache_raw_tensors: dict[str, torch.Tensor],
        kernel_block_sizes: list[int],
    ) -> dict[str, torch.Tensor]:
@@ -6548,7 +6544,6 @@ class GPUModelRunner(
        Reshape the KV cache tensors to the desired shape and dtype.

        Args:
-            kv_cache_config: The KV cache config
            kv_cache_raw_tensors: The KV cache buffer of each layer, with
                correct size but uninitialized shape.
            kernel_block_sizes: The kernel block sizes for each KV cache group.
@@ -6712,7 +6707,7 @@ class GPUModelRunner(

            # Change the memory buffer to the desired shape
            kv_caches = self._reshape_kv_cache_tensors(
-                kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes
+                kv_cache_raw_tensors, kernel_block_sizes
            )

        # Set up cross-layer KV cache sharing