Unverified commit a474da28, authored by Wentao Ye, committed by GitHub
Browse files

[Refactor] Remove unused dead code (#40640)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent ce6a199e
......@@ -39,7 +39,7 @@ def _matmul_launch_metadata(
@triton.jit
def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M):
group_id = tile_id // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
......@@ -85,9 +85,7 @@ def matmul_kernel_persistent(
num_pid_in_group = GROUP_SIZE_M * num_pid_n
for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
pid_m, pid_n = _compute_pid(
tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
)
pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M)
start_m = pid_m * BLOCK_SIZE_M
start_n = pid_n * BLOCK_SIZE_N
offs_am = start_m + tl.arange(0, BLOCK_SIZE_M)
......@@ -124,7 +122,7 @@ def matmul_kernel_persistent(
tile_id_c += NUM_SMS
pid_m, pid_n = _compute_pid(
tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M
)
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
import torch.distributed as dist
......@@ -167,7 +165,6 @@ def coordinate_batch_across_dp(
parallel_config: ParallelConfig,
num_tokens_padded: int | None = None,
uniform_decode: bool | None = None,
num_scheduled_tokens_per_request: np.ndarray | None = None,
cudagraph_mode: int = 0,
) -> tuple[bool, torch.Tensor | None, int]:
"""
......@@ -182,8 +179,6 @@ def coordinate_batch_across_dp(
TP, etc)
uniform_decode: Only used if allow_microbatching is True. True if the batch
only contains single token decodes
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
number of tokens per request.
cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL).
DP padding is enabled when synced cudagraph mode across ranks is not NONE.
......
......@@ -13,12 +13,6 @@ from vllm.v1.worker.gpu.cudagraph_utils import (
)
def make_num_tokens_across_dp(dp_size: int, num_tokens: int) -> torch.Tensor | None:
if dp_size == 1:
return None
return torch.full((dp_size,), num_tokens, dtype=torch.int32, device="cpu")
def sync_cudagraph_and_dp_padding(
cudagraph_manager: CudaGraphManager | None,
desired_batch_desc: BatchExecutionDescriptor,
......
......@@ -3362,7 +3362,6 @@ class GPUModelRunner(
logits: torch.Tensor | None,
hidden_states: torch.Tensor,
num_scheduled_tokens: int,
spec_decode_metadata: SpecDecodeMetadata | None,
) -> tuple[
dict[str, int],
LogprobsLists | None,
......@@ -3630,7 +3629,6 @@ class GPUModelRunner(
allow_microbatching=allow_microbatching,
num_tokens_padded=num_tokens_padded,
uniform_decode=uniform_decode,
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
cudagraph_mode=cudagraph_mode.value,
)
)
......@@ -4308,7 +4306,6 @@ class GPUModelRunner(
logits,
hidden_states,
scheduler_output.total_num_scheduled_tokens,
spec_decode_metadata,
)
if propose_drafts_after_bookkeeping:
......@@ -6540,7 +6537,6 @@ class GPUModelRunner(
def _reshape_kv_cache_tensors(
self,
kv_cache_config: KVCacheConfig,
kv_cache_raw_tensors: dict[str, torch.Tensor],
kernel_block_sizes: list[int],
) -> dict[str, torch.Tensor]:
......@@ -6548,7 +6544,6 @@ class GPUModelRunner(
Reshape the KV cache tensors to the desired shape and dtype.
Args:
kv_cache_config: The KV cache config
kv_cache_raw_tensors: The KV cache buffer of each layer, with
correct size but uninitialized shape.
kernel_block_sizes: The kernel block sizes for each KV cache group.
......@@ -6712,7 +6707,7 @@ class GPUModelRunner(
# Change the memory buffer to the desired shape
kv_caches = self._reshape_kv_cache_tensors(
kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes
kv_cache_raw_tensors, kernel_block_sizes
)
# Set up cross-layer KV cache sharing
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment