"docs/source/dev/sampling_params.rst" did not exist on "4bfa7e7f75eb5b1a397c93aeea1dea1afa867b2a"
Unverified commit a474da28, authored by Wentao Ye and committed by GitHub
Browse files

[Refactor] Remove unused dead code (#40640)


Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent ce6a199e
...@@ -39,7 +39,7 @@ def _matmul_launch_metadata( ...@@ -39,7 +39,7 @@ def _matmul_launch_metadata(
@triton.jit @triton.jit
def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS): def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M):
group_id = tile_id // num_pid_in_group group_id = tile_id // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
...@@ -85,9 +85,7 @@ def matmul_kernel_persistent( ...@@ -85,9 +85,7 @@ def matmul_kernel_persistent(
num_pid_in_group = GROUP_SIZE_M * num_pid_n num_pid_in_group = GROUP_SIZE_M * num_pid_n
for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True): for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=True):
pid_m, pid_n = _compute_pid( pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M)
tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS
)
start_m = pid_m * BLOCK_SIZE_M start_m = pid_m * BLOCK_SIZE_M
start_n = pid_n * BLOCK_SIZE_N start_n = pid_n * BLOCK_SIZE_N
offs_am = start_m + tl.arange(0, BLOCK_SIZE_M) offs_am = start_m + tl.arange(0, BLOCK_SIZE_M)
...@@ -124,7 +122,7 @@ def matmul_kernel_persistent( ...@@ -124,7 +122,7 @@ def matmul_kernel_persistent(
tile_id_c += NUM_SMS tile_id_c += NUM_SMS
pid_m, pid_n = _compute_pid( pid_m, pid_n = _compute_pid(
tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS tile_id_c, num_pid_in_group, num_pid_m, GROUP_SIZE_M
) )
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch import torch
import torch.distributed as dist import torch.distributed as dist
...@@ -167,7 +165,6 @@ def coordinate_batch_across_dp( ...@@ -167,7 +165,6 @@ def coordinate_batch_across_dp(
parallel_config: ParallelConfig, parallel_config: ParallelConfig,
num_tokens_padded: int | None = None, num_tokens_padded: int | None = None,
uniform_decode: bool | None = None, uniform_decode: bool | None = None,
num_scheduled_tokens_per_request: np.ndarray | None = None,
cudagraph_mode: int = 0, cudagraph_mode: int = 0,
) -> tuple[bool, torch.Tensor | None, int]: ) -> tuple[bool, torch.Tensor | None, int]:
""" """
...@@ -182,8 +179,6 @@ def coordinate_batch_across_dp( ...@@ -182,8 +179,6 @@ def coordinate_batch_across_dp(
TP, etc) TP, etc)
uniform_decode: Only used if allow_microbatching is True. True if the batch uniform_decode: Only used if allow_microbatching is True. True if the batch
only contains single token decodes only contains single token decodes
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
number of tokens per request.
cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL). cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL).
DP padding is enabled when synced cudagraph mode across ranks is not NONE. DP padding is enabled when synced cudagraph mode across ranks is not NONE.
......
...@@ -13,12 +13,6 @@ from vllm.v1.worker.gpu.cudagraph_utils import ( ...@@ -13,12 +13,6 @@ from vllm.v1.worker.gpu.cudagraph_utils import (
) )
def make_num_tokens_across_dp(dp_size: int, num_tokens: int) -> torch.Tensor | None:
if dp_size == 1:
return None
return torch.full((dp_size,), num_tokens, dtype=torch.int32, device="cpu")
def sync_cudagraph_and_dp_padding( def sync_cudagraph_and_dp_padding(
cudagraph_manager: CudaGraphManager | None, cudagraph_manager: CudaGraphManager | None,
desired_batch_desc: BatchExecutionDescriptor, desired_batch_desc: BatchExecutionDescriptor,
......
...@@ -3362,7 +3362,6 @@ class GPUModelRunner( ...@@ -3362,7 +3362,6 @@ class GPUModelRunner(
logits: torch.Tensor | None, logits: torch.Tensor | None,
hidden_states: torch.Tensor, hidden_states: torch.Tensor,
num_scheduled_tokens: int, num_scheduled_tokens: int,
spec_decode_metadata: SpecDecodeMetadata | None,
) -> tuple[ ) -> tuple[
dict[str, int], dict[str, int],
LogprobsLists | None, LogprobsLists | None,
...@@ -3630,7 +3629,6 @@ class GPUModelRunner( ...@@ -3630,7 +3629,6 @@ class GPUModelRunner(
allow_microbatching=allow_microbatching, allow_microbatching=allow_microbatching,
num_tokens_padded=num_tokens_padded, num_tokens_padded=num_tokens_padded,
uniform_decode=uniform_decode, uniform_decode=uniform_decode,
num_scheduled_tokens_per_request=num_scheduled_tokens_np,
cudagraph_mode=cudagraph_mode.value, cudagraph_mode=cudagraph_mode.value,
) )
) )
...@@ -4308,7 +4306,6 @@ class GPUModelRunner( ...@@ -4308,7 +4306,6 @@ class GPUModelRunner(
logits, logits,
hidden_states, hidden_states,
scheduler_output.total_num_scheduled_tokens, scheduler_output.total_num_scheduled_tokens,
spec_decode_metadata,
) )
if propose_drafts_after_bookkeeping: if propose_drafts_after_bookkeeping:
...@@ -6540,7 +6537,6 @@ class GPUModelRunner( ...@@ -6540,7 +6537,6 @@ class GPUModelRunner(
def _reshape_kv_cache_tensors( def _reshape_kv_cache_tensors(
self, self,
kv_cache_config: KVCacheConfig,
kv_cache_raw_tensors: dict[str, torch.Tensor], kv_cache_raw_tensors: dict[str, torch.Tensor],
kernel_block_sizes: list[int], kernel_block_sizes: list[int],
) -> dict[str, torch.Tensor]: ) -> dict[str, torch.Tensor]:
...@@ -6548,7 +6544,6 @@ class GPUModelRunner( ...@@ -6548,7 +6544,6 @@ class GPUModelRunner(
Reshape the KV cache tensors to the desired shape and dtype. Reshape the KV cache tensors to the desired shape and dtype.
Args: Args:
kv_cache_config: The KV cache config
kv_cache_raw_tensors: The KV cache buffer of each layer, with kv_cache_raw_tensors: The KV cache buffer of each layer, with
correct size but uninitialized shape. correct size but uninitialized shape.
kernel_block_sizes: The kernel block sizes for each KV cache group. kernel_block_sizes: The kernel block sizes for each KV cache group.
...@@ -6712,7 +6707,7 @@ class GPUModelRunner( ...@@ -6712,7 +6707,7 @@ class GPUModelRunner(
# Change the memory buffer to the desired shape # Change the memory buffer to the desired shape
kv_caches = self._reshape_kv_cache_tensors( kv_caches = self._reshape_kv_cache_tensors(
kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes kv_cache_raw_tensors, kernel_block_sizes
) )
# Set up cross-layer KV cache sharing # Set up cross-layer KV cache sharing
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment