Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a474da28
Unverified
Commit
a474da28
authored
Apr 24, 2026
by
Wentao Ye
Committed by
GitHub
Apr 25, 2026
Browse files
[Refactor] Remove unused dead code (#40640)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
ce6a199e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
4 additions
and
22 deletions
+4
-22
vllm/model_executor/layers/batch_invariant.py
vllm/model_executor/layers/batch_invariant.py
+3
-5
vllm/v1/worker/dp_utils.py
vllm/v1/worker/dp_utils.py
+0
-5
vllm/v1/worker/gpu/dp_utils.py
vllm/v1/worker/gpu/dp_utils.py
+0
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-6
No files found.
vllm/model_executor/layers/batch_invariant.py
View file @
a474da28
...
...
@@ -39,7 +39,7 @@ def _matmul_launch_metadata(
@
triton
.
jit
def
_compute_pid
(
tile_id
,
num_pid_in_group
,
num_pid_m
,
GROUP_SIZE_M
,
NUM_SMS
):
def
_compute_pid
(
tile_id
,
num_pid_in_group
,
num_pid_m
,
GROUP_SIZE_M
):
group_id
=
tile_id
//
num_pid_in_group
first_pid_m
=
group_id
*
GROUP_SIZE_M
group_size_m
=
min
(
num_pid_m
-
first_pid_m
,
GROUP_SIZE_M
)
...
...
@@ -85,9 +85,7 @@ def matmul_kernel_persistent(
num_pid_in_group
=
GROUP_SIZE_M
*
num_pid_n
for
tile_id
in
tl
.
range
(
start_pid
,
num_tiles
,
NUM_SMS
,
flatten
=
True
):
pid_m
,
pid_n
=
_compute_pid
(
tile_id
,
num_pid_in_group
,
num_pid_m
,
GROUP_SIZE_M
,
NUM_SMS
)
pid_m
,
pid_n
=
_compute_pid
(
tile_id
,
num_pid_in_group
,
num_pid_m
,
GROUP_SIZE_M
)
start_m
=
pid_m
*
BLOCK_SIZE_M
start_n
=
pid_n
*
BLOCK_SIZE_N
offs_am
=
start_m
+
tl
.
arange
(
0
,
BLOCK_SIZE_M
)
...
...
@@ -124,7 +122,7 @@ def matmul_kernel_persistent(
tile_id_c
+=
NUM_SMS
pid_m
,
pid_n
=
_compute_pid
(
tile_id_c
,
num_pid_in_group
,
num_pid_m
,
GROUP_SIZE_M
,
NUM_SMS
tile_id_c
,
num_pid_in_group
,
num_pid_m
,
GROUP_SIZE_M
)
offs_cm
=
pid_m
*
BLOCK_SIZE_M
+
tl
.
arange
(
0
,
BLOCK_SIZE_M
)
offs_cn
=
pid_n
*
BLOCK_SIZE_N
+
tl
.
arange
(
0
,
BLOCK_SIZE_N
)
...
...
vllm/v1/worker/dp_utils.py
View file @
a474da28
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
numpy
as
np
import
torch
import
torch.distributed
as
dist
...
...
@@ -167,7 +165,6 @@ def coordinate_batch_across_dp(
parallel_config
:
ParallelConfig
,
num_tokens_padded
:
int
|
None
=
None
,
uniform_decode
:
bool
|
None
=
None
,
num_scheduled_tokens_per_request
:
np
.
ndarray
|
None
=
None
,
cudagraph_mode
:
int
=
0
,
)
->
tuple
[
bool
,
torch
.
Tensor
|
None
,
int
]:
"""
...
...
@@ -182,8 +179,6 @@ def coordinate_batch_across_dp(
TP, etc)
uniform_decode: Only used if allow_microbatching is True. True if the batch
only contains single token decodes
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
number of tokens per request.
cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL).
DP padding is enabled when synced cudagraph mode across ranks is not NONE.
...
...
vllm/v1/worker/gpu/dp_utils.py
View file @
a474da28
...
...
@@ -13,12 +13,6 @@ from vllm.v1.worker.gpu.cudagraph_utils import (
)
def
make_num_tokens_across_dp
(
dp_size
:
int
,
num_tokens
:
int
)
->
torch
.
Tensor
|
None
:
if
dp_size
==
1
:
return
None
return
torch
.
full
((
dp_size
,),
num_tokens
,
dtype
=
torch
.
int32
,
device
=
"cpu"
)
def
sync_cudagraph_and_dp_padding
(
cudagraph_manager
:
CudaGraphManager
|
None
,
desired_batch_desc
:
BatchExecutionDescriptor
,
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
a474da28
...
...
@@ -3362,7 +3362,6 @@ class GPUModelRunner(
logits
:
torch
.
Tensor
|
None
,
hidden_states
:
torch
.
Tensor
,
num_scheduled_tokens
:
int
,
spec_decode_metadata
:
SpecDecodeMetadata
|
None
,
)
->
tuple
[
dict
[
str
,
int
],
LogprobsLists
|
None
,
...
...
@@ -3630,7 +3629,6 @@ class GPUModelRunner(
allow_microbatching
=
allow_microbatching
,
num_tokens_padded
=
num_tokens_padded
,
uniform_decode
=
uniform_decode
,
num_scheduled_tokens_per_request
=
num_scheduled_tokens_np
,
cudagraph_mode
=
cudagraph_mode
.
value
,
)
)
...
...
@@ -4308,7 +4306,6 @@ class GPUModelRunner(
logits
,
hidden_states
,
scheduler_output
.
total_num_scheduled_tokens
,
spec_decode_metadata
,
)
if
propose_drafts_after_bookkeeping
:
...
...
@@ -6540,7 +6537,6 @@ class GPUModelRunner(
def
_reshape_kv_cache_tensors
(
self
,
kv_cache_config
:
KVCacheConfig
,
kv_cache_raw_tensors
:
dict
[
str
,
torch
.
Tensor
],
kernel_block_sizes
:
list
[
int
],
)
->
dict
[
str
,
torch
.
Tensor
]:
...
...
@@ -6548,7 +6544,6 @@ class GPUModelRunner(
Reshape the KV cache tensors to the desired shape and dtype.
Args:
kv_cache_config: The KV cache config
kv_cache_raw_tensors: The KV cache buffer of each layer, with
correct size but uninitialized shape.
kernel_block_sizes: The kernel block sizes for each KV cache group.
...
...
@@ -6712,7 +6707,7 @@ class GPUModelRunner(
# Change the memory buffer to the desired shape
kv_caches
=
self
.
_reshape_kv_cache_tensors
(
kv_cache_config
,
kv_cache_raw_tensors
,
kernel_block_sizes
kv_cache_raw_tensors
,
kernel_block_sizes
)
# Set up cross-layer KV cache sharing
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment