Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
1555157e
Commit
1555157e
authored
Aug 18, 2025
by
zhuwenwen
Browse files
fix mtp accept rate issu
parent
22a95571
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
198 additions
and
2 deletions
+198
-2
vllm/zero_overhead/v1/gpu_model_runner.py
vllm/zero_overhead/v1/gpu_model_runner.py
+198
-2
No files found.
vllm/zero_overhead/v1/gpu_model_runner.py
View file @
1555157e
...
...
@@ -7,6 +7,7 @@ from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from
vllm.forward_context
import
set_forward_context
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
async_tensor_h2d
,
round_up
from
vllm.v1.attention.backends.utils
import
CommonAttentionMetadata
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.outputs
import
EMPTY_MODEL_RUNNER_OUTPUT
,
ModelRunnerOutput
from
vllm.v1.sample.metadata
import
SamplingMetadata
...
...
@@ -14,6 +15,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer
from
vllm.v1.spec_decode.medusa
import
MedusaProposer
from
vllm.v1.spec_decode.metadata
import
SpecDecodeMetadata
from
vllm.v1.spec_decode.ngram_proposer
import
NgramProposer
from
vllm.v1.worker.block_table
import
BlockTable
from
vllm.v1.worker.gpu_model_runner
import
GPUModelRunner
from
vllm.zero_overhead.v1.outputs
import
ZeroV1ModelRunnerOutput
from
vllm.profiler.prof
import
profile
...
...
@@ -32,6 +34,202 @@ class V1ZeroModelRunner(GPUModelRunner):
self
.
last_draft_token_ids
=
None
self
.
last_draft_host_tokens
=
None
self
.
last_draft_event
=
torch
.
cuda
.
Event
(
enable_timing
=
False
)
def
_prepare_inputs
(
self
,
scheduler_output
:
"SchedulerOutput"
,
)
->
tuple
[
dict
[
str
,
Any
],
bool
,
torch
.
Tensor
,
Optional
[
SpecDecodeMetadata
],
np
.
ndarray
]:
"""
:return: tuple[
attn_metadata: layer-to-attention_metadata mapping,
attention_cuda_graphs: whether attention can run in cudagraph
logits_indices, spec_decode_metadata
]
"""
total_num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
assert
total_num_scheduled_tokens
>
0
num_reqs
=
self
.
input_batch
.
num_reqs
assert
num_reqs
>
0
# OPTIMIZATION: Start copying the block table first.
# This way, we can overlap the copy with the following CPU operations.
self
.
input_batch
.
block_table
.
commit
(
num_reqs
)
# Get the number of scheduled tokens for each request.
req_ids
=
self
.
input_batch
.
req_ids
tokens
=
[
scheduler_output
.
num_scheduled_tokens
[
i
]
for
i
in
req_ids
]
num_scheduled_tokens
=
np
.
array
(
tokens
,
dtype
=
np
.
int32
)
max_num_scheduled_tokens
=
max
(
tokens
)
# Get request indices.
# E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
req_indices
=
np
.
repeat
(
self
.
arange_np
[:
num_reqs
],
num_scheduled_tokens
)
# cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
# arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
cu_num_tokens
,
arange
=
self
.
_get_cumsum_and_arange
(
num_scheduled_tokens
)
# Get positions.
positions_np
=
self
.
positions_np
[:
total_num_scheduled_tokens
]
np
.
add
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_indices
],
arange
,
out
=
positions_np
)
# Calculate M-RoPE positions.
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
if
self
.
uses_mrope
:
self
.
_calc_mrope_positions
(
scheduler_output
)
# Get token indices.
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
# where M is the max_model_len.
token_indices
=
(
positions_np
+
req_indices
*
self
.
input_batch
.
token_ids_cpu
.
shape
[
1
])
# NOTE(woosuk): We use torch.index_select instead of np.take here
# because torch.index_select is much faster than np.take for large
# tensors.
torch
.
index_select
(
self
.
input_batch
.
token_ids_cpu_tensor
.
flatten
(),
0
,
torch
.
from_numpy
(
token_indices
),
out
=
self
.
input_ids_cpu
[:
total_num_scheduled_tokens
])
# Calculate the slot mapping for each KV cache group.
for
kv_cache_group_id
,
kv_cache_group_spec
in
enumerate
(
self
.
kv_cache_config
.
kv_cache_groups
):
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_table
:
BlockTable
=
self
.
input_batch
.
block_table
[
kv_cache_group_id
]
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
# where K is the max_num_blocks_per_req and the block size is 2.
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.
block_table_indices
=
(
req_indices
*
block_table
.
max_num_blocks_per_req
+
positions_np
//
block_size
)
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_numbers
=
block_table_cpu
.
flatten
(
)[
block_table_indices
].
numpy
()
block_offsets
=
positions_np
%
block_size
np
.
add
(
block_numbers
*
block_size
,
block_offsets
,
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
])
# Prepare the attention metadata.
self
.
query_start_loc_np
[
0
]
=
0
self
.
query_start_loc_np
[
1
:
num_reqs
+
1
]
=
cu_num_tokens
self
.
seq_lens_np
[:
num_reqs
]
=
(
self
.
input_batch
.
num_computed_tokens_cpu
[:
num_reqs
]
+
num_scheduled_tokens
)
# Copy the tensors to the GPU.
self
.
input_ids
[:
total_num_scheduled_tokens
].
copy_
(
self
.
input_ids_cpu
[:
total_num_scheduled_tokens
],
non_blocking
=
True
)
self
.
zero_prepare_inputs
(
scheduler_output
,
self
.
input_ids
)
if
self
.
uses_mrope
:
# Only relevant for models using M-RoPE (e.g, Qwen2-VL)
self
.
mrope_positions
[:,
:
total_num_scheduled_tokens
].
copy_
(
self
.
mrope_positions_cpu
[:,
:
total_num_scheduled_tokens
],
non_blocking
=
True
)
else
:
# Common case (1D positions)
self
.
positions
[:
total_num_scheduled_tokens
].
copy_
(
self
.
positions_cpu
[:
total_num_scheduled_tokens
],
non_blocking
=
True
)
self
.
query_start_loc
[:
num_reqs
+
1
].
copy_
(
self
.
query_start_loc_cpu
[:
num_reqs
+
1
],
non_blocking
=
True
)
self
.
seq_lens
[:
num_reqs
].
copy_
(
self
.
seq_lens_cpu
[:
num_reqs
],
non_blocking
=
True
)
# Fill unused with -1. Needed for reshape_and_cache
self
.
seq_lens
[
num_reqs
:].
fill_
(
0
)
# Note: pad query_start_loc to be non-decreasing, as kernels
# like FlashAttention requires that
self
.
query_start_loc
[
num_reqs
+
1
:].
fill_
(
self
.
query_start_loc_cpu
[
num_reqs
].
item
())
query_start_loc
=
self
.
query_start_loc
[:
num_reqs
+
1
]
seq_lens
=
self
.
seq_lens
[:
num_reqs
]
common_attn_metadata
=
CommonAttentionMetadata
(
query_start_loc
=
query_start_loc
,
seq_lens
=
seq_lens
,
num_reqs
=
num_reqs
,
num_actual_tokens
=
total_num_scheduled_tokens
,
max_query_len
=
max_num_scheduled_tokens
,
)
attn_metadata
:
dict
[
str
,
Any
]
=
{}
# Prepare the attention metadata for each KV cache group and make layers
# in the same group share the same metadata.
for
kv_cache_group_id
,
kv_cache_group_spec
in
enumerate
(
self
.
kv_cache_config
.
kv_cache_groups
):
# Prepare for cascade attention if enabled & beneficial.
common_prefix_len
=
0
builder
=
self
.
attn_metadata_builders
[
kv_cache_group_id
]
if
self
.
cascade_attn_enabled
:
common_prefix_len
=
self
.
_compute_cascade_attn_prefix_len
(
num_scheduled_tokens
,
scheduler_output
.
num_common_prefix_blocks
[
kv_cache_group_id
],
kv_cache_group_spec
.
kv_cache_spec
,
builder
,
)
attn_metadata_i
=
(
builder
.
build
(
common_prefix_len
=
common_prefix_len
,
common_attn_metadata
=
common_attn_metadata
,
))
for
layer_name
in
kv_cache_group_spec
.
layer_names
:
attn_metadata
[
layer_name
]
=
attn_metadata_i
attention_cuda_graphs
=
all
(
b
.
can_run_in_cudagraph
(
common_attn_metadata
)
for
b
in
self
.
attn_metadata_builders
)
use_spec_decode
=
len
(
scheduler_output
.
scheduled_spec_decode_tokens
)
>
0
if
not
use_spec_decode
:
# NOTE(woosuk): Due to chunked prefills, the batch may contain
# partial requests. While we should not sample any token
# from these partial requests, we do so for simplicity.
# We will ignore the sampled tokens from the partial requests.
# TODO: Support prompt logprobs.
logits_indices
=
query_start_loc
[
1
:]
-
1
spec_decode_metadata
=
None
else
:
# Get the number of draft tokens for each request.
# Iterate over the dictionary rather than all requests since not all
# requests have draft tokens.
num_draft_tokens
=
np
.
zeros
(
num_reqs
,
dtype
=
np
.
int32
)
for
req_id
,
draft_token_ids
in
(
scheduler_output
.
scheduled_spec_decode_tokens
.
items
()):
req_idx
=
self
.
input_batch
.
req_id_to_index
[
req_id
]
num_draft_tokens
[
req_idx
]
=
len
(
draft_token_ids
)
spec_decode_metadata
=
self
.
_calc_spec_decode_metadata
(
num_draft_tokens
,
cu_num_tokens
)
logits_indices
=
spec_decode_metadata
.
logits_indices
# Hot-Swap lora model
if
self
.
lora_config
:
self
.
set_active_loras
(
self
.
input_batch
,
num_scheduled_tokens
)
return
(
attn_metadata
,
attention_cuda_graphs
,
logits_indices
,
spec_decode_metadata
,
num_scheduled_tokens
)
def
zero_prepare_inputs
(
self
,
scheduler_output
,
input_ids
):
req_ids
=
self
.
input_batch
.
req_ids
...
...
@@ -294,8 +492,6 @@ class V1ZeroModelRunner(GPUModelRunner):
# compiled with full CUDA graphs, we have to skip them entirely.
skip_cuda_graphs
=
self
.
full_cuda_graph
and
not
attention_cuda_graphs
self
.
zero_prepare_inputs
(
scheduler_output
,
input_ids
)
if
envs
.
VLLM_ENABLE_TBO
and
not
self
.
use_cuda_graph
:
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment