Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9d4330d2
Commit
9d4330d2
authored
Jan 27, 2026
by
laibao
Browse files
refactor: 抽离 runner 侧 KV compression 逻辑并统一 slot mapping
parent
3adc766e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
681 additions
and
537 deletions
+681
-537
vllm/v1/kv_compression/runner_buffers.py
vllm/v1/kv_compression/runner_buffers.py
+109
-0
vllm/v1/kv_compression/runner_prepare.py
vllm/v1/kv_compression/runner_prepare.py
+139
-0
vllm/v1/kv_compression/runner_prompt_compaction.py
vllm/v1/kv_compression/runner_prompt_compaction.py
+201
-0
vllm/v1/kv_compression/runner_step.py
vllm/v1/kv_compression/runner_step.py
+100
-0
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+102
-537
vllm/v1/worker/slot_mapping_utils.py
vllm/v1/worker/slot_mapping_utils.py
+30
-0
No files found.
vllm/v1/kv_compression/runner_buffers.py
0 → 100644
View file @
9d4330d2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
from
typing
import
Any
,
Optional
import
torch
def
init_kv_compression_runner_buffers
(
*
,
runner
:
Any
,
max_num_tokens
:
int
,
max_num_reqs
:
int
,
device
:
torch
.
device
,
pin_memory
:
bool
,
)
->
None
:
"""Initialize per-runner buffers used by KV compression.
This helper keeps `gpu_model_runner.py` focused on orchestration while
preserving the existing attribute-based access patterns.
"""
# KV positions are decoupled from logical positions when KV compression is
# enabled. Keep a separate buffer to avoid recomputing or overwriting the
# logical `positions_np` (used for RoPE / token lookup).
runner
.
kv_positions_cpu
=
torch
.
zeros
(
max_num_tokens
,
dtype
=
torch
.
int64
,
device
=
"cpu"
,
pin_memory
=
pin_memory
,
)
runner
.
kv_positions_np
=
runner
.
kv_positions_cpu
.
numpy
()
# KV compression metadata buffers (used by the "topk" policy).
# Per-token: whether this scheduled token must be kept in KV cache.
runner
.
kv_compression_must_keep_cpu
=
torch
.
zeros
(
max_num_tokens
,
dtype
=
torch
.
bool
,
device
=
"cpu"
,
pin_memory
=
pin_memory
,
)
runner
.
kv_compression_must_keep_np
=
runner
.
kv_compression_must_keep_cpu
.
numpy
()
runner
.
kv_compression_must_keep
=
torch
.
zeros
(
max_num_tokens
,
dtype
=
torch
.
bool
,
device
=
device
,
)
# Per-request: how many additional prompt tokens to keep among
# non-protected candidates (budget from env; selection uses scores).
runner
.
kv_compression_topk_budget_cpu
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
pin_memory
=
pin_memory
,
)
runner
.
kv_compression_topk_budget_np
=
runner
.
kv_compression_topk_budget_cpu
.
numpy
()
runner
.
kv_compression_topk_budget
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
device
,
)
# Chunked-prefill prompt-end KV compression metadata (scheme 3).
# Per-request: whether this step finishes the prompt and should compute
# global prompt indices (score/topk) for a one-shot compaction.
runner
.
kv_compression_prompt_end_cpu
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
bool
,
device
=
"cpu"
,
pin_memory
=
pin_memory
,
)
runner
.
kv_compression_prompt_end_np
=
runner
.
kv_compression_prompt_end_cpu
.
numpy
()
runner
.
kv_compression_prompt_end
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
bool
,
device
=
device
,
)
# Per-request: prompt length (tokens) and Top-K keep count among prompt
# candidates (excluding protected prefix/suffix).
runner
.
kv_compression_prompt_lens_cpu
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
pin_memory
=
pin_memory
,
)
runner
.
kv_compression_prompt_lens_np
=
runner
.
kv_compression_prompt_lens_cpu
.
numpy
()
runner
.
kv_compression_prompt_lens
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
device
,
)
runner
.
kv_compression_prompt_topk_keep_cpu
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
pin_memory
=
pin_memory
,
)
runner
.
kv_compression_prompt_topk_keep_np
=
runner
.
kv_compression_prompt_topk_keep_cpu
.
numpy
()
runner
.
kv_compression_prompt_topk_keep
=
torch
.
zeros
(
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
device
,
)
runner
.
kv_compression_prompt_topk_keep_max
=
None
# type: Optional[int]
vllm/v1/kv_compression/runner_prepare.py
0 → 100644
View file @
9d4330d2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
from
typing
import
Optional
import
numpy
as
np
import
vllm.envs
as
envs
from
vllm.v1.kv_compression.budget
import
(
compute_prompt_topk_keep_total
,
compute_topk_budget_step
)
def
prepare_kv_compression_for_step
(
*
,
num_reqs
:
int
,
total_num_scheduled_tokens
:
int
,
num_scheduled_tokens
:
np
.
ndarray
,
# [B] int32
cu_num_tokens
:
np
.
ndarray
,
# [B] int64/int32 cumulative scheduled tokens
req_indices
:
np
.
ndarray
,
# [T] int64, request index per token
arange
:
np
.
ndarray
,
# [T] int64, position-within-request per token
num_computed_tokens_cpu
:
np
.
ndarray
,
# [max_reqs] int32/int64
num_prompt_tokens
:
np
.
ndarray
,
# [max_reqs] int32/int64
num_kv_tokens_cpu
:
np
.
ndarray
,
# [max_reqs] int32/int64
kv_positions_np
:
np
.
ndarray
,
# [T] int64 (out)
must_keep_np
:
np
.
ndarray
,
# [T] bool (out; scheme 1/2 only)
topk_budget_np
:
np
.
ndarray
,
# [B] int32 (out; scheme 1/2 only)
prompt_end_np
:
np
.
ndarray
,
# [B] bool (out; scheme 3 only)
prompt_lens_np
:
np
.
ndarray
,
# [B] int32 (out; scheme 3 only)
prompt_topk_keep_np
:
np
.
ndarray
,
# [B] int32 (out; scheme 3 only)
chunked_prefill_enabled
:
bool
,
)
->
tuple
[
bool
,
Optional
[
int
]]:
"""Prepare KV compression metadata for a single model step (CPU-side).
Fills:
- `kv_positions_np`: per-token KV write positions (decoupled from logical
RoPE positions).
- Scheme 3 (chunked prefill): `prompt_end/prompt_lens/prompt_topk_keep`.
- Scheme 1/2 (non-chunked): `must_keep/topk_budget`.
Returns:
(needs_compaction, prompt_topk_keep_max)
"""
if
total_num_scheduled_tokens
<=
0
or
num_reqs
<=
0
:
return
False
,
None
# KV positions (where scheduled tokens are written before optional
# compaction).
np
.
add
(
num_kv_tokens_cpu
[
req_indices
],
arange
,
out
=
kv_positions_np
)
prompt_ratio
=
envs
.
VLLM_KV_COMPRESSION_PROMPT_RATIO
prompt_budget
=
envs
.
VLLM_KV_COMPRESSION_PROMPT_BUDGET
protected_prefix
=
envs
.
VLLM_KV_COMPRESSION_PROTECTED_PREFIX
protected_suffix
=
envs
.
VLLM_KV_COMPRESSION_PROTECTED_SUFFIX
keep_last
=
envs
.
VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN
if
chunked_prefill_enabled
:
# Scheme 3: with chunked prefill, defer compaction until after the full
# prompt is ingested. Otherwise, the next prefill chunk would attend to
# a truncated history and quality can collapse.
prompt_end_np
.
fill
(
False
)
prompt_lens_np
.
fill
(
0
)
prompt_topk_keep_np
.
fill
(
0
)
for
req_idx
in
range
(
num_reqs
):
qlen
=
int
(
num_scheduled_tokens
[
req_idx
])
if
qlen
<=
0
:
continue
base_pos
=
int
(
num_computed_tokens_cpu
[
req_idx
])
prompt_len
=
int
(
num_prompt_tokens
[
req_idx
])
end_pos
=
base_pos
+
qlen
ends_prompt
=
(
base_pos
<
prompt_len
)
and
(
end_pos
>=
prompt_len
)
if
not
ends_prompt
:
continue
prompt_end_np
[
req_idx
]
=
True
prompt_lens_np
[
req_idx
]
=
prompt_len
prompt_topk_keep_np
[
req_idx
]
=
compute_prompt_topk_keep_total
(
prompt_len
=
prompt_len
,
protected_prefix
=
protected_prefix
,
protected_suffix
=
protected_suffix
,
keep_last_token
=
keep_last
,
prompt_ratio
=
prompt_ratio
,
prompt_budget
=
prompt_budget
,
)
prompt_topk_keep_max
=
int
(
prompt_topk_keep_np
[:
num_reqs
].
max
())
return
False
,
prompt_topk_keep_max
# Scheme 1/2: per-step compaction within the scheduled segment.
must_keep_np
.
fill
(
False
)
topk_budget_np
.
fill
(
0
)
for
req_idx
in
range
(
num_reqs
):
qlen
=
int
(
num_scheduled_tokens
[
req_idx
])
if
qlen
<=
0
:
continue
start
=
0
if
req_idx
==
0
else
int
(
cu_num_tokens
[
req_idx
-
1
])
end
=
int
(
cu_num_tokens
[
req_idx
])
assert
end
-
start
==
qlen
base_pos
=
int
(
num_computed_tokens_cpu
[
req_idx
])
prompt_len
=
int
(
num_prompt_tokens
[
req_idx
])
end_pos
=
base_pos
+
qlen
pos_in_req
=
arange
[
start
:
end
].
astype
(
np
.
int64
,
copy
=
False
)
pos
=
base_pos
+
pos_in_req
prompt_mask
=
pos
<
prompt_len
# Decode tokens are always kept.
must_keep
=
~
prompt_mask
if
np
.
any
(
prompt_mask
):
suffix_start
=
max
(
prompt_len
-
protected_suffix
,
0
)
must_keep
|=
prompt_mask
&
(
pos
<
protected_prefix
)
must_keep
|=
prompt_mask
&
(
pos
>=
suffix_start
)
if
keep_last
:
last
=
prompt_len
-
1
if
base_pos
<=
last
<
end_pos
:
must_keep
[
last
-
base_pos
]
=
True
topk_budget_np
[
req_idx
]
=
compute_topk_budget_step
(
prompt_len
=
prompt_len
,
start_pos
=
base_pos
,
end_pos
=
end_pos
,
protected_prefix
=
protected_prefix
,
protected_suffix
=
protected_suffix
,
keep_last_token
=
keep_last
,
prompt_ratio
=
prompt_ratio
,
prompt_budget
=
prompt_budget
,
)
must_keep_np
[
start
:
end
]
=
must_keep
# Decode-only fast path: if all scheduled tokens are unconditionally kept
# and there is no Top-K budget, KV compaction is a no-op and can be skipped.
needs_compaction
=
(
not
must_keep_np
.
all
())
or
(
topk_budget_np
>
0
).
any
()
return
bool
(
needs_compaction
),
None
vllm/v1/kv_compression/runner_prompt_compaction.py
0 → 100644
View file @
9d4330d2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
from
typing
import
Any
import
torch
import
vllm.envs
as
envs
from
vllm.forward_context
import
get_forward_context
from
vllm.platforms
import
current_platform
from
vllm.v1.kv_compression.forward_context
import
get_kv_compression_prompt_payload
def
stash_kv_compression_prompt_payload_to_requests
(
*
,
runner
:
Any
)
->
None
:
"""Persist prompt-end compaction indices from the forward context.
This is the runner-side half of chunked-prefill scheme 3:
flash_attn -> forward_context payload -> request state stash ->
(next step) one-shot KV compaction.
"""
if
not
envs
.
VLLM_ENABLE_KV_COMPRESSION
:
return
scheduler_config
=
getattr
(
runner
,
"scheduler_config"
,
None
)
if
scheduler_config
is
None
or
not
getattr
(
scheduler_config
,
"chunked_prefill_enabled"
,
False
):
return
forward_context
=
get_forward_context
()
payload
=
get_kv_compression_prompt_payload
(
forward_context
)
if
payload
is
None
:
return
req_indices
=
payload
.
get
(
"req_indices"
)
idx_sorted
=
payload
.
get
(
"idx_sorted"
)
keep_len
=
payload
.
get
(
"keep_len"
)
prompt_lens
=
payload
.
get
(
"prompt_lens"
)
if
(
req_indices
is
None
or
idx_sorted
is
None
or
keep_len
is
None
or
prompt_lens
is
None
):
return
input_batch
=
getattr
(
runner
,
"input_batch"
,
None
)
if
input_batch
is
None
:
return
req_ids
=
getattr
(
input_batch
,
"req_ids"
,
None
)
if
req_ids
is
None
:
return
requests
=
getattr
(
runner
,
"requests"
,
None
)
if
requests
is
None
:
return
req_indices_cpu
=
req_indices
.
to
(
device
=
"cpu"
,
dtype
=
torch
.
int64
).
tolist
()
keep_cpu
=
keep_len
.
to
(
device
=
"cpu"
,
dtype
=
torch
.
int64
).
tolist
()
prompt_cpu
=
prompt_lens
.
to
(
device
=
"cpu"
,
dtype
=
torch
.
int64
).
tolist
()
for
i
,
b
in
enumerate
(
req_indices_cpu
):
if
b
<
0
or
b
>=
len
(
req_ids
):
continue
req_id
=
req_ids
[
b
]
if
req_id
is
None
:
continue
rs
=
requests
.
get
(
req_id
)
if
rs
is
None
:
continue
rs
.
kv_compression_prompt_idx_sorted
=
idx_sorted
[
i
]
rs
.
kv_compression_prompt_keep_len
=
int
(
keep_cpu
[
i
])
rs
.
kv_compression_prompt_prompt_len
=
int
(
prompt_cpu
[
i
])
def
maybe_apply_kv_compression_prompt_compaction
(
*
,
runner
:
Any
)
->
None
:
"""Apply one-shot prompt KV compaction before the first decode step."""
if
not
envs
.
VLLM_ENABLE_KV_COMPRESSION
:
return
scheduler_config
=
getattr
(
runner
,
"scheduler_config"
,
None
)
if
scheduler_config
is
None
or
not
getattr
(
scheduler_config
,
"chunked_prefill_enabled"
,
False
):
return
input_batch
=
getattr
(
runner
,
"input_batch"
,
None
)
if
input_batch
is
None
:
return
requests
=
getattr
(
runner
,
"requests"
,
None
)
if
requests
is
None
:
return
pending_req_ids
:
list
[
str
]
=
[]
for
req_id
in
input_batch
.
req_ids
:
if
req_id
is
None
:
continue
rs
=
requests
.
get
(
req_id
)
if
rs
is
None
:
continue
if
rs
.
kv_compression_prompt_idx_sorted
is
None
:
continue
# Only apply once the prompt is fully ingested (decode stage).
if
rs
.
num_computed_tokens
<
rs
.
num_prompt_tokens
:
continue
pending_req_ids
.
append
(
req_id
)
if
not
pending_req_ids
:
return
device
=
runner
.
device
pending_states
:
list
[
tuple
[
str
,
torch
.
Tensor
,
int
]]
=
[]
for
req_id
in
pending_req_ids
:
rs
=
requests
[
req_id
]
keep
=
rs
.
kv_compression_prompt_keep_len
idx
=
rs
.
kv_compression_prompt_idx_sorted
if
keep
is
None
or
idx
is
None
:
continue
keep_i
=
int
(
keep
)
if
keep_i
<=
0
:
# No prompt tokens kept; clear and skip.
rs
.
kv_compression_prompt_idx_sorted
=
None
rs
.
kv_compression_prompt_keep_len
=
None
rs
.
kv_compression_prompt_prompt_len
=
None
continue
pending_states
.
append
((
req_id
,
idx
,
keep_i
))
if
not
pending_states
:
return
B
=
len
(
pending_states
)
keep_list
=
[
k
for
_
,
_
,
k
in
pending_states
]
K_max
=
max
(
keep_list
)
idx_batch
=
torch
.
zeros
((
B
,
K_max
),
device
=
device
,
dtype
=
torch
.
int32
)
for
i
,
(
_
,
row
,
k
)
in
enumerate
(
pending_states
):
idx_batch
[
i
,
:
k
]
=
row
[:
k
].
to
(
device
=
device
,
dtype
=
torch
.
int32
)
keep_tensor
=
torch
.
tensor
(
keep_list
,
device
=
device
,
dtype
=
torch
.
int32
)
from
vllm.v1.kv_compression.kv_cache_triton
import
(
front_compact_inplace_fa_triton
,
make_fa_cache_view
)
kv_cache_config
=
getattr
(
runner
,
"kv_cache_config"
,
None
)
if
kv_cache_config
is
None
:
return
# Apply compaction to every attention layer's KV cache in-place.
for
group_id
,
kv_cache_group_spec
in
enumerate
(
kv_cache_config
.
kv_cache_groups
):
max_blocks
=
0
for
req_id
,
_
,
_
in
pending_states
:
rs
=
requests
[
req_id
]
if
group_id
>=
len
(
rs
.
block_ids
):
continue
max_blocks
=
max
(
max_blocks
,
len
(
rs
.
block_ids
[
group_id
]))
if
max_blocks
==
0
:
continue
block_table_cpu
=
torch
.
zeros
((
B
,
max_blocks
),
dtype
=
torch
.
int32
,
device
=
"cpu"
)
for
i
,
(
req_id
,
_
,
_
)
in
enumerate
(
pending_states
):
rs
=
requests
[
req_id
]
if
group_id
>=
len
(
rs
.
block_ids
):
continue
ids
=
rs
.
block_ids
[
group_id
]
if
ids
:
block_table_cpu
[
i
,
:
len
(
ids
)]
=
torch
.
tensor
(
ids
,
dtype
=
torch
.
int32
,
device
=
"cpu"
)
block_table
=
block_table_cpu
.
to
(
device
=
device
,
non_blocking
=
True
)
kv_caches
=
getattr
(
runner
,
"kv_caches"
,
None
)
if
kv_caches
is
None
:
continue
for
layer_name
in
kv_cache_group_spec
.
layer_names
:
layer_index
=
runner
.
_extract_layer_index
(
layer_name
)
if
layer_index
>=
len
(
kv_caches
):
continue
kv_cache
=
kv_caches
[
layer_index
]
if
not
current_platform
.
is_rocm
():
if
not
isinstance
(
kv_cache
,
torch
.
Tensor
):
continue
key_cache
,
value_cache
=
kv_cache
.
unbind
(
0
)
else
:
if
(
not
isinstance
(
kv_cache
,
(
tuple
,
list
))
or
len
(
kv_cache
)
!=
2
):
continue
key_cache
,
value_cache
=
kv_cache
k_view
,
v_view
=
make_fa_cache_view
(
key_cache
=
key_cache
,
value_cache
=
value_cache
)
front_compact_inplace_fa_triton
(
k_view
,
v_view
,
block_table
,
idx_batch
,
keep_tensor
,
)
# Clear pending state after successful compaction.
for
req_id
,
_
,
_
in
pending_states
:
rs
=
requests
.
get
(
req_id
)
if
rs
is
None
:
continue
rs
.
kv_compression_prompt_idx_sorted
=
None
rs
.
kv_compression_prompt_keep_len
=
None
rs
.
kv_compression_prompt_prompt_len
=
None
vllm/v1/kv_compression/runner_step.py
0 → 100644
View file @
9d4330d2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
from
typing
import
Any
,
Optional
import
numpy
as
np
import
vllm.envs
as
envs
from
vllm.v1.kv_compression.runner_prepare
import
prepare_kv_compression_for_step
def
maybe_prepare_kv_compression_for_runner_step
(
*
,
runner
:
Any
,
num_reqs
:
int
,
total_num_scheduled_tokens
:
int
,
num_scheduled_tokens
:
np
.
ndarray
,
# [B] int32
cu_num_tokens
:
np
.
ndarray
,
# [B] int64/int32
req_indices
:
np
.
ndarray
,
# [T] int64
arange
:
np
.
ndarray
,
# [T] int64
)
->
Optional
[
np
.
ndarray
]:
"""Prepare per-step KV compression metadata on CPU.
Returns the per-token KV positions (`kv_positions_np`) or None if KV
compression is disabled.
"""
if
not
envs
.
VLLM_ENABLE_KV_COMPRESSION
:
runner
.
kv_compression_needs_compaction
=
False
return
None
kv_positions_np
=
runner
.
kv_positions_np
[:
total_num_scheduled_tokens
]
must_keep_np
=
runner
.
kv_compression_must_keep_np
[:
total_num_scheduled_tokens
]
topk_budget_np
=
runner
.
kv_compression_topk_budget_np
[:
num_reqs
]
prompt_end_np
=
runner
.
kv_compression_prompt_end_np
[:
num_reqs
]
prompt_lens_np
=
runner
.
kv_compression_prompt_lens_np
[:
num_reqs
]
topk_keep_np
=
runner
.
kv_compression_prompt_topk_keep_np
[:
num_reqs
]
needs_compaction
,
prompt_topk_keep_max
=
prepare_kv_compression_for_step
(
num_reqs
=
num_reqs
,
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
num_scheduled_tokens
=
num_scheduled_tokens
,
cu_num_tokens
=
cu_num_tokens
,
req_indices
=
req_indices
,
arange
=
arange
,
num_computed_tokens_cpu
=
runner
.
input_batch
.
num_computed_tokens_cpu
,
num_prompt_tokens
=
runner
.
input_batch
.
num_prompt_tokens
,
num_kv_tokens_cpu
=
runner
.
input_batch
.
num_kv_tokens_cpu
,
kv_positions_np
=
kv_positions_np
,
must_keep_np
=
must_keep_np
,
topk_budget_np
=
topk_budget_np
,
prompt_end_np
=
prompt_end_np
,
prompt_lens_np
=
prompt_lens_np
,
prompt_topk_keep_np
=
topk_keep_np
,
chunked_prefill_enabled
=
runner
.
scheduler_config
.
chunked_prefill_enabled
,
)
runner
.
kv_compression_needs_compaction
=
bool
(
needs_compaction
)
if
prompt_topk_keep_max
is
not
None
:
runner
.
kv_compression_prompt_topk_keep_max
=
int
(
prompt_topk_keep_max
)
return
kv_positions_np
def
maybe_copy_kv_compression_step_tensors_to_gpu
(
*
,
runner
:
Any
,
num_reqs
:
int
,
total_num_scheduled_tokens
:
int
,
non_blocking
:
bool
=
True
,
)
->
None
:
"""Stage per-step KV compression tensors to GPU if needed."""
if
not
envs
.
VLLM_ENABLE_KV_COMPRESSION
:
return
if
runner
.
scheduler_config
.
chunked_prefill_enabled
:
runner
.
kv_compression_prompt_end
[:
num_reqs
].
copy_
(
runner
.
kv_compression_prompt_end_cpu
[:
num_reqs
],
non_blocking
=
non_blocking
,
)
runner
.
kv_compression_prompt_lens
[:
num_reqs
].
copy_
(
runner
.
kv_compression_prompt_lens_cpu
[:
num_reqs
],
non_blocking
=
non_blocking
,
)
runner
.
kv_compression_prompt_topk_keep
[:
num_reqs
].
copy_
(
runner
.
kv_compression_prompt_topk_keep_cpu
[:
num_reqs
],
non_blocking
=
non_blocking
,
)
return
if
runner
.
kv_compression_needs_compaction
:
runner
.
kv_compression_must_keep
[:
total_num_scheduled_tokens
].
copy_
(
runner
.
kv_compression_must_keep_cpu
[:
total_num_scheduled_tokens
],
non_blocking
=
non_blocking
,
)
runner
.
kv_compression_topk_budget
[:
num_reqs
].
copy_
(
runner
.
kv_compression_topk_budget_cpu
[:
num_reqs
],
non_blocking
=
non_blocking
,
)
vllm/v1/worker/gpu_model_runner.py
View file @
9d4330d2
...
@@ -55,8 +55,15 @@ from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
...
@@ -55,8 +55,15 @@ from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
from
vllm.v1.kv_cache_interface
import
(
AttentionSpec
,
FullAttentionSpec
,
from
vllm.v1.kv_cache_interface
import
(
AttentionSpec
,
FullAttentionSpec
,
KVCacheConfig
,
KVCacheSpec
,
MambaSpec
,
KVCacheConfig
,
KVCacheSpec
,
MambaSpec
,
SlidingWindowSpec
)
SlidingWindowSpec
)
from
vllm.v1.kv_compression.budget
import
(
compute_prompt_topk_keep_total
,
from
vllm.v1.kv_compression.runner_buffers
import
init_kv_compression_runner_buffers
compute_topk_budget_step
)
from
vllm.v1.kv_compression.runner_prompt_compaction
import
(
maybe_apply_kv_compression_prompt_compaction
,
stash_kv_compression_prompt_payload_to_requests
,
)
from
vllm.v1.kv_compression.runner_step
import
(
maybe_copy_kv_compression_step_tensors_to_gpu
,
maybe_prepare_kv_compression_for_runner_step
,
)
from
vllm.v1.outputs
import
(
EMPTY_MODEL_RUNNER_OUTPUT
,
LogprobsTensors
,
from
vllm.v1.outputs
import
(
EMPTY_MODEL_RUNNER_OUTPUT
,
LogprobsTensors
,
ModelRunnerOutput
)
ModelRunnerOutput
)
from
vllm.v1.pool.metadata
import
PoolingMetadata
from
vllm.v1.pool.metadata
import
PoolingMetadata
...
@@ -72,12 +79,12 @@ from vllm.v1.utils import bind_kv_cache
...
@@ -72,12 +79,12 @@ from vllm.v1.utils import bind_kv_cache
from
vllm.v1.worker.block_table
import
BlockTable
from
vllm.v1.worker.block_table
import
BlockTable
from
vllm.v1.worker.gpu_input_batch
import
CachedRequestState
,
InputBatch
from
vllm.v1.worker.gpu_input_batch
import
CachedRequestState
,
InputBatch
from
vllm.v1.worker.lora_model_runner_mixin
import
LoRAModelRunnerMixin
from
vllm.v1.worker.lora_model_runner_mixin
import
LoRAModelRunnerMixin
from
vllm.platforms
import
current_platform
from
vllm.two_batch_overlap.v1.model_input_split_v1
import
tbo_split_and_execute_model
from
vllm.two_batch_overlap.v1.model_input_split_v1
import
tbo_split_and_execute_model
from
vllm.profiler.prof
import
profile
from
vllm.profiler.prof
import
profile
from
..sample.logits_processor
import
LogitsProcessorManager
from
..sample.logits_processor
import
LogitsProcessorManager
from
.utils
import
(
gather_mm_placeholders
,
initialize_kv_cache_for_kv_sharing
,
from
.utils
import
(
gather_mm_placeholders
,
initialize_kv_cache_for_kv_sharing
,
sanity_check_mm_encoder_outputs
,
scatter_mm_placeholders
)
sanity_check_mm_encoder_outputs
,
scatter_mm_placeholders
)
from
.slot_mapping_utils
import
fill_block_table_slot_mapping_np
from
vllm.zero_overhead.v1.eagle
import
V1ZeroEagleProposer
from
vllm.zero_overhead.v1.eagle
import
V1ZeroEagleProposer
from
vllm.v1.spec_decode.utils
import
DraftProbs
from
vllm.v1.spec_decode.utils
import
DraftProbs
...
@@ -322,14 +329,15 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -322,14 +329,15 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
device
=
"cpu"
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
)
pin_memory
=
self
.
pin_memory
)
self
.
positions_np
=
self
.
positions_cpu
.
numpy
()
self
.
positions_np
=
self
.
positions_cpu
.
numpy
()
# KV positions are decoupled from logical positions when KV compression
# is enabled. We keep a separate buffer to avoid recomputing or
init_kv_compression_runner_buffers
(
# overwriting `positions_np` (used for RoPE / input token lookup).
runner
=
self
,
self
.
kv_positions_cpu
=
torch
.
zeros
(
self
.
max_num_tokens
,
max_num_tokens
=
self
.
max_num_tokens
,
dtype
=
torch
.
int64
,
max_num_reqs
=
self
.
max_num_reqs
,
device
=
"cpu"
,
device
=
self
.
device
,
pin_memory
=
self
.
pin_memory
)
pin_memory
=
self
.
pin_memory
,
self
.
kv_positions_np
=
self
.
kv_positions_cpu
.
numpy
()
)
self
.
query_start_loc_cpu
=
torch
.
zeros
(
self
.
max_num_reqs
+
1
,
self
.
query_start_loc_cpu
=
torch
.
zeros
(
self
.
max_num_reqs
+
1
,
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
device
=
"cpu"
,
...
@@ -340,77 +348,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -340,77 +348,6 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
device
=
"cpu"
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
)
pin_memory
=
self
.
pin_memory
)
self
.
seq_lens_np
=
self
.
seq_lens_cpu
.
numpy
()
self
.
seq_lens_np
=
self
.
seq_lens_cpu
.
numpy
()
# KV compression metadata buffers (used by the "topk" policy).
# Per-token: whether this scheduled token must be kept in KV cache.
self
.
kv_compression_must_keep_cpu
=
torch
.
zeros
(
self
.
max_num_tokens
,
dtype
=
torch
.
bool
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
,
)
self
.
kv_compression_must_keep_np
=
self
.
kv_compression_must_keep_cpu
.
numpy
()
self
.
kv_compression_must_keep
=
torch
.
zeros
(
self
.
max_num_tokens
,
dtype
=
torch
.
bool
,
device
=
self
.
device
,
)
# Per-request: how many additional prompt tokens to keep among
# non-protected candidates (budget from env; selection uses scores).
self
.
kv_compression_topk_budget_cpu
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
,
)
self
.
kv_compression_topk_budget_np
=
self
.
kv_compression_topk_budget_cpu
.
numpy
()
self
.
kv_compression_topk_budget
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
)
# Chunked-prefill prompt-end KV compression metadata (scheme 3).
# Per-request: whether this step finishes the prompt and should compute
# global prompt indices (score/topk) for a one-shot compaction.
self
.
kv_compression_prompt_end_cpu
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
bool
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
,
)
self
.
kv_compression_prompt_end_np
=
self
.
kv_compression_prompt_end_cpu
.
numpy
()
self
.
kv_compression_prompt_end
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
bool
,
device
=
self
.
device
,
)
# Per-request: prompt length (tokens) and Top-K keep count among prompt
# candidates (excluding protected prefix/suffix).
self
.
kv_compression_prompt_lens_cpu
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
,
)
self
.
kv_compression_prompt_lens_np
=
self
.
kv_compression_prompt_lens_cpu
.
numpy
()
self
.
kv_compression_prompt_lens
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
)
self
.
kv_compression_prompt_topk_keep_cpu
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
"cpu"
,
pin_memory
=
self
.
pin_memory
,
)
self
.
kv_compression_prompt_topk_keep_np
=
self
.
kv_compression_prompt_topk_keep_cpu
.
numpy
()
self
.
kv_compression_prompt_topk_keep
=
torch
.
zeros
(
self
.
max_num_reqs
,
dtype
=
torch
.
int32
,
device
=
self
.
device
,
)
self
.
kv_compression_prompt_topk_keep_max
:
Optional
[
int
]
=
None
# Layer pairings for cross-layer KV sharing.
# Layer pairings for cross-layer KV sharing.
# If an Attention layer `layer_name` is in the keys of this dict, it
# If an Attention layer `layer_name` is in the keys of this dict, it
...
@@ -779,110 +716,17 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -779,110 +716,17 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# from logical positions.
# from logical positions.
use_kv_compression
=
envs
.
VLLM_ENABLE_KV_COMPRESSION
use_kv_compression
=
envs
.
VLLM_ENABLE_KV_COMPRESSION
if
use_kv_compression
:
if
use_kv_compression
:
kv_positions_np
=
self
.
kv_positions_np
[:
total_num_scheduled_tokens
]
kv_positions_np
=
maybe_prepare_kv_compression_for_runner_step
(
np
.
add
(
self
.
input_batch
.
num_kv_tokens_cpu
[
req_indices
],
runner
=
self
,
arange
,
num_reqs
=
num_reqs
,
out
=
kv_positions_np
)
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
num_scheduled_tokens
=
num_scheduled_tokens
,
cu_num_tokens
=
cu_num_tokens
,
req_indices
=
req_indices
,
arange
=
arange
,
)
else
:
else
:
kv_positions_np
=
None
kv_positions_np
=
None
if
use_kv_compression
:
prompt_ratio
=
envs
.
VLLM_KV_COMPRESSION_PROMPT_RATIO
prompt_budget
=
envs
.
VLLM_KV_COMPRESSION_PROMPT_BUDGET
protected_prefix
=
envs
.
VLLM_KV_COMPRESSION_PROTECTED_PREFIX
protected_suffix
=
envs
.
VLLM_KV_COMPRESSION_PROTECTED_SUFFIX
keep_last
=
envs
.
VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN
if
self
.
scheduler_config
.
chunked_prefill_enabled
:
# Scheme 3: with chunked prefill, defer compaction until after
# the full prompt is ingested. Otherwise, the next prefill chunk
# would attend to a truncated history and quality can collapse.
prompt_end_np
=
self
.
kv_compression_prompt_end_np
[:
num_reqs
]
prompt_end_np
.
fill
(
False
)
prompt_lens_np
=
self
.
kv_compression_prompt_lens_np
[:
num_reqs
]
prompt_lens_np
.
fill
(
0
)
topk_keep_np
=
self
.
kv_compression_prompt_topk_keep_np
[:
num_reqs
]
topk_keep_np
.
fill
(
0
)
for
req_idx
in
range
(
num_reqs
):
qlen
=
int
(
num_scheduled_tokens
[
req_idx
])
if
qlen
<=
0
:
continue
base_pos
=
int
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_idx
])
prompt_len
=
int
(
self
.
input_batch
.
num_prompt_tokens
[
req_idx
])
end_pos
=
base_pos
+
qlen
ends_prompt
=
(
base_pos
<
prompt_len
)
and
(
end_pos
>=
prompt_len
)
if
not
ends_prompt
:
continue
prompt_end_np
[
req_idx
]
=
True
prompt_lens_np
[
req_idx
]
=
prompt_len
topk_keep_np
[
req_idx
]
=
compute_prompt_topk_keep_total
(
prompt_len
=
prompt_len
,
protected_prefix
=
protected_prefix
,
protected_suffix
=
protected_suffix
,
keep_last_token
=
keep_last
,
prompt_ratio
=
prompt_ratio
,
prompt_budget
=
prompt_budget
,
)
self
.
kv_compression_prompt_topk_keep_max
=
int
(
topk_keep_np
[:
num_reqs
].
max
())
if
num_reqs
>
0
else
0
self
.
kv_compression_needs_compaction
=
False
else
:
must_keep_np
=
self
.
kv_compression_must_keep_np
[
:
total_num_scheduled_tokens
]
must_keep_np
.
fill
(
False
)
topk_budget_np
=
self
.
kv_compression_topk_budget_np
[:
num_reqs
]
topk_budget_np
.
fill
(
0
)
for
req_idx
in
range
(
num_reqs
):
qlen
=
int
(
num_scheduled_tokens
[
req_idx
])
if
qlen
<=
0
:
continue
start
=
0
if
req_idx
==
0
else
int
(
cu_num_tokens
[
req_idx
-
1
])
end
=
int
(
cu_num_tokens
[
req_idx
])
assert
end
-
start
==
qlen
base_pos
=
int
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_idx
])
prompt_len
=
int
(
self
.
input_batch
.
num_prompt_tokens
[
req_idx
])
end_pos
=
base_pos
+
qlen
pos
=
base_pos
+
np
.
arange
(
qlen
,
dtype
=
np
.
int64
)
prompt_mask
=
pos
<
prompt_len
# Decode tokens are always kept.
must_keep
=
~
prompt_mask
if
np
.
any
(
prompt_mask
):
suffix_start
=
max
(
prompt_len
-
protected_suffix
,
0
)
must_keep
|=
prompt_mask
&
(
pos
<
protected_prefix
)
must_keep
|=
prompt_mask
&
(
pos
>=
suffix_start
)
if
keep_last
:
last
=
prompt_len
-
1
if
base_pos
<=
last
<
end_pos
:
must_keep
[
last
-
base_pos
]
=
True
topk_budget_np
[
req_idx
]
=
compute_topk_budget_step
(
prompt_len
=
prompt_len
,
start_pos
=
base_pos
,
end_pos
=
end_pos
,
protected_prefix
=
protected_prefix
,
protected_suffix
=
protected_suffix
,
keep_last_token
=
keep_last
,
prompt_ratio
=
prompt_ratio
,
prompt_budget
=
prompt_budget
,
)
must_keep_np
[
start
:
end
]
=
must_keep
# Decode-only fast path: if all scheduled tokens are
# unconditionally kept and there is no Top-K budget, KV
# compaction is a no-op and we can skip score/topk/dst entirely
# in the attention backend.
self
.
kv_compression_needs_compaction
=
(
(
not
must_keep_np
.
all
())
or
(
topk_budget_np
>
0
).
any
())
else
:
self
.
kv_compression_needs_compaction
=
False
self
.
kv_compression_needs_compaction
=
False
# Calculate M-RoPE positions.
# Calculate M-RoPE positions.
...
@@ -910,25 +754,32 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -910,25 +754,32 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_table
:
BlockTable
=
self
.
input_batch
.
block_table
[
block_table
:
BlockTable
=
self
.
input_batch
.
block_table
[
kv_cache_group_id
]
kv_cache_group_id
]
slot_positions_np
=
(
kv_positions_np
if
use_kv_compression
:
if
use_kv_compression
else
positions_np
)
fill_block_table_slot_mapping_np
(
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
block_table
=
block_table
,
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
req_indices
=
req_indices
,
# where K is the max_num_blocks_per_req and the block size is 2.
slot_positions_np
=
kv_positions_np
,
# NOTE(woosuk): We can't simply use `token_indices // block_size`
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
# here because M (max_model_len) is not necessarily divisible by
block_size
=
block_size
,
# block_size.
)
block_table_indices
=
(
else
:
req_indices
*
block_table
.
max_num_blocks_per_req
+
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
slot_positions_np
//
block_size
)
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
block_table_cpu
=
block_table
.
get_cpu_tensor
()
# where K is the max_num_blocks_per_req and the block size is 2.
block_numbers
=
block_table_cpu
.
flatten
(
# NOTE(woosuk): We can't simply use `token_indices // block_size`
)[
block_table_indices
].
numpy
()
# here because M (max_model_len) is not necessarily divisible by
block_offsets
=
slot_positions_np
%
block_size
# block_size.
np
.
add
(
block_table_indices
=
(
block_numbers
*
block_size
,
req_indices
*
block_table
.
max_num_blocks_per_req
+
block_offsets
,
positions_np
//
block_size
)
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
])
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_numbers
=
block_table_cpu
.
flatten
(
)[
block_table_indices
].
numpy
()
block_offsets
=
positions_np
%
block_size
np
.
add
(
block_numbers
*
block_size
,
block_offsets
,
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
])
# Prepare the attention metadata.
# Prepare the attention metadata.
self
.
query_start_loc_np
[
0
]
=
0
self
.
query_start_loc_np
[
0
]
=
0
...
@@ -962,28 +813,11 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -962,28 +813,11 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
self
.
seq_lens
[:
num_reqs
].
copy_
(
self
.
seq_lens_cpu
[:
num_reqs
],
self
.
seq_lens
[:
num_reqs
].
copy_
(
self
.
seq_lens_cpu
[:
num_reqs
],
non_blocking
=
True
)
non_blocking
=
True
)
if
use_kv_compression
:
if
use_kv_compression
:
if
self
.
scheduler_config
.
chunked_prefill_enabled
:
maybe_copy_kv_compression_step_tensors_to_gpu
(
self
.
kv_compression_prompt_end
[:
num_reqs
].
copy_
(
runner
=
self
,
self
.
kv_compression_prompt_end_cpu
[:
num_reqs
],
num_reqs
=
num_reqs
,
non_blocking
=
True
,
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
)
)
self
.
kv_compression_prompt_lens
[:
num_reqs
].
copy_
(
self
.
kv_compression_prompt_lens_cpu
[:
num_reqs
],
non_blocking
=
True
,
)
self
.
kv_compression_prompt_topk_keep
[:
num_reqs
].
copy_
(
self
.
kv_compression_prompt_topk_keep_cpu
[:
num_reqs
],
non_blocking
=
True
,
)
elif
self
.
kv_compression_needs_compaction
:
self
.
kv_compression_must_keep
[:
total_num_scheduled_tokens
].
copy_
(
self
.
kv_compression_must_keep_cpu
[:
total_num_scheduled_tokens
],
non_blocking
=
True
,
)
self
.
kv_compression_topk_budget
[:
num_reqs
].
copy_
(
self
.
kv_compression_topk_budget_cpu
[:
num_reqs
],
non_blocking
=
True
,
)
# Fill unused with -1. Needed for reshape_and_cache
# Fill unused with -1. Needed for reshape_and_cache
self
.
seq_lens
[
num_reqs
:].
fill_
(
0
)
self
.
seq_lens
[
num_reqs
:].
fill_
(
0
)
...
@@ -1593,151 +1427,11 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -1593,151 +1427,11 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
def
_stash_kv_compression_prompt_payload
(
self
)
->
None
:
def
_stash_kv_compression_prompt_payload
(
self
)
->
None
:
"""Persist prompt-end compaction indices from the forward context."""
"""Persist prompt-end compaction indices from the forward context."""
if
(
not
envs
.
VLLM_ENABLE_KV_COMPRESSION
stash_kv_compression_prompt_payload_to_requests
(
runner
=
self
)
or
not
self
.
scheduler_config
.
chunked_prefill_enabled
):
return
forward_context
=
get_forward_context
()
payload
=
getattr
(
forward_context
,
"_kv_compression_prompt_payload"
,
None
)
if
payload
is
None
:
return
req_indices
=
payload
.
get
(
"req_indices"
)
idx_sorted
=
payload
.
get
(
"idx_sorted"
)
keep_len
=
payload
.
get
(
"keep_len"
)
prompt_lens
=
payload
.
get
(
"prompt_lens"
)
if
(
req_indices
is
None
or
idx_sorted
is
None
or
keep_len
is
None
or
prompt_lens
is
None
):
return
req_indices_cpu
=
req_indices
.
to
(
device
=
"cpu"
,
dtype
=
torch
.
int64
).
tolist
()
keep_cpu
=
keep_len
.
to
(
device
=
"cpu"
,
dtype
=
torch
.
int64
).
tolist
()
prompt_cpu
=
prompt_lens
.
to
(
device
=
"cpu"
,
dtype
=
torch
.
int64
).
tolist
()
for
i
,
b
in
enumerate
(
req_indices_cpu
):
if
b
<
0
or
b
>=
len
(
self
.
input_batch
.
req_ids
):
continue
req_id
=
self
.
input_batch
.
req_ids
[
b
]
if
req_id
is
None
:
continue
rs
=
self
.
requests
.
get
(
req_id
)
if
rs
is
None
:
continue
rs
.
kv_compression_prompt_idx_sorted
=
idx_sorted
[
i
]
rs
.
kv_compression_prompt_keep_len
=
int
(
keep_cpu
[
i
])
rs
.
kv_compression_prompt_prompt_len
=
int
(
prompt_cpu
[
i
])
def
_maybe_apply_kv_compression_prompt_compaction
(
self
)
->
None
:
def
_maybe_apply_kv_compression_prompt_compaction
(
self
)
->
None
:
"""Apply one-shot prompt KV compaction before the first decode step."""
"""Apply one-shot prompt KV compaction before the first decode step."""
if
(
not
envs
.
VLLM_ENABLE_KV_COMPRESSION
maybe_apply_kv_compression_prompt_compaction
(
runner
=
self
)
or
not
self
.
scheduler_config
.
chunked_prefill_enabled
):
return
pending_req_ids
:
list
[
str
]
=
[]
for
req_id
in
self
.
input_batch
.
req_ids
:
if
req_id
is
None
:
continue
rs
=
self
.
requests
.
get
(
req_id
)
if
rs
is
None
:
continue
if
rs
.
kv_compression_prompt_idx_sorted
is
None
:
continue
# Only apply once the prompt is fully ingested (decode stage).
if
rs
.
num_computed_tokens
<
rs
.
num_prompt_tokens
:
continue
pending_req_ids
.
append
(
req_id
)
if
not
pending_req_ids
:
return
device
=
self
.
device
pending_states
:
list
[
tuple
[
str
,
torch
.
Tensor
,
int
]]
=
[]
for
req_id
in
pending_req_ids
:
rs
=
self
.
requests
[
req_id
]
keep
=
rs
.
kv_compression_prompt_keep_len
idx
=
rs
.
kv_compression_prompt_idx_sorted
if
keep
is
None
or
idx
is
None
:
continue
keep_i
=
int
(
keep
)
if
keep_i
<=
0
:
# No prompt tokens kept; clear and skip.
rs
.
kv_compression_prompt_idx_sorted
=
None
rs
.
kv_compression_prompt_keep_len
=
None
rs
.
kv_compression_prompt_prompt_len
=
None
continue
pending_states
.
append
((
req_id
,
idx
,
keep_i
))
if
not
pending_states
:
return
B
=
len
(
pending_states
)
keep_list
=
[
k
for
_
,
_
,
k
in
pending_states
]
K_max
=
max
(
keep_list
)
idx_batch
=
torch
.
zeros
((
B
,
K_max
),
device
=
device
,
dtype
=
torch
.
int32
)
for
i
,
(
_
,
row
,
k
)
in
enumerate
(
pending_states
):
idx_batch
[
i
,
:
k
]
=
row
[:
k
].
to
(
device
=
device
,
dtype
=
torch
.
int32
)
keep_tensor
=
torch
.
tensor
(
keep_list
,
device
=
device
,
dtype
=
torch
.
int32
)
from
vllm.v1.attention.kv_compression.kv_cache_triton
import
(
front_compact_inplace_fa_triton
,
make_fa_cache_view
)
# Apply compaction to every attention layer's KV cache in-place.
for
group_id
,
kv_cache_group_spec
in
enumerate
(
self
.
kv_cache_config
.
kv_cache_groups
):
max_blocks
=
0
for
req_id
,
_
,
_
in
pending_states
:
rs
=
self
.
requests
[
req_id
]
if
group_id
>=
len
(
rs
.
block_ids
):
continue
max_blocks
=
max
(
max_blocks
,
len
(
rs
.
block_ids
[
group_id
]))
if
max_blocks
==
0
:
continue
block_table_cpu
=
torch
.
zeros
((
B
,
max_blocks
),
dtype
=
torch
.
int32
,
device
=
"cpu"
)
for
i
,
(
req_id
,
_
,
_
)
in
enumerate
(
pending_states
):
rs
=
self
.
requests
[
req_id
]
if
group_id
>=
len
(
rs
.
block_ids
):
continue
ids
=
rs
.
block_ids
[
group_id
]
if
ids
:
block_table_cpu
[
i
,
:
len
(
ids
)]
=
torch
.
tensor
(
ids
,
dtype
=
torch
.
int32
,
device
=
"cpu"
)
block_table
=
block_table_cpu
.
to
(
device
=
device
,
non_blocking
=
True
)
for
layer_name
in
kv_cache_group_spec
.
layer_names
:
layer_index
=
self
.
_extract_layer_index
(
layer_name
)
if
layer_index
>=
len
(
self
.
kv_caches
):
continue
kv_cache
=
self
.
kv_caches
[
layer_index
]
if
not
current_platform
.
is_rocm
():
if
not
isinstance
(
kv_cache
,
torch
.
Tensor
):
continue
key_cache
,
value_cache
=
kv_cache
.
unbind
(
0
)
else
:
if
(
not
isinstance
(
kv_cache
,
(
tuple
,
list
))
or
len
(
kv_cache
)
!=
2
):
continue
key_cache
,
value_cache
=
kv_cache
k_view
,
v_view
=
make_fa_cache_view
(
key_cache
=
key_cache
,
value_cache
=
value_cache
)
front_compact_inplace_fa_triton
(
k_view
,
v_view
,
block_table
,
idx_batch
,
keep_tensor
,
)
# Clear pending state after successful compaction.
for
req_id
,
_
,
_
in
pending_states
:
rs
=
self
.
requests
.
get
(
req_id
)
if
rs
is
None
:
continue
rs
.
kv_compression_prompt_idx_sorted
=
None
rs
.
kv_compression_prompt_keep_len
=
None
rs
.
kv_compression_prompt_prompt_len
=
None
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
execute_model
(
def
execute_model
(
...
@@ -1839,19 +1533,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
...
@@ -1839,19 +1533,7 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# the first decode step writes/reads KV at the compressed positions.
# the first decode step writes/reads KV at the compressed positions.
self
.
_maybe_apply_kv_compression_prompt_compaction
()
self
.
_maybe_apply_kv_compression_prompt_compaction
()
use_tbo
=
(
envs
.
VLLM_ENABLE_TBO
and
scheduler_output
.
total_num_scheduled_tokens
if
envs
.
VLLM_ENABLE_TBO
and
scheduler_output
.
total_num_scheduled_tokens
>=
envs
.
VLLM_TBO_MIN_TOKENS
:
>=
envs
.
VLLM_TBO_MIN_TOKENS
)
if
(
use_tbo
and
envs
.
VLLM_ENABLE_KV_COMPRESSION
and
self
.
scheduler_config
.
chunked_prefill_enabled
):
# NOTE: the TBO path does not call `_stash_kv_compression_prompt_payload`
# inside its `set_forward_context`, so scheme-3 prompt-end payloads
# would be dropped and the next-step compaction would never run.
logger
.
warning_once
(
"TBO is currently incompatible with chunked prefill KV "
"compression (scheme 3); running without TBO."
)
use_tbo
=
False
if
use_tbo
:
model_output
,
finished_sending
,
finished_recving
=
\
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
num_tokens_across_dp
,
input_ids
,
positions
,
...
@@ -3476,119 +3158,22 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
...
@@ -3476,119 +3158,22 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
np
.
add
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_indices
],
np
.
add
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_indices
],
arange
,
arange
,
out
=
positions_np
)
out
=
positions_np
)
# KV positions (where the KV for each scheduled token is temporarily
# KV positions (where the KV for each scheduled token is temporarily
# written). When KV compression is enabled, KV positions are decoupled
# written). When KV compression is enabled, KV positions are decoupled
# from logical positions.
# from logical positions.
use_kv_compression
=
envs
.
VLLM_ENABLE_KV_COMPRESSION
use_kv_compression
=
envs
.
VLLM_ENABLE_KV_COMPRESSION
if
use_kv_compression
:
if
use_kv_compression
:
kv_positions_np
=
self
.
kv_positions_np
[:
total_num_scheduled_tokens
]
kv_positions_np
=
maybe_prepare_kv_compression_for_runner_step
(
np
.
add
(
self
.
input_batch
.
num_kv_tokens_cpu
[
req_indices
],
runner
=
self
,
arange
,
num_reqs
=
num_reqs
,
out
=
kv_positions_np
)
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
num_scheduled_tokens
=
num_scheduled_tokens
,
cu_num_tokens
=
cu_num_tokens
,
req_indices
=
req_indices
,
arange
=
arange
,
)
else
:
else
:
kv_positions_np
=
None
kv_positions_np
=
None
if
use_kv_compression
:
prompt_ratio
=
envs
.
VLLM_KV_COMPRESSION_PROMPT_RATIO
prompt_budget
=
envs
.
VLLM_KV_COMPRESSION_PROMPT_BUDGET
protected_prefix
=
envs
.
VLLM_KV_COMPRESSION_PROTECTED_PREFIX
protected_suffix
=
envs
.
VLLM_KV_COMPRESSION_PROTECTED_SUFFIX
keep_last
=
envs
.
VLLM_KV_COMPRESSION_KEEP_LAST_TOKEN
if
self
.
scheduler_config
.
chunked_prefill_enabled
:
# Scheme 3: with chunked prefill, defer compaction until after
# the full prompt is ingested. Otherwise, the next prefill chunk
# would attend to a truncated history and quality can collapse.
prompt_end_np
=
self
.
kv_compression_prompt_end_np
[:
num_reqs
]
prompt_end_np
.
fill
(
False
)
prompt_lens_np
=
self
.
kv_compression_prompt_lens_np
[:
num_reqs
]
prompt_lens_np
.
fill
(
0
)
topk_keep_np
=
self
.
kv_compression_prompt_topk_keep_np
[:
num_reqs
]
topk_keep_np
.
fill
(
0
)
for
req_idx
in
range
(
num_reqs
):
qlen
=
int
(
num_scheduled_tokens
[
req_idx
])
if
qlen
<=
0
:
continue
base_pos
=
int
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_idx
])
prompt_len
=
int
(
self
.
input_batch
.
num_prompt_tokens
[
req_idx
])
end_pos
=
base_pos
+
qlen
ends_prompt
=
(
base_pos
<
prompt_len
)
and
(
end_pos
>=
prompt_len
)
if
not
ends_prompt
:
continue
prompt_end_np
[
req_idx
]
=
True
prompt_lens_np
[
req_idx
]
=
prompt_len
topk_keep_np
[
req_idx
]
=
compute_prompt_topk_keep_total
(
prompt_len
=
prompt_len
,
protected_prefix
=
protected_prefix
,
protected_suffix
=
protected_suffix
,
keep_last_token
=
keep_last
,
prompt_ratio
=
prompt_ratio
,
prompt_budget
=
prompt_budget
,
)
self
.
kv_compression_prompt_topk_keep_max
=
int
(
topk_keep_np
[:
num_reqs
].
max
())
if
num_reqs
>
0
else
0
self
.
kv_compression_needs_compaction
=
False
else
:
must_keep_np
=
self
.
kv_compression_must_keep_np
[
:
total_num_scheduled_tokens
]
must_keep_np
.
fill
(
False
)
topk_budget_np
=
self
.
kv_compression_topk_budget_np
[:
num_reqs
]
topk_budget_np
.
fill
(
0
)
for
req_idx
in
range
(
num_reqs
):
qlen
=
int
(
num_scheduled_tokens
[
req_idx
])
if
qlen
<=
0
:
continue
start
=
0
if
req_idx
==
0
else
int
(
cu_num_tokens
[
req_idx
-
1
])
end
=
int
(
cu_num_tokens
[
req_idx
])
assert
end
-
start
==
qlen
base_pos
=
int
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_idx
])
prompt_len
=
int
(
self
.
input_batch
.
num_prompt_tokens
[
req_idx
])
end_pos
=
base_pos
+
qlen
pos
=
base_pos
+
np
.
arange
(
qlen
,
dtype
=
np
.
int64
)
prompt_mask
=
pos
<
prompt_len
# Decode tokens are always kept.
must_keep
=
~
prompt_mask
if
np
.
any
(
prompt_mask
):
suffix_start
=
max
(
prompt_len
-
protected_suffix
,
0
)
must_keep
|=
prompt_mask
&
(
pos
<
protected_prefix
)
must_keep
|=
prompt_mask
&
(
pos
>=
suffix_start
)
if
keep_last
:
last
=
prompt_len
-
1
if
base_pos
<=
last
<
end_pos
:
must_keep
[
last
-
base_pos
]
=
True
topk_budget_np
[
req_idx
]
=
compute_topk_budget_step
(
prompt_len
=
prompt_len
,
start_pos
=
base_pos
,
end_pos
=
end_pos
,
protected_prefix
=
protected_prefix
,
protected_suffix
=
protected_suffix
,
keep_last_token
=
keep_last
,
prompt_ratio
=
prompt_ratio
,
prompt_budget
=
prompt_budget
,
)
must_keep_np
[
start
:
end
]
=
must_keep
# Decode-only fast path: if all scheduled tokens are
# unconditionally kept and there is no Top-K budget, KV
# compaction is a no-op and we can skip score/topk/dst entirely
# in the attention backend.
self
.
kv_compression_needs_compaction
=
(
(
not
must_keep_np
.
all
())
or
(
topk_budget_np
>
0
).
any
())
else
:
self
.
kv_compression_needs_compaction
=
False
self
.
kv_compression_needs_compaction
=
False
# Calculate M-RoPE positions.
# Calculate M-RoPE positions.
...
@@ -3610,32 +3195,38 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
...
@@ -3610,32 +3195,38 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
0
,
0
,
torch
.
from_numpy
(
token_indices
),
torch
.
from_numpy
(
token_indices
),
out
=
self
.
input_ids_cpu
[:
total_num_scheduled_tokens
])
out
=
self
.
input_ids_cpu
[:
total_num_scheduled_tokens
])
# Calculate the slot mapping for each KV cache group.
# Calculate the slot mapping for each KV cache group.
for
kv_cache_group_id
,
kv_cache_group_spec
in
enumerate
(
for
kv_cache_group_id
,
kv_cache_group_spec
in
enumerate
(
self
.
kv_cache_config
.
kv_cache_groups
):
self
.
kv_cache_config
.
kv_cache_groups
):
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_table
:
BlockTable
=
self
.
input_batch
.
block_table
[
block_table
:
BlockTable
=
self
.
input_batch
.
block_table
[
kv_cache_group_id
]
kv_cache_group_id
]
slot_positions_np
=
(
kv_positions_np
if
use_kv_compression
:
if
use_kv_compression
else
positions_np
)
fill_block_table_slot_mapping_np
(
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
block_table
=
block_table
,
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
req_indices
=
req_indices
,
# where K is the max_num_blocks_per_req and the block size is 2.
slot_positions_np
=
kv_positions_np
,
# NOTE(woosuk): We can't simply use `token_indices // block_size`
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
# here because M (max_model_len) is not necessarily divisible by
block_size
=
block_size
,
# block_size.
)
block_table_indices
=
(
else
:
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
# where K is the max_num_blocks_per_req and the block size is 2.
# NOTE(woosuk): We can't simply use `token_indices // block_size`
# here because M (max_model_len) is not necessarily divisible by
# block_size.
block_table_indices
=
(
req_indices
*
block_table
.
max_num_blocks_per_req
+
req_indices
*
block_table
.
max_num_blocks_per_req
+
slot_
positions_np
//
block_size
)
positions_np
//
block_size
)
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_numbers
=
block_table_cpu
.
flatten
(
block_numbers
=
block_table_cpu
.
flatten
(
)[
block_table_indices
].
numpy
()
)[
block_table_indices
].
numpy
()
block_offsets
=
slot_
positions_np
%
block_size
block_offsets
=
positions_np
%
block_size
np
.
add
(
np
.
add
(
block_numbers
*
block_size
,
block_numbers
*
block_size
,
block_offsets
,
block_offsets
,
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
])
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
])
# Prepare the attention metadata.
# Prepare the attention metadata.
self
.
query_start_loc_np
[
0
]
=
0
self
.
query_start_loc_np
[
0
]
=
0
...
@@ -3669,28 +3260,11 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
...
@@ -3669,28 +3260,11 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
self
.
seq_lens
[:
num_reqs
].
copy_
(
self
.
seq_lens_cpu
[:
num_reqs
],
self
.
seq_lens
[:
num_reqs
].
copy_
(
self
.
seq_lens_cpu
[:
num_reqs
],
non_blocking
=
True
)
non_blocking
=
True
)
if
use_kv_compression
:
if
use_kv_compression
:
if
self
.
scheduler_config
.
chunked_prefill_enabled
:
maybe_copy_kv_compression_step_tensors_to_gpu
(
self
.
kv_compression_prompt_end
[:
num_reqs
].
copy_
(
runner
=
self
,
self
.
kv_compression_prompt_end_cpu
[:
num_reqs
],
num_reqs
=
num_reqs
,
non_blocking
=
True
,
total_num_scheduled_tokens
=
total_num_scheduled_tokens
,
)
)
self
.
kv_compression_prompt_lens
[:
num_reqs
].
copy_
(
self
.
kv_compression_prompt_lens_cpu
[:
num_reqs
],
non_blocking
=
True
,
)
self
.
kv_compression_prompt_topk_keep
[:
num_reqs
].
copy_
(
self
.
kv_compression_prompt_topk_keep_cpu
[:
num_reqs
],
non_blocking
=
True
,
)
elif
self
.
kv_compression_needs_compaction
:
self
.
kv_compression_must_keep
[:
total_num_scheduled_tokens
].
copy_
(
self
.
kv_compression_must_keep_cpu
[:
total_num_scheduled_tokens
],
non_blocking
=
True
,
)
self
.
kv_compression_topk_budget
[:
num_reqs
].
copy_
(
self
.
kv_compression_topk_budget_cpu
[:
num_reqs
],
non_blocking
=
True
,
)
# Fill unused with -1. Needed for reshape_and_cache
# Fill unused with -1. Needed for reshape_and_cache
self
.
seq_lens
[
num_reqs
:].
fill_
(
0
)
self
.
seq_lens
[
num_reqs
:].
fill_
(
0
)
...
@@ -3877,16 +3451,7 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
...
@@ -3877,16 +3451,7 @@ class GPUModelRunnerMTP(GPUModelRunnerBase):
# the first decode step writes/reads KV at the compressed positions.
# the first decode step writes/reads KV at the compressed positions.
self
.
_maybe_apply_kv_compression_prompt_compaction
()
self
.
_maybe_apply_kv_compression_prompt_compaction
()
use_tbo
=
(
envs
.
VLLM_ENABLE_TBO
and
scheduler_output
.
total_num_scheduled_tokens
if
envs
.
VLLM_ENABLE_TBO
and
scheduler_output
.
total_num_scheduled_tokens
>=
envs
.
VLLM_TBO_MIN_TOKENS
:
>=
envs
.
VLLM_TBO_MIN_TOKENS
)
if
(
use_tbo
and
envs
.
VLLM_ENABLE_KV_COMPRESSION
and
self
.
scheduler_config
.
chunked_prefill_enabled
):
logger
.
warning_once
(
"TBO is currently incompatible with chunked prefill KV "
"compression (scheme 3); running without TBO."
)
use_tbo
=
False
if
use_tbo
:
model_output
,
finished_sending
,
finished_recving
=
\
model_output
,
finished_sending
,
finished_recving
=
\
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
tbo_split_and_execute_model
(
self
,
attn_metadata
,
num_input_tokens
,
num_tokens_across_dp
,
input_ids
,
positions
,
num_tokens_across_dp
,
input_ids
,
positions
,
...
...
vllm/v1/worker/slot_mapping_utils.py
0 → 100644
View file @
9d4330d2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
__future__
import
annotations
import
numpy
as
np
from
vllm.v1.worker.block_table
import
BlockTable
def
fill_block_table_slot_mapping_np
(
*
,
block_table
:
BlockTable
,
req_indices
:
np
.
ndarray
,
# [T] int64
slot_positions_np
:
np
.
ndarray
,
# [T] int64
total_num_scheduled_tokens
:
int
,
block_size
:
int
,
)
->
None
:
"""Fill `block_table.slot_mapping_np` for a packed batch of tokens."""
block_table_indices
=
(
req_indices
*
block_table
.
max_num_blocks_per_req
+
slot_positions_np
//
int
(
block_size
))
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_numbers
=
block_table_cpu
.
flatten
()[
block_table_indices
].
numpy
()
block_offsets
=
slot_positions_np
%
int
(
block_size
)
np
.
add
(
block_numbers
*
int
(
block_size
),
block_offsets
,
out
=
block_table
.
slot_mapping_np
[:
total_num_scheduled_tokens
],
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment