Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2df94aa9
"tests/entrypoints/pooling/openai/test_embedding.py" did not exist on "ed5ae4aaceacc28461f7e324c8f26be6fadd59df"
Commit
2df94aa9
authored
Jan 22, 2026
by
laibao
Browse files
feat: kvpress runner 侧按 num_kv_tokens 计算 KV 写入位置
parent
ad069e33
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
43 additions
and
5 deletions
+43
-5
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+43
-5
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
2df94aa9
...
...
@@ -146,6 +146,13 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
self
.
attention_chunk_size
=
model_config
.
attention_chunk_size
self
.
cascade_attn_enabled
=
not
self
.
model_config
.
disable_cascade_attn
if
envs
.
VLLM_ENABLE_KV_COMPRESSION
:
# KV compression changes the effective KV sequence layout and
# invalidates cascade attention assumptions (common-prefix blocks).
self
.
cascade_attn_enabled
=
False
# Whether the current step needs KV compaction work (score/topk/dst).
# This is set per-step in `_prepare_inputs`.
self
.
kv_compression_needs_compaction
:
bool
=
False
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
...
...
@@ -673,6 +680,17 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
np
.
add
(
self
.
input_batch
.
num_computed_tokens_cpu
[
req_indices
],
arange
,
out
=
positions_np
)
# KV positions (where the KV for each scheduled token is temporarily
# written). When KV compression is enabled, KV positions are decoupled
# from logical positions.
use_kv_compression
=
envs
.
VLLM_ENABLE_KV_COMPRESSION
if
use_kv_compression
:
kv_positions_np
=
self
.
kv_positions_np
[:
total_num_scheduled_tokens
]
np
.
add
(
self
.
input_batch
.
num_kv_tokens_cpu
[
req_indices
],
arange
,
out
=
kv_positions_np
)
else
:
kv_positions_np
=
None
# Calculate M-RoPE positions.
# Only relevant for models using M-RoPE (e.g., Qwen2-VL)
...
...
@@ -700,6 +718,8 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
block_size
=
kv_cache_group_spec
.
kv_cache_spec
.
block_size
block_table
:
BlockTable
=
self
.
input_batch
.
block_table
[
kv_cache_group_id
]
slot_positions_np
=
(
kv_positions_np
if
use_kv_compression
else
positions_np
)
# E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
# -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
# where K is the max_num_blocks_per_req and the block size is 2.
...
...
@@ -708,11 +728,11 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
# block_size.
block_table_indices
=
(
req_indices
*
block_table
.
max_num_blocks_per_req
+
positions_np
//
block_size
)
slot_
positions_np
//
block_size
)
block_table_cpu
=
block_table
.
get_cpu_tensor
()
block_numbers
=
block_table_cpu
.
flatten
(
)[
block_table_indices
].
numpy
()
block_offsets
=
positions_np
%
block_size
block_offsets
=
slot_
positions_np
%
block_size
np
.
add
(
block_numbers
*
block_size
,
block_offsets
,
...
...
@@ -722,6 +742,11 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
self
.
query_start_loc_np
[
0
]
=
0
self
.
query_start_loc_np
[
1
:
num_reqs
+
1
]
=
cu_num_tokens
if
use_kv_compression
:
self
.
seq_lens_np
[:
num_reqs
]
=
(
self
.
input_batch
.
num_kv_tokens_cpu
[:
num_reqs
]
+
num_scheduled_tokens
)
else
:
self
.
seq_lens_np
[:
num_reqs
]
=
(
self
.
input_batch
.
num_computed_tokens_cpu
[:
num_reqs
]
+
num_scheduled_tokens
)
...
...
@@ -2547,6 +2572,10 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
assert
len
(
self
.
attn_backends
)
==
0
and
len
(
self
.
attn_metadata_builders
)
==
0
,
"Attention backends are already initialized"
if
envs
.
VLLM_ENABLE_KV_COMPRESSION
and
self
.
full_cuda_graph
:
raise
ValueError
(
"KV compression is currently incompatible with full CUDA "
"graph mode."
)
for
i
,
kv_cache_group_spec
in
enumerate
(
kv_cache_config
.
kv_cache_groups
):
kv_cache_spec
=
kv_cache_group_spec
.
kv_cache_spec
...
...
@@ -2570,7 +2599,16 @@ class GPUModelRunnerBase(LoRAModelRunnerMixin):
raise
NotImplementedError
(
"Non-Attention backend is not supported by V1 "
"GPUModelRunner."
)
if
(
envs
.
VLLM_ENABLE_KV_COMPRESSION
and
attn_backend_i
.
get_name
()
!=
"FLASH_ATTN_VLLM_V1"
):
raise
ValueError
(
"KV compression currently requires "
"VLLM_ATTENTION_BACKEND=FLASH_ATTN_VLLM_V1."
)
elif
isinstance
(
kv_cache_spec
,
MambaSpec
):
if
envs
.
VLLM_ENABLE_KV_COMPRESSION
:
raise
ValueError
(
"KV compression is currently only supported for "
"Transformer attention layers."
)
attn_backend_i
=
Mamba2AttentionBackend
else
:
raise
ValueError
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment