Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
66652e80
Unverified
Commit
66652e80
authored
Jan 14, 2026
by
Lumosis
Committed by
GitHub
Jan 14, 2026
Browse files
[BugFix] Assign page_size_padded when unifying kv cache spec. (#32283)
Signed-off-by:
Lihao Ran
<
imlihao.ran@gmail.com
>
parent
e27078ea
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
99 additions
and
1 deletion
+99
-1
tests/v1/core/test_kv_cache_utils.py
tests/v1/core/test_kv_cache_utils.py
+97
-1
vllm/v1/core/kv_cache_utils.py
vllm/v1/core/kv_cache_utils.py
+2
-0
No files found.
tests/v1/core/test_kv_cache_utils.py
View file @
66652e80
...
@@ -36,6 +36,7 @@ from vllm.v1.core.kv_cache_utils import (
...
@@ -36,6 +36,7 @@ from vllm.v1.core.kv_cache_utils import (
tensor_data
,
tensor_data
,
)
)
from
vllm.v1.kv_cache_interface
import
(
from
vllm.v1.kv_cache_interface
import
(
ChunkedLocalAttentionSpec
,
FullAttentionSpec
,
FullAttentionSpec
,
KVCacheConfig
,
KVCacheConfig
,
KVCacheGroupSpec
,
KVCacheGroupSpec
,
...
@@ -102,29 +103,57 @@ def new_kv_cache_spec(
...
@@ -102,29 +103,57 @@ def new_kv_cache_spec(
num_kv_heads
=
2
,
num_kv_heads
=
2
,
head_size
=
64
,
head_size
=
64
,
dtype
=
torch
.
float32
,
dtype
=
torch
.
float32
,
page_size_padded
=
None
,
sliding_window
=
None
,
sliding_window
=
None
,
attention_chunk_size
=
None
,
):
):
return
FullAttentionSpec
(
return
FullAttentionSpec
(
block_size
=
block_size
,
block_size
=
block_size
,
num_kv_heads
=
num_kv_heads
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
head_size
=
head_size
,
dtype
=
dtype
,
dtype
=
dtype
,
page_size_padded
=
page_size_padded
,
sliding_window
=
sliding_window
,
sliding_window
=
sliding_window
,
attention_chunk_size
=
attention_chunk_size
,
)
)
def
new_sliding_window_spec
(
def
new_sliding_window_spec
(
block_size
=
16
,
num_kv_heads
=
2
,
head_size
=
64
,
dtype
=
torch
.
float32
,
sliding_window
=
1
block_size
=
16
,
num_kv_heads
=
2
,
head_size
=
64
,
dtype
=
torch
.
float32
,
page_size_padded
=
None
,
sliding_window
=
1
,
):
):
return
SlidingWindowSpec
(
return
SlidingWindowSpec
(
block_size
=
block_size
,
block_size
=
block_size
,
num_kv_heads
=
num_kv_heads
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
head_size
=
head_size
,
dtype
=
dtype
,
dtype
=
dtype
,
page_size_padded
=
page_size_padded
,
sliding_window
=
sliding_window
,
sliding_window
=
sliding_window
,
)
)
def
new_chunked_local_attention_spec
(
block_size
=
16
,
num_kv_heads
=
2
,
head_size
=
64
,
dtype
=
torch
.
float32
,
page_size_padded
=
None
,
attention_chunk_size
=
4
,
):
return
ChunkedLocalAttentionSpec
(
block_size
=
block_size
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
dtype
=
dtype
,
page_size_padded
=
page_size_padded
,
attention_chunk_size
=
attention_chunk_size
,
)
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
sha256_cbor
])
@
pytest
.
mark
.
parametrize
(
"hash_fn"
,
[
sha256
,
sha256_cbor
])
def
test_none_hash
(
monkeypatch
,
hash_fn
):
def
test_none_hash
(
monkeypatch
,
hash_fn
):
import
vllm.v1.core.kv_cache_utils
import
vllm.v1.core.kv_cache_utils
...
@@ -1855,3 +1884,70 @@ def test_auto_fit_max_model_len_not_triggered():
...
@@ -1855,3 +1884,70 @@ def test_auto_fit_max_model_len_not_triggered():
vllm_config
,
[
kv_cache_specs
],
[
mem_per_block_per_layer
*
2
*
32
]
vllm_config
,
[
kv_cache_specs
],
[
mem_per_block_per_layer
*
2
*
32
]
)
)
assert
vllm_config
.
model_config
.
max_model_len
==
16
assert
vllm_config
.
model_config
.
max_model_len
==
16
def
test_unify_hybrid_kv_cache_specs
():
# 1. has_full_attention and has_sliding_window
before_spec_1
=
new_kv_cache_spec
()
before_spec_2
=
new_sliding_window_spec
(
page_size_padded
=
32
*
1024
,
sliding_window
=
1024
)
kv_cache_spec
=
{
"layer_1"
:
before_spec_1
,
"layer_2"
:
before_spec_2
,
}
kv_cache_utils
.
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
expected_spec_1
=
new_kv_cache_spec
()
expected_spec_2
=
new_kv_cache_spec
(
page_size_padded
=
32
*
1024
,
sliding_window
=
1024
)
assert
kv_cache_spec
[
"layer_1"
]
==
expected_spec_1
assert
kv_cache_spec
[
"layer_2"
]
==
expected_spec_2
# 2. has_full_attention and has_chunked_local_attention
before_spec_1
=
new_kv_cache_spec
()
before_spec_2
=
new_chunked_local_attention_spec
(
page_size_padded
=
32
*
1024
,
attention_chunk_size
=
512
)
kv_cache_spec
=
{
"layer_1"
:
before_spec_1
,
"layer_2"
:
before_spec_2
,
}
kv_cache_utils
.
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
expected_spec_1
=
new_kv_cache_spec
()
expected_spec_2
=
new_kv_cache_spec
(
page_size_padded
=
32
*
1024
,
attention_chunk_size
=
512
)
assert
kv_cache_spec
[
"layer_1"
]
==
expected_spec_1
assert
kv_cache_spec
[
"layer_2"
]
==
expected_spec_2
# 3. has_full_attention, has_sliding_window and has_chunked_local_attention
before_spec_1
=
new_kv_cache_spec
()
before_spec_2
=
new_sliding_window_spec
(
page_size_padded
=
32
*
1024
,
sliding_window
=
1024
)
before_spec_3
=
new_chunked_local_attention_spec
(
page_size_padded
=
32
*
1024
,
attention_chunk_size
=
512
)
kv_cache_spec
=
{
"layer_1"
:
before_spec_1
,
"layer_2"
:
before_spec_2
,
"layer_3"
:
before_spec_3
,
}
kv_cache_utils
.
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
expected_spec_1
=
new_kv_cache_spec
()
expected_spec_2
=
new_kv_cache_spec
(
page_size_padded
=
32
*
1024
,
sliding_window
=
1024
)
expected_spec_3
=
new_kv_cache_spec
(
page_size_padded
=
32
*
1024
,
attention_chunk_size
=
512
)
assert
kv_cache_spec
[
"layer_1"
]
==
expected_spec_1
assert
kv_cache_spec
[
"layer_2"
]
==
expected_spec_2
assert
kv_cache_spec
[
"layer_3"
]
==
expected_spec_3
# 4. No FullAttentionSpec, should not convert
kv_cache_spec
=
{
"layer_1"
:
new_sliding_window_spec
(
sliding_window
=
1024
),
"layer_2"
:
new_chunked_local_attention_spec
(
attention_chunk_size
=
512
),
}
with
pytest
.
raises
(
ValueError
):
kv_cache_utils
.
unify_hybrid_kv_cache_specs
(
kv_cache_spec
)
vllm/v1/core/kv_cache_utils.py
View file @
66652e80
...
@@ -1185,6 +1185,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
...
@@ -1185,6 +1185,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
head_size
=
spec
.
head_size
,
head_size
=
spec
.
head_size
,
dtype
=
spec
.
dtype
,
dtype
=
spec
.
dtype
,
sliding_window
=
spec
.
sliding_window
,
sliding_window
=
spec
.
sliding_window
,
page_size_padded
=
spec
.
page_size_padded
,
)
)
elif
isinstance
(
spec
,
ChunkedLocalAttentionSpec
):
elif
isinstance
(
spec
,
ChunkedLocalAttentionSpec
):
kv_cache_spec
[
layer_name
]
=
FullAttentionSpec
(
kv_cache_spec
[
layer_name
]
=
FullAttentionSpec
(
...
@@ -1193,6 +1194,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
...
@@ -1193,6 +1194,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
head_size
=
spec
.
head_size
,
head_size
=
spec
.
head_size
,
dtype
=
spec
.
dtype
,
dtype
=
spec
.
dtype
,
attention_chunk_size
=
spec
.
attention_chunk_size
,
attention_chunk_size
=
spec
.
attention_chunk_size
,
page_size_padded
=
spec
.
page_size_padded
,
)
)
if
not
(
if
not
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment