Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
235e1f93
Unverified
Commit
235e1f93
authored
Apr 15, 2026
by
Or Ozeri
Committed by
GitHub
Apr 15, 2026
Browse files
[kv_offload+HMA][3/N]: Remove block_size from KVEvents (#36644)
Signed-off-by:
Or Ozeri
<
oro@il.ibm.com
>
parent
431cea3e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
10 additions
and
38 deletions
+10
-38
tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py
.../kv_connector/unit/offloading_connector/test_scheduler.py
+3
-7
tests/v1/kv_offload/test_cpu_manager.py
tests/v1/kv_offload/test_cpu_manager.py
+6
-20
vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
...buted/kv_transfer/kv_connector/v1/offloading/scheduler.py
+1
-1
vllm/v1/kv_offload/abstract.py
vllm/v1/kv_offload/abstract.py
+0
-1
vllm/v1/kv_offload/cpu/manager.py
vllm/v1/kv_offload/cpu/manager.py
+0
-4
vllm/v1/kv_offload/cpu/spec.py
vllm/v1/kv_offload/cpu/spec.py
+0
-5
No files found.
tests/v1/kv_connector/unit/offloading_connector/test_scheduler.py
View file @
235e1f93
...
...
@@ -124,12 +124,8 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
return
[
BlockHash
(
str
(
i
).
encode
())
for
i
in
int_hashes
]
def
take_events
()
->
Iterable
[
OffloadingEvent
]:
yield
OffloadingEvent
(
keys
=
to_keys
([
1
,
2
,
3
]),
block_size
=
16
,
medium
=
"A"
,
removed
=
False
)
yield
OffloadingEvent
(
keys
=
to_keys
([
4
,
5
,
6
]),
block_size
=
32
,
medium
=
"B"
,
removed
=
True
)
yield
OffloadingEvent
(
keys
=
to_keys
([
1
,
2
,
3
]),
medium
=
"A"
,
removed
=
False
)
yield
OffloadingEvent
(
keys
=
to_keys
([
4
,
5
,
6
]),
medium
=
"B"
,
removed
=
True
)
runner
.
manager
.
take_events
.
side_effect
=
take_events
events
=
list
(
runner
.
scheduler_connector
.
take_events
())
...
...
@@ -137,7 +133,7 @@ def test_offloading_connector(request_runner, async_scheduling: bool):
event
=
events
[
0
]
assert
isinstance
(
event
,
BlockStored
)
assert
event
.
block_hashes
==
to_hashes
([
1
,
2
,
3
])
assert
event
.
block_size
==
16
assert
event
.
block_size
==
0
assert
event
.
medium
==
"A"
assert
event
.
token_ids
==
[]
assert
event
.
parent_block_hash
is
None
...
...
tests/v1/kv_offload/test_cpu_manager.py
View file @
235e1f93
...
...
@@ -59,7 +59,6 @@ def verify_load_output(
def
verify_events
(
events
:
Iterable
[
OffloadingEvent
],
block_size
:
int
,
expected_stores
:
tuple
[
set
[
int
],
...]
=
(),
expected_evictions
:
tuple
[
set
[
int
],
...]
=
(),
):
...
...
@@ -67,7 +66,6 @@ def verify_events(
evictions
:
list
[
set
[
OffloadKey
]]
=
[]
for
event
in
events
:
assert
event
.
medium
==
CPULoadStoreSpec
.
medium
()
assert
event
.
block_size
==
block_size
if
event
.
removed
:
evictions
.
append
(
set
(
event
.
keys
))
else
:
...
...
@@ -98,9 +96,7 @@ def test_already_stored_block_not_evicted_during_prepare_store(eviction_policy):
candidate to make room for [3, 4, 5]
- After complete_store([2, 3, 4, 5]), block 2 must still be present.
"""
block_size
=
256
manager
=
CPUOffloadingManager
(
block_size
=
block_size
,
num_blocks
=
4
,
cache_policy
=
eviction_policy
,
enable_events
=
True
,
...
...
@@ -138,10 +134,9 @@ def test_cpu_manager():
"""
Tests CPUOffloadingManager with lru policy.
"""
# initialize a CPU backend with a capacity of 4 blocks
block_size
=
256
# initialize a CPU manager with a capacity of 4 blocks
cpu_manager
=
CPUOffloadingManager
(
block_size
=
block_size
,
num_blocks
=
4
,
cache_policy
=
"lru"
,
enable_events
=
True
num_blocks
=
4
,
cache_policy
=
"lru"
,
enable_events
=
True
)
# prepare store [1, 2]
...
...
@@ -163,9 +158,7 @@ def test_cpu_manager():
# complete store [1, 2]
cpu_manager
.
complete_store
(
to_keys
([
1
,
2
]))
verify_events
(
cpu_manager
.
take_events
(),
block_size
=
block_size
,
expected_stores
=
({
1
,
2
},)
)
verify_events
(
cpu_manager
.
take_events
(),
expected_stores
=
({
1
,
2
},))
# lookup [1, 2]
assert
cpu_manager
.
lookup
(
to_keys
([
1
]))
==
1
...
...
@@ -184,9 +177,7 @@ def test_cpu_manager():
)
# verify eviction event
verify_events
(
cpu_manager
.
take_events
(),
block_size
=
block_size
,
expected_evictions
=
({
1
},)
)
verify_events
(
cpu_manager
.
take_events
(),
expected_evictions
=
({
1
},))
# prepare store with no space
assert
cpu_manager
.
prepare_store
(
to_keys
([
1
,
6
]))
is
None
...
...
@@ -241,7 +232,6 @@ def test_cpu_manager():
verify_events
(
cpu_manager
.
take_events
(),
block_size
=
block_size
,
expected_stores
=
({
3
,
4
,
5
},
{
6
,
7
,
8
}),
expected_evictions
=
({
2
,
3
,
4
},
{
8
}),
)
...
...
@@ -254,7 +244,6 @@ class TestARCPolicy:
self
,
num_blocks
:
int
=
4
,
enable_events
:
bool
=
True
)
->
tuple
[
CPUOffloadingManager
,
ARCCachePolicy
]:
manager
=
CPUOffloadingManager
(
block_size
=
256
,
num_blocks
=
num_blocks
,
cache_policy
=
"arc"
,
enable_events
=
enable_events
,
...
...
@@ -289,9 +278,7 @@ class TestARCPolicy:
# complete store [1, 2]
cpu_manager
.
complete_store
(
to_keys
([
1
,
2
]))
verify_events
(
cpu_manager
.
take_events
(),
block_size
=
256
,
expected_stores
=
({
1
,
2
},)
)
verify_events
(
cpu_manager
.
take_events
(),
expected_stores
=
({
1
,
2
},))
# lookup [1, 2]
assert
cpu_manager
.
lookup
(
to_keys
([
1
]))
==
1
...
...
@@ -547,9 +534,8 @@ def test_filter_reused_manager():
"""
Tests FilterReusedOffloadingManager with a CPUOffloadingManager.
"""
block_size
=
256
lru_manager
=
CPUOffloadingManager
(
block_size
=
block_size
,
num_blocks
=
4
,
cache_policy
=
"lru"
,
enable_events
=
True
num_blocks
=
4
,
cache_policy
=
"lru"
,
enable_events
=
True
)
manager
=
FilterReusedOffloadingManager
(
...
...
vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
View file @
235e1f93
...
...
@@ -424,7 +424,7 @@ class OffloadingConnectorScheduler:
parent_block_hash
=
None
,
token_ids
=
[],
lora_id
=
None
,
block_size
=
event
.
block_size
,
block_size
=
0
,
medium
=
event
.
medium
,
lora_name
=
None
,
)
...
...
vllm/v1/kv_offload/abstract.py
View file @
235e1f93
...
...
@@ -79,7 +79,6 @@ class PrepareStoreOutput:
@
dataclass
class
OffloadingEvent
:
keys
:
list
[
OffloadKey
]
block_size
:
int
medium
:
str
# True if blocks are removed, False if stored
removed
:
bool
...
...
vllm/v1/kv_offload/cpu/manager.py
View file @
235e1f93
...
...
@@ -33,12 +33,10 @@ class CPUOffloadingManager(OffloadingManager):
def
__init__
(
self
,
block_size
:
int
,
num_blocks
:
int
,
cache_policy
:
Literal
[
"lru"
,
"arc"
]
=
"lru"
,
enable_events
:
bool
=
False
,
):
self
.
block_size
:
int
=
block_size
self
.
medium
:
str
=
CPULoadStoreSpec
.
medium
()
self
.
_num_blocks
:
int
=
num_blocks
self
.
_num_allocated_blocks
:
int
=
0
...
...
@@ -145,7 +143,6 @@ class CPUOffloadingManager(OffloadingManager):
self
.
events
.
append
(
OffloadingEvent
(
keys
=
to_evict
,
block_size
=
self
.
block_size
,
medium
=
self
.
medium
,
removed
=
True
,
)
...
...
@@ -188,7 +185,6 @@ class CPUOffloadingManager(OffloadingManager):
self
.
events
.
append
(
OffloadingEvent
(
keys
=
stored_keys
,
block_size
=
self
.
block_size
,
medium
=
self
.
medium
,
removed
=
False
,
)
...
...
vllm/v1/kv_offload/cpu/spec.py
View file @
235e1f93
...
...
@@ -60,12 +60,7 @@ class CPUOffloadingSpec(OffloadingSpec):
kv_events_config
is
not
None
and
kv_events_config
.
enable_kv_cache_events
)
assert
len
(
self
.
gpu_block_size
)
==
1
gpu_block_size
=
self
.
gpu_block_size
[
0
]
offloaded_block_size
=
gpu_block_size
*
self
.
block_size_factor
self
.
_manager
=
CPUOffloadingManager
(
block_size
=
offloaded_block_size
,
num_blocks
=
self
.
num_blocks
,
cache_policy
=
self
.
eviction_policy
,
# type: ignore[arg-type]
enable_events
=
enable_events
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment