Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5dd8df07
Unverified
Commit
5dd8df07
authored
Mar 18, 2026
by
Or Ozeri
Committed by
GitHub
Mar 18, 2026
Browse files
[kv_offload+HMA][2/N]: Support multiple KV groups in GPULoadStoreSpec (#36642)
Signed-off-by:
Or Ozeri
<
oro@il.ibm.com
>
parent
39bfb57b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
10 deletions
+43
-10
tests/v1/kv_offload/test_cpu_gpu.py
tests/v1/kv_offload/test_cpu_gpu.py
+4
-8
vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
...buted/kv_transfer/kv_connector/v1/offloading/scheduler.py
+8
-2
vllm/v1/kv_offload/mediums.py
vllm/v1/kv_offload/mediums.py
+31
-0
No files found.
tests/v1/kv_offload/test_cpu_gpu.py
View file @
5dd8df07
...
@@ -135,19 +135,19 @@ def test_transfer(
...
@@ -135,19 +135,19 @@ def test_transfer(
# set transfer direction
# set transfer direction
if
gpu_to_cpu
:
if
gpu_to_cpu
:
handler
=
handlers
.
gpu_to_cpu_handler
handler
=
handlers
.
gpu_to_cpu_handler
src_spec_class
=
GPULoadStoreSpec
dst_spec_class
=
CPULoadStoreSpec
src_blocks
=
gpu_blocks
src_blocks
=
gpu_blocks
dst_blocks
=
cpu_blocks
dst_blocks
=
cpu_blocks
src_spec
=
GPULoadStoreSpec
(
src_blocks
,
group_sizes
=
(
len
(
src_blocks
),))
dst_spec
=
CPULoadStoreSpec
(
dst_blocks
)
src_blocks_in_kernel_block_size
=
gpu_blocks_in_kernel_block_size
src_blocks_in_kernel_block_size
=
gpu_blocks_in_kernel_block_size
dst_blocks_in_kernel_block_size
=
cpu_blocks_in_kernel_block_size
dst_blocks_in_kernel_block_size
=
cpu_blocks_in_kernel_block_size
dst_size_in_kernel_blocks
=
num_cpu_blocks
*
kernel_blocks_per_cpu_block
dst_size_in_kernel_blocks
=
num_cpu_blocks
*
kernel_blocks_per_cpu_block
else
:
else
:
handler
=
handlers
.
cpu_to_gpu_handler
handler
=
handlers
.
cpu_to_gpu_handler
src_spec_class
=
CPULoadStoreSpec
dst_spec_class
=
GPULoadStoreSpec
src_blocks
=
cpu_blocks
src_blocks
=
cpu_blocks
dst_blocks
=
gpu_blocks
dst_blocks
=
gpu_blocks
src_spec
=
CPULoadStoreSpec
(
src_blocks
)
dst_spec
=
GPULoadStoreSpec
(
dst_blocks
,
group_sizes
=
(
len
(
dst_blocks
),))
src_blocks_in_kernel_block_size
=
cpu_blocks_in_kernel_block_size
src_blocks_in_kernel_block_size
=
cpu_blocks_in_kernel_block_size
dst_blocks_in_kernel_block_size
=
gpu_blocks_in_kernel_block_size
dst_blocks_in_kernel_block_size
=
gpu_blocks_in_kernel_block_size
dst_size_in_kernel_blocks
=
num_gpu_blocks
*
kernel_blocks_per_gpu_block
dst_size_in_kernel_blocks
=
num_gpu_blocks
*
kernel_blocks_per_gpu_block
...
@@ -159,10 +159,6 @@ def test_transfer(
...
@@ -159,10 +159,6 @@ def test_transfer(
):
):
dst_to_src
[
dst_block
]
=
src_block
dst_to_src
[
dst_block
]
=
src_block
# build transfer specs
src_spec
=
src_spec_class
(
src_blocks
)
dst_spec
=
dst_spec_class
(
dst_blocks
)
# clone src and dst tensors before transfer
# clone src and dst tensors before transfer
orig_src_caches
=
[
x
.
clone
()
for
x
in
handler
.
src_tensors
]
orig_src_caches
=
[
x
.
clone
()
for
x
in
handler
.
src_tensors
]
orig_dst_caches
=
[
x
.
clone
()
for
x
in
handler
.
dst_tensors
]
orig_dst_caches
=
[
x
.
clone
()
for
x
in
handler
.
dst_tensors
]
...
...
vllm/distributed/kv_transfer/kv_connector/v1/offloading/scheduler.py
View file @
5dd8df07
...
@@ -173,7 +173,11 @@ class OffloadingConnectorScheduler:
...
@@ -173,7 +173,11 @@ class OffloadingConnectorScheduler:
)
)
src_spec
=
self
.
manager
.
prepare_load
(
block_hashes
)
src_spec
=
self
.
manager
.
prepare_load
(
block_hashes
)
dst_spec
=
GPULoadStoreSpec
(
block_ids
[
num_computed_gpu_blocks
:])
dst_spec
=
GPULoadStoreSpec
(
block_ids
[
num_computed_gpu_blocks
:],
group_sizes
=
(
num_pending_gpu_blocks
,),
block_indices
=
(
num_computed_gpu_blocks
,),
)
block_hashes
=
self
.
_get_block_hashes
(
block_hashes
=
self
.
_get_block_hashes
(
request
,
start_idx
=
start_block_idx
,
end_idx
=
num_blocks
request
,
start_idx
=
start_block_idx
,
end_idx
=
num_blocks
...
@@ -246,7 +250,9 @@ class OffloadingConnectorScheduler:
...
@@ -246,7 +250,9 @@ class OffloadingConnectorScheduler:
gpu_block_idx
=
offloaded_block_idx
*
self
.
block_size_factor
gpu_block_idx
=
offloaded_block_idx
*
self
.
block_size_factor
for
i
in
range
(
self
.
block_size_factor
):
for
i
in
range
(
self
.
block_size_factor
):
src_block_ids
.
append
(
block_ids
[
gpu_block_idx
+
i
])
src_block_ids
.
append
(
block_ids
[
gpu_block_idx
+
i
])
src_spec
=
GPULoadStoreSpec
(
src_block_ids
)
src_spec
=
GPULoadStoreSpec
(
src_block_ids
,
group_sizes
=
(
len
(
src_block_ids
),)
)
reqs_to_store
[
req_id
]
=
(
src_spec
,
dst_spec
)
reqs_to_store
[
req_id
]
=
(
src_spec
,
dst_spec
)
self
.
_reqs_being_stored
[
req_id
]
|=
block_hashes_to_store
self
.
_reqs_being_stored
[
req_id
]
|=
block_hashes_to_store
...
...
vllm/v1/kv_offload/mediums.py
View file @
5dd8df07
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
abc
import
ABC
from
abc
import
ABC
from
collections.abc
import
Sequence
import
numpy
as
np
import
numpy
as
np
...
@@ -22,8 +23,38 @@ class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
...
@@ -22,8 +23,38 @@ class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
class
GPULoadStoreSpec
(
BlockIDsLoadStoreSpec
):
class
GPULoadStoreSpec
(
BlockIDsLoadStoreSpec
):
"""
"""
Spec for loading/storing a KV block to GPU memory.
Spec for loading/storing a KV block to GPU memory.
If there are multiple KV groups, the blocks are expected to be
ordered by the group index.
In that case, group_sizes[i] determines the number of blocks
in the i-th KV group, and thus sum(group_sizes) == len(block_ids).
A single KV group is expressed as group_sizes=(len(block_ids),).
If block_indices is given, each group (determined by group_sizes) of block IDs
will correspond to logically contiguous blocks, e.g. blocks 5-10 of some request.
block_indices[i] will represent the block index of the first block in group #i.
Thus, len(block_indices) == len(group_sizes) == number of KV cache groups.
This information is required in order to support loading from offloaded blocks
which are larger than GPU blocks.
In such cases, the first GPU block of each group may be unaligned to the offloaded
block size, and so knowing block_indices[i] allows the worker to correctly
skip part of the first matching offloaded block.
Offloading from GPU is always aligned to offloaded block size, and so
block_indices will only be set by the offloading connector when loading into GPU.
"""
"""
def
__init__
(
self
,
block_ids
:
list
[
int
],
group_sizes
:
Sequence
[
int
],
block_indices
:
Sequence
[
int
]
|
None
=
None
,
):
super
().
__init__
(
block_ids
)
assert
sum
(
group_sizes
)
==
len
(
block_ids
)
assert
block_indices
is
None
or
len
(
block_indices
)
==
len
(
group_sizes
)
self
.
group_sizes
:
Sequence
[
int
]
=
group_sizes
self
.
block_indices
:
Sequence
[
int
]
|
None
=
block_indices
@
staticmethod
@
staticmethod
def
medium
()
->
str
:
def
medium
()
->
str
:
return
"GPU"
return
"GPU"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment