Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
827b8c3e
Unverified
Commit
827b8c3e
authored
Nov 12, 2025
by
jthomson04
Committed by
GitHub
Nov 12, 2025
Browse files
feat: KVBM V2 transfer (#4068)
Signed-off-by:
jthomson04
<
jwillthomson19@gmail.com
>
parent
38242c8d
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
430 additions
and
121 deletions
+430
-121
lib/kvbm/src/block_manager/distributed/worker.rs
lib/kvbm/src/block_manager/distributed/worker.rs
+2
-2
lib/llm/src/block_manager/distributed.rs
lib/llm/src/block_manager/distributed.rs
+1
-1
lib/llm/src/block_manager/distributed/transfer.rs
lib/llm/src/block_manager/distributed/transfer.rs
+140
-33
lib/llm/src/block_manager/distributed/worker.rs
lib/llm/src/block_manager/distributed/worker.rs
+170
-70
lib/llm/src/block_manager/storage/cuda.rs
lib/llm/src/block_manager/storage/cuda.rs
+10
-6
lib/llm/src/block_manager/v2/memory/device.rs
lib/llm/src/block_manager/v2/memory/device.rs
+48
-7
pyproject.toml
pyproject.toml
+1
-0
tests/kvbm_integration/test_determinism_agg.py
tests/kvbm_integration/test_determinism_agg.py
+33
-1
tests/kvbm_integration/test_determinism_disagg.py
tests/kvbm_integration/test_determinism_disagg.py
+25
-1
No files found.
lib/kvbm/src/block_manager/distributed/worker.rs
View file @
827b8c3e
...
...
@@ -118,11 +118,11 @@ impl TorchTensor for VllmTensor {
#[pyclass]
#[derive(Clone)]
pub
struct
BlockTransferHandler
{
_
impl
:
Arc
<
RustBlockTransferHandler
>
,
_
impl
:
Arc
<
dyn
RustBlockTransferHandler
>
,
}
impl
BlockTransferHandler
{
pub
fn
get_handler
(
&
self
)
->
Arc
<
RustBlockTransferHandler
>
{
pub
fn
get_handler
(
&
self
)
->
Arc
<
dyn
RustBlockTransferHandler
>
{
self
._impl
.clone
()
}
}
...
...
lib/llm/src/block_manager/distributed.rs
View file @
827b8c3e
...
...
@@ -9,7 +9,7 @@ mod leader;
mod
worker
;
pub
use
leader
::{
KvbmLeader
,
KvbmLeaderConfig
,
KvbmLeaderNumBlocksConfig
};
pub
use
transfer
::
BlockTransferHandler
;
pub
use
transfer
::
{
BlockTransferHandler
,
BlockTransferHandlerV1
,
BlockTransferHandlerV2
}
;
pub
use
utils
::{
BlockTransferPool
,
BlockTransferRequest
,
ConnectorRequestLeader
,
ConnectorTransferType
,
};
...
...
lib/llm/src/block_manager/distributed/transfer.rs
View file @
827b8c3e
...
...
@@ -11,9 +11,10 @@ use zmq::*;
use
BlockTransferPool
::
*
;
use
crate
::
block_manager
::{
BasicMetadata
,
Storage
,
Storage
,
block
::{
Block
,
BlockDataProvider
,
BlockDataProviderMut
,
ReadableBlock
,
WritableBlock
,
BasicMetadata
,
Block
,
BlockDataProvider
,
BlockDataProviderMut
,
ReadableBlock
,
WritableBlock
,
data
::
local
::
LocalBlockData
,
locality
,
transfer
::{
TransferContext
,
WriteTo
,
WriteToStrategy
},
...
...
@@ -21,6 +22,10 @@ use crate::block_manager::{
connector
::
scheduler
::{
SchedulingDecision
,
TransferSchedulerClient
},
offload
::
MAX_TRANSFER_BATCH_SIZE
,
storage
::{
DeviceStorage
,
DiskStorage
,
Local
,
PinnedStorage
},
v2
::
physical
::{
layout
::
PhysicalLayout
,
manager
::
TransportManager
,
transfer
::
LayoutHandle
,
transfer
::
options
::
TransferOptions
,
},
};
use
anyhow
::
Result
;
...
...
@@ -44,9 +49,9 @@ impl ConnectorTransferBatcher {
}
}
pub
async
fn
execute_batched_transfer
(
pub
async
fn
execute_batched_transfer
<
T
:
BlockTransferDirectHandler
>
(
&
self
,
handler
:
&
BlockTransferHandler
,
handler
:
&
T
,
request
:
BlockTransferRequest
,
)
->
Result
<
()
>
{
let
blocks
=
request
.blocks
();
...
...
@@ -83,9 +88,21 @@ impl ConnectorTransferBatcher {
}
}
#[async_trait]
pub
trait
BlockTransferHandler
:
Send
+
Sync
{
async
fn
execute_transfer
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
;
fn
scheduler_client
(
&
self
)
->
Option
<
TransferSchedulerClient
>
;
}
#[async_trait]
pub
trait
BlockTransferDirectHandler
{
async
fn
execute_transfer_direct
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
;
}
/// A handler for all block transfers. Wraps a group of [`BlockTransferPoolManager`]s.
#[derive(Clone)]
pub
struct
BlockTransferHandler
{
pub
struct
BlockTransferHandler
V1
{
device
:
Option
<
LocalBlockDataList
<
DeviceStorage
>>
,
host
:
Option
<
LocalBlockDataList
<
PinnedStorage
>>
,
disk
:
Option
<
LocalBlockDataList
<
DiskStorage
>>
,
...
...
@@ -95,7 +112,46 @@ pub struct BlockTransferHandler {
// add worker-connector scheduler client here
}
impl
BlockTransferHandler
{
#[async_trait]
impl
BlockTransferHandler
for
BlockTransferHandlerV1
{
async
fn
execute_transfer
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
{
self
.batcher
.execute_batched_transfer
(
self
,
request
)
.await
}
fn
scheduler_client
(
&
self
)
->
Option
<
TransferSchedulerClient
>
{
self
.scheduler_client
.clone
()
}
}
#[async_trait]
impl
BlockTransferDirectHandler
for
BlockTransferHandlerV1
{
async
fn
execute_transfer_direct
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
{
tracing
::
debug!
(
"Performing transfer of {} blocks from {:?} to {:?}"
,
request
.blocks
()
.len
(),
request
.from_pool
(),
request
.to_pool
()
);
tracing
::
debug!
(
"request: {request:#?}"
);
let
notify
=
match
(
request
.from_pool
(),
request
.to_pool
())
{
(
Device
,
Host
)
=>
self
.begin_transfer
(
&
self
.device
,
&
self
.host
,
request
)
.await
,
(
Device
,
Disk
)
=>
self
.begin_transfer
(
&
self
.device
,
&
self
.disk
,
request
)
.await
,
(
Host
,
Device
)
=>
self
.begin_transfer
(
&
self
.host
,
&
self
.device
,
request
)
.await
,
(
Host
,
Disk
)
=>
self
.begin_transfer
(
&
self
.host
,
&
self
.disk
,
request
)
.await
,
(
Disk
,
Device
)
=>
self
.begin_transfer
(
&
self
.disk
,
&
self
.device
,
request
)
.await
,
_
=>
{
return
Err
(
anyhow
::
anyhow!
(
"Invalid transfer type."
));
}
}
?
;
notify
.await
?
;
Ok
(())
}
}
impl
BlockTransferHandlerV1
{
pub
fn
new
(
device_blocks
:
Option
<
Vec
<
LocalBlock
<
DeviceStorage
,
BasicMetadata
>>>
,
host_blocks
:
Option
<
Vec
<
LocalBlock
<
PinnedStorage
,
BasicMetadata
>>>
,
...
...
@@ -178,41 +234,94 @@ impl BlockTransferHandler {
}
}
}
}
#[derive(Clone)]
pub
struct
BlockTransferHandlerV2
{
device_handle
:
Option
<
LayoutHandle
>
,
host_handle
:
Option
<
LayoutHandle
>
,
disk_handle
:
Option
<
LayoutHandle
>
,
transport_manager
:
TransportManager
,
scheduler_client
:
Option
<
TransferSchedulerClient
>
,
batcher
:
ConnectorTransferBatcher
,
}
/// Execute transfer with batching to prevent resource exhaustion
pub
async
fn
execute_transfer
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
{
impl
BlockTransferHandlerV2
{
pub
fn
new
(
device_layout
:
Option
<
PhysicalLayout
>
,
host_layout
:
Option
<
PhysicalLayout
>
,
disk_layout
:
Option
<
PhysicalLayout
>
,
transport_manager
:
TransportManager
,
scheduler_client
:
Option
<
TransferSchedulerClient
>
,
)
->
Result
<
Self
>
{
Ok
(
Self
{
device_handle
:
device_layout
.map
(|
layout
|
transport_manager
.register_layout
(
layout
)
.unwrap
()),
host_handle
:
host_layout
.map
(|
layout
|
transport_manager
.register_layout
(
layout
)
.unwrap
()),
disk_handle
:
disk_layout
.map
(|
layout
|
transport_manager
.register_layout
(
layout
)
.unwrap
()),
transport_manager
,
scheduler_client
,
batcher
:
ConnectorTransferBatcher
::
new
(),
})
}
}
#[async_trait]
impl
BlockTransferHandler
for
BlockTransferHandlerV2
{
async
fn
execute_transfer
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
{
self
.batcher
.execute_batched_transfer
(
self
,
request
)
.await
}
/// Execute transfer directly without batching (used by the batcher)
pub
async
fn
execute_transfer_direct
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
{
tracing
::
debug!
(
"Performing transfer of {} blocks from {:?} to {:?}"
,
request
.blocks
()
.len
(),
request
.from_pool
(),
request
.to_pool
()
);
fn
scheduler_client
(
&
self
)
->
Option
<
TransferSchedulerClient
>
{
self
.scheduler_client
.clone
()
}
}
tracing
::
debug!
(
"request: {request:#?}"
);
#[async_trait]
impl
BlockTransferDirectHandler
for
BlockTransferHandlerV2
{
async
fn
execute_transfer_direct
(
&
self
,
request
:
BlockTransferRequest
)
->
Result
<
()
>
{
let
(
src
,
dst
)
=
match
(
request
.from_pool
(),
request
.to_pool
())
{
(
Device
,
Host
)
=>
(
self
.device_handle
.as_ref
(),
self
.host_handle
.as_ref
()),
(
Device
,
Disk
)
=>
(
self
.device_handle
.as_ref
(),
self
.disk_handle
.as_ref
()),
(
Host
,
Device
)
=>
(
self
.host_handle
.as_ref
(),
self
.device_handle
.as_ref
()),
(
Host
,
Disk
)
=>
(
self
.host_handle
.as_ref
(),
self
.disk_handle
.as_ref
()),
(
Disk
,
Device
)
=>
(
self
.disk_handle
.as_ref
(),
self
.device_handle
.as_ref
()),
_
=>
return
Err
(
anyhow
::
anyhow!
(
"Invalid transfer type."
)),
};
let
notify
=
match
(
request
.from_pool
(),
request
.to_pool
())
{
(
Device
,
Host
)
=>
self
.begin_transfer
(
&
self
.device
,
&
self
.host
,
request
)
.await
,
(
Device
,
Disk
)
=>
self
.begin_transfer
(
&
self
.device
,
&
self
.disk
,
request
)
.await
,
(
Host
,
Device
)
=>
self
.begin_transfer
(
&
self
.host
,
&
self
.device
,
request
)
.await
,
(
Host
,
Disk
)
=>
self
.begin_transfer
(
&
self
.host
,
&
self
.disk
,
request
)
.await
,
(
Disk
,
Device
)
=>
self
.begin_transfer
(
&
self
.disk
,
&
self
.device
,
request
)
.await
,
_
=>
{
return
Err
(
anyhow
::
anyhow!
(
"Invalid transfer type."
));
}
}
?
;
if
let
(
Some
(
src
),
Some
(
dst
))
=
(
src
,
dst
)
{
let
src_block_ids
=
request
.blocks
()
.iter
()
.map
(|(
from
,
_
)|
*
from
)
.collect
::
<
Vec
<
_
>>
();
let
dst_block_ids
=
request
.blocks
()
.iter
()
.map
(|(
_
,
to
)|
*
to
)
.collect
::
<
Vec
<
_
>>
();
self
.transport_manager
.execute_transfer
(
*
src
,
&
src_block_ids
,
*
dst
,
&
dst_block_ids
,
TransferOptions
::
default
(),
)
?
.await
?
;
}
else
{
return
Err
(
anyhow
::
anyhow!
(
"Invalid transfer type."
));
}
notify
.await
?
;
Ok
(())
}
}
#[async_trait]
impl
Handler
for
BlockTransferHandler
{
impl
<
T
:
?
Sized
+
BlockTransferHandler
>
Handler
for
T
{
async
fn
handle
(
&
self
,
mut
message
:
MessageHandle
)
->
Result
<
()
>
{
if
message
.data
.len
()
!=
1
{
return
Err
(
anyhow
::
anyhow!
(
...
...
@@ -232,10 +341,8 @@ impl Handler for BlockTransferHandler {
);
let
client
=
self
.scheduler_client
.as_ref
()
.expect
(
"scheduler client is required"
)
.clone
();
.scheduler_client
()
.expect
(
"scheduler client is required"
);
let
handle
=
client
.schedule_transfer
(
req
)
.await
?
;
...
...
lib/llm/src/block_manager/distributed/worker.rs
View file @
827b8c3e
...
...
@@ -18,6 +18,12 @@ use crate::block_manager::{
layout
::
LayoutType
,
offload
::{
MAX_CONCURRENT_TRANSFERS
,
MAX_TRANSFER_BATCH_SIZE
},
storage
::{
DeviceAllocator
,
DeviceStorage
,
DiskAllocator
,
PinnedAllocator
,
torch
::
TorchTensor
},
v2
::
memory
::
DeviceStorage
as
DeviceStorageV2
,
v2
::
physical
::{
layout
::{
BlockDimension
,
LayoutConfig
as
LayoutConfigV2
,
builder
::
PhysicalLayoutBuilder
},
manager
::
TransportManager
,
transfer
::{
NixlAgent
as
NixlAgentV2
,
TransferCapabilities
},
},
};
use
derive_builder
::
Builder
;
...
...
@@ -111,70 +117,162 @@ async fn perform_allocation_and_build_handler(
worker_id
:
usize
,
device_id
:
usize
,
scheduler_client
:
Option
<
TransferSchedulerClient
>
,
)
->
anyhow
::
Result
<
BlockTransferHandler
>
{
let
agent
=
build_agent
(
worker_id
,
leader_meta
.num_disk_blocks
>
0
)
?
;
let
pool_config
=
PoolConfig
{
enable_pool
:
true
,
max_concurrent_transfers
:
MAX_CONCURRENT_TRANSFERS
,
max_transfer_batch_size
:
MAX_TRANSFER_BATCH_SIZE
,
num_outer_components
:
device_layout
.config
()
.outer_dim
,
num_layers
:
device_layout
.config
()
.num_layers
,
};
let
transfer_context
=
Arc
::
new
(
TransferContext
::
new
(
Arc
::
new
(
Some
(
agent
)),
DeviceAllocator
::
new
(
device_id
)
?
.ctx
()
.new_stream
()
?
,
Handle
::
current
(),
Some
(
pool_config
),
));
// device
let
device_blocks
=
Some
(
KvbmWorker
::
make_layout
::
<
_
,
BasicMetadata
>
(
device_layout
,
transfer_context
.nixl_agent
()
.as_ref
(),
0
,
worker_id
,
)
?
);
// host
let
host_blocks
=
if
leader_meta
.num_host_blocks
>
0
{
let
host_allocator
=
Arc
::
new
(
PinnedAllocator
::
default
());
let
host_layout
=
layout_builder
.num_blocks
(
leader_meta
.num_host_blocks
)
.build
()
?
.allocate_layout
(
worker_config
.host_layout_type
,
host_allocator
)
?
;
Some
(
KvbmWorker
::
make_layout
::
<
_
,
BasicMetadata
>
(
)
->
anyhow
::
Result
<
Arc
<
dyn
BlockTransferHandler
>>
{
let
use_v2_transfer
=
std
::
env
::
var
(
"DYN_KVBM_USE_V2_TRANSFER_EXPERIMENTAL"
)
.unwrap_or
(
"0"
.to_string
())
.parse
::
<
usize
>
()
.map
(|
v
|
v
>
0
)
.unwrap_or
(
false
);
if
use_v2_transfer
{
tracing
::
warn!
(
"Using V2 transfer handler. This is experimental. Use at your own risk."
);
let
backends
=
if
leader_meta
.num_disk_blocks
>
0
{
vec!
[
"POSIX"
,
"GDS_MT"
]
}
else
{
vec!
[
"POSIX"
]
};
let
agent
=
NixlAgentV2
::
new_with_backends
(
worker_id
.to_string
()
.as_str
(),
&
backends
)
?
;
let
mut
layout_config
=
LayoutConfigV2
::
builder
()
.num_blocks
(
device_layout
.config
()
.num_blocks
)
.num_layers
(
device_layout
.config
()
.num_layers
)
.outer_dim
(
device_layout
.config
()
.outer_dim
)
.inner_dim
(
device_layout
.config
()
.inner_dim
)
.page_size
(
device_layout
.config
()
.page_size
)
.alignment
(
device_layout
.config
()
.alignment
)
.dtype_width_bytes
(
device_layout
.config
()
.dtype_width_bytes
)
.build
()
?
;
let
v2_device_layout
=
PhysicalLayoutBuilder
::
new
(
agent
.clone
())
.with_config
(
layout_config
.clone
());
let
v2_device_layout
=
if
let
LayoutType
::
LayerSeparate
{
outer_contiguous
}
=
device_layout
.layout_type
()
{
v2_device_layout
.layer_separate
(
if
outer_contiguous
{
BlockDimension
::
BlockIsSecondDim
}
else
{
BlockDimension
::
BlockIsFirstDim
})
}
else
{
v2_device_layout
.fully_contiguous
()
};
let
regions
=
device_layout
.storage
()
.iter
()
.map
(|
s
|
DeviceStorageV2
::
from_v1
(
s
)
.unwrap
())
.collect
::
<
Vec
<
_
>>
();
let
v2_device_layout
=
v2_device_layout
.with_memory_regions
(
regions
)
?
.build
()
?
;
let
host_layout
=
if
leader_meta
.num_host_blocks
>
0
{
layout_config
.num_blocks
=
leader_meta
.num_host_blocks
;
Some
(
PhysicalLayoutBuilder
::
new
(
agent
.clone
())
.with_config
(
layout_config
.clone
())
.fully_contiguous
()
.allocate_pinned
(
true
)
.build
()
?
,
)
}
else
{
None
};
let
disk_layout
=
if
leader_meta
.num_disk_blocks
>
0
{
layout_config
.num_blocks
=
leader_meta
.num_disk_blocks
;
Some
(
PhysicalLayoutBuilder
::
new
(
agent
.clone
())
.with_config
(
layout_config
)
.fully_contiguous
()
.allocate_disk
(
None
)
.build
()
?
,
)
}
else
{
None
};
let
transport_manager
=
TransportManager
::
builder
()
.capabilities
(
TransferCapabilities
::
default
()
.with_gds
(
true
))
.worker_id
(
worker_id
as
u64
)
.nixl_agent
(
agent
)
.cuda_device_id
(
device_id
)
.build
()
?
;
let
handler
=
BlockTransferHandlerV2
::
new
(
Some
(
v2_device_layout
),
host_layout
,
transfer_context
.nixl_agent
()
.as_ref
(),
1
,
worker_id
,
)
?
)
}
else
{
None
};
// disk
let
disk_blocks
=
if
leader_meta
.num_disk_blocks
>
0
{
let
disk_allocator
=
Arc
::
new
(
DiskAllocator
);
let
disk_layout
=
layout_builder
.num_blocks
(
leader_meta
.num_disk_blocks
)
.build
()
?
.allocate_layout
(
worker_config
.disk_layout_type
,
disk_allocator
)
?
;
Some
(
KvbmWorker
::
make_layout
::
<
_
,
BasicMetadata
>
(
disk_layout
,
transport_manager
,
scheduler_client
,
)
?
;
Ok
(
Arc
::
new
(
handler
)
as
Arc
<
dyn
BlockTransferHandler
>
)
}
else
{
let
agent
=
build_agent
(
worker_id
,
leader_meta
.num_disk_blocks
>
0
)
?
;
let
pool_config
=
PoolConfig
{
enable_pool
:
true
,
max_concurrent_transfers
:
MAX_CONCURRENT_TRANSFERS
,
max_transfer_batch_size
:
MAX_TRANSFER_BATCH_SIZE
,
num_outer_components
:
device_layout
.config
()
.outer_dim
,
num_layers
:
device_layout
.config
()
.num_layers
,
};
let
transfer_context
=
Arc
::
new
(
TransferContext
::
new
(
Arc
::
new
(
Some
(
agent
)),
DeviceAllocator
::
new
(
device_id
)
?
.ctx
()
.new_stream
()
?
,
Handle
::
current
(),
Some
(
pool_config
),
));
// device
let
device_blocks
=
Some
(
KvbmWorker
::
make_layout
::
<
_
,
BasicMetadata
>
(
device_layout
,
transfer_context
.nixl_agent
()
.as_ref
(),
2
,
0
,
worker_id
,
)
?
)
}
else
{
None
};
let
handler
=
BlockTransferHandler
::
new
(
device_blocks
,
host_blocks
,
disk_blocks
,
transfer_context
,
scheduler_client
,
)
?
;
Ok
(
handler
)
)
?
);
// host
let
host_blocks
=
if
leader_meta
.num_host_blocks
>
0
{
let
host_allocator
=
Arc
::
new
(
PinnedAllocator
::
default
());
let
host_layout
=
layout_builder
.num_blocks
(
leader_meta
.num_host_blocks
)
.build
()
?
.allocate_layout
(
worker_config
.host_layout_type
,
host_allocator
)
?
;
Some
(
KvbmWorker
::
make_layout
::
<
_
,
BasicMetadata
>
(
host_layout
,
transfer_context
.nixl_agent
()
.as_ref
(),
1
,
worker_id
,
)
?
)
}
else
{
None
};
// disk
let
disk_blocks
=
if
leader_meta
.num_disk_blocks
>
0
{
let
disk_allocator
=
Arc
::
new
(
DiskAllocator
);
let
disk_layout
=
layout_builder
.num_blocks
(
leader_meta
.num_disk_blocks
)
.build
()
?
.allocate_layout
(
worker_config
.disk_layout_type
,
disk_allocator
)
?
;
Some
(
KvbmWorker
::
make_layout
::
<
_
,
BasicMetadata
>
(
disk_layout
,
transfer_context
.nixl_agent
()
.as_ref
(),
2
,
worker_id
,
)
?
)
}
else
{
None
};
let
handler
=
BlockTransferHandlerV1
::
new
(
device_blocks
,
host_blocks
,
disk_blocks
,
transfer_context
,
scheduler_client
,
)
?
;
Ok
(
Arc
::
new
(
handler
)
as
Arc
<
dyn
BlockTransferHandler
>
)
}
}
struct
WorkerMetadataHandler
{
...
...
@@ -199,6 +297,8 @@ impl Handler for WorkerMetadataHandler {
}
}
type
TransferHandlerSender
=
Mutex
<
Option
<
oneshot
::
Sender
<
Arc
<
dyn
BlockTransferHandler
>>>>
;
// Leader sends allocation config -> allocate -> publish handler -> mark ready -> ACK
struct
LeaderMetadataHandler
{
state
:
Arc
<
WorkerState
>
,
...
...
@@ -208,8 +308,8 @@ struct LeaderMetadataHandler {
worker_id
:
usize
,
device_id
:
usize
,
scheduler_client
:
Option
<
TransferSchedulerClient
>
,
handler_cell
:
Arc
<
RwLock
<
Option
<
BlockTransferHandler
>>>
,
handler_tx
:
Arc
<
Mutex
<
Option
<
oneshot
::
Sender
<
Block
TransferHandler
>>>
>
,
handler_cell
:
Arc
<
RwLock
<
Option
<
Arc
<
dyn
BlockTransferHandler
>>>
>
,
handler_tx
:
Arc
<
TransferHandler
Sender
>
,
started
:
AtomicBool
,
}
...
...
@@ -344,7 +444,7 @@ impl Handler for GatedPing {
// Transfer dispatcher that waits until block transfer handler exists
struct
BlockTransferDispatch
{
cell
:
Arc
<
RwLock
<
Option
<
BlockTransferHandler
>>>
,
cell
:
Arc
<
RwLock
<
Option
<
Arc
<
dyn
BlockTransferHandler
>>>
>
,
}
#[async_trait]
...
...
@@ -405,7 +505,7 @@ impl KvbmWorkerConfig {
pub
struct
KvbmWorker
{
task
:
Option
<
CriticalTaskExecutionHandle
>
,
block_transfer_handler_rx
:
Option
<
oneshot
::
Receiver
<
transfer
::
BlockTransferHandler
>>
,
block_transfer_handler_rx
:
Option
<
oneshot
::
Receiver
<
Arc
<
dyn
BlockTransferHandler
>>
>
,
}
impl
KvbmWorker
{
...
...
@@ -529,7 +629,7 @@ impl KvbmWorker {
layout_type
:
LayoutType
,
)
->
anyhow
::
Result
<
(
CriticalTaskExecutionHandle
,
oneshot
::
Receiver
<
transfer
::
BlockTransferHandler
>
,
oneshot
::
Receiver
<
Arc
<
dyn
BlockTransferHandler
>
>
,
)
>
{
let
cancel_token
=
config
.cancel_token
.clone
();
...
...
@@ -580,13 +680,13 @@ impl KvbmWorker {
layout_type
:
LayoutType
,
)
->
anyhow
::
Result
<
(
CriticalTaskExecutionHandle
,
oneshot
::
Receiver
<
transfer
::
BlockTransferHandler
>
,
oneshot
::
Receiver
<
Arc
<
dyn
BlockTransferHandler
>
>
,
)
>
{
let
cancel_token
=
config
.cancel_token
.clone
();
let
scheduler_client
=
config
.scheduler_client
.clone
();
// channel to get BlockTransferHandler back to the caller
let
(
handler_tx
,
handler_rx
)
=
oneshot
::
channel
::
<
transfer
::
BlockTransferHandler
>
();
let
(
handler_tx
,
handler_rx
)
=
oneshot
::
channel
::
<
Arc
<
dyn
BlockTransferHandler
>
>
();
let
handler_tx_cell
=
Arc
::
new
(
Mutex
::
new
(
Some
(
handler_tx
)));
// channel that the worker will use to signal layout readiness
...
...
@@ -649,7 +749,7 @@ impl KvbmWorker {
/// This is a bit of a hack. Improve the API design around this in the future.
pub
fn
block_transfer_handler_rx
(
&
mut
self
,
)
->
Option
<
tokio
::
sync
::
oneshot
::
Receiver
<
BlockTransferHandler
>>
{
)
->
Option
<
tokio
::
sync
::
oneshot
::
Receiver
<
Arc
<
dyn
BlockTransferHandler
>>
>
{
self
.block_transfer_handler_rx
.take
()
}
...
...
@@ -677,7 +777,7 @@ impl KvbmWorker {
_
device_layout_type
:
LayoutType
,
config
:
KvbmWorkerConfig
,
cancel_token
:
CancellationToken
,
handler_tx
:
Arc
<
Mutex
<
Option
<
oneshot
::
Sender
<
Block
TransferHandler
>>>
>
,
handler_tx
:
Arc
<
TransferHandler
Sender
>
,
layout_ready_tx
:
tokio
::
sync
::
Mutex
<
Option
<
oneshot
::
Sender
<
String
>>>
,
scheduler_client
:
Option
<
TransferSchedulerClient
>
,
bytes_per_block
:
usize
,
...
...
@@ -687,7 +787,7 @@ impl KvbmWorker {
let
state
=
Arc
::
new
(
WorkerState
::
new
());
// Cell to publish the transfer handler
let
transfer_handler_cell
:
Arc
<
RwLock
<
Option
<
BlockTransferHandler
>>>
=
let
transfer_handler_cell
:
Arc
<
RwLock
<
Option
<
Arc
<
dyn
BlockTransferHandler
>>>
>
=
Arc
::
new
(
RwLock
::
new
(
None
));
// Build handlers map
...
...
lib/llm/src/block_manager/storage/cuda.rs
View file @
827b8c3e
...
...
@@ -315,8 +315,8 @@ impl StorageAllocator<PinnedStorage> for PinnedAllocator {
/// When building a [`DeviceStorage`] from a torch tensor, we need to ensure that
/// the torch tensor is not GCed until the [`DeviceStorage`] is dropped.
/// Because of this, we need to store a reference to the torch tensor in the [`DeviceStorage`]
#[derive(Debug)]
enum
DeviceStorageType
{
#[derive(
Clone,
Debug)]
pub
enum
DeviceStorageType
{
Owned
,
// Memory that we allocated ourselves.
Torch
{
_
tensor
:
Arc
<
dyn
TorchTensor
>
},
// Memory that came from a torch tensor.
}
...
...
@@ -328,7 +328,7 @@ pub struct DeviceStorage {
size
:
usize
,
ctx
:
Arc
<
CudaContext
>
,
handles
:
RegistrationHandles
,
_
storage_type
:
DeviceStorageType
,
storage_type
:
DeviceStorageType
,
}
impl
Local
for
DeviceStorage
{}
...
...
@@ -345,7 +345,7 @@ impl DeviceStorage {
size
,
ctx
:
ctx
.clone
(),
handles
:
RegistrationHandles
::
new
(),
_
storage_type
:
DeviceStorageType
::
Owned
,
storage_type
:
DeviceStorageType
::
Owned
,
})
}
...
...
@@ -373,7 +373,7 @@ impl DeviceStorage {
size
,
ctx
:
ctx
.clone
(),
handles
:
RegistrationHandles
::
new
(),
_
storage_type
:
DeviceStorageType
::
Torch
{
_
tensor
:
tensor
},
storage_type
:
DeviceStorageType
::
Torch
{
_
tensor
:
tensor
},
})
}
...
...
@@ -381,6 +381,10 @@ impl DeviceStorage {
pub
fn
context
(
&
self
)
->
&
Arc
<
CudaContext
>
{
&
self
.ctx
}
pub
fn
device_storage_type
(
&
self
)
->
&
DeviceStorageType
{
&
self
.storage_type
}
}
impl
Storage
for
DeviceStorage
{
...
...
@@ -414,7 +418,7 @@ impl CudaContextProivder for DeviceStorage {
impl
Drop
for
DeviceStorage
{
fn
drop
(
&
mut
self
)
{
self
.handles
.release
();
match
&
self
.
_
storage_type
{
match
&
self
.storage_type
{
DeviceStorageType
::
Owned
=>
{
unsafe
{
cudarc
::
driver
::
result
::
free_sync
(
self
.ptr
as
_
)
}
.unwrap
()
}
...
...
lib/llm/src/block_manager/v2/memory/device.rs
View file @
827b8c3e
...
...
@@ -3,8 +3,13 @@
//! CUDA device memory storage.
use
crate
::
block_manager
::
DeviceStorage
as
V1DeviceStorage
;
use
crate
::
block_manager
::
Storage
as
V1Storage
;
use
crate
::
block_manager
::
storage
::
cuda
::
DeviceStorageType
as
V1DeviceStorageType
;
use
super
::{
MemoryRegion
,
Result
,
StorageError
,
StorageKind
};
use
cudarc
::
driver
::
CudaContext
;
use
nixl_sys
::
NixlDescriptor
;
use
std
::
any
::
Any
;
use
std
::
collections
::
HashMap
;
use
std
::
sync
::{
Arc
,
Mutex
,
OnceLock
};
...
...
@@ -30,6 +35,8 @@ pub struct DeviceStorage {
ptr
:
u64
,
device_id
:
u32
,
len
:
usize
,
// TODO: This is a bit ugly. We need to translate our v1 device layout to v2.
device_storage_type
:
V1DeviceStorageType
,
}
unsafe
impl
Send
for
DeviceStorage
{}
...
...
@@ -57,6 +64,7 @@ impl DeviceStorage {
ptr
,
device_id
,
len
,
device_storage_type
:
V1DeviceStorageType
::
Owned
,
})
}
...
...
@@ -69,18 +77,51 @@ impl DeviceStorage {
pub
fn
device_id
(
&
self
)
->
u32
{
self
.device_id
}
pub
fn
from_v1
(
v1_storage
:
&
V1DeviceStorage
)
->
Result
<
Self
>
{
let
device_id
=
v1_storage
.device_id
()
as
u32
;
let
ctx
=
cuda_context
(
device_id
)
?
;
let
ptr
;
unsafe
{
ptr
=
v1_storage
.as_ptr
()
as
u64
;
}
let
len
=
v1_storage
.size
();
if
!
matches!
(
v1_storage
.device_storage_type
(),
V1DeviceStorageType
::
Torch
{
..
}
)
{
return
Err
(
StorageError
::
Unsupported
(
"Unable to convert owned device tensors."
.into
(),
));
}
Ok
(
Self
{
ctx
,
ptr
,
device_id
,
len
,
device_storage_type
:
v1_storage
.device_storage_type
()
.clone
(),
})
}
}
impl
Drop
for
DeviceStorage
{
fn
drop
(
&
mut
self
)
{
if
let
Err
(
e
)
=
self
.ctx
.bind_to_thread
()
{
tracing
::
debug!
(
"failed to bind CUDA context for free: {e}"
);
}
unsafe
{
if
let
Err
(
e
)
=
cudarc
::
driver
::
result
::
free_sync
(
self
.ptr
)
{
tracing
::
debug!
(
"failed to free device memory: {e}"
);
match
self
.device_storage_type
{
V1DeviceStorageType
::
Owned
=>
{
if
let
Err
(
e
)
=
self
.ctx
.bind_to_thread
()
{
tracing
::
debug!
(
"failed to bind CUDA context for free: {e}"
);
}
unsafe
{
if
let
Err
(
e
)
=
cudarc
::
driver
::
result
::
free_sync
(
self
.ptr
)
{
tracing
::
debug!
(
"failed to free device memory: {e}"
);
}
}
}
};
V1DeviceStorageType
::
Torch
{
..
}
=>
{}
// Do nothing.
}
}
}
...
...
pyproject.toml
View file @
827b8c3e
...
...
@@ -199,6 +199,7 @@ markers = [
"slow: marks tests as known to be slow"
,
"h100: marks tests to run on H100"
,
"kvbm: marks tests for KV behavior and model determinism"
,
"kvbm_v2: marks tests using KVBM V2"
,
"model: model id used by a test or parameter"
,
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)"
,
"k8s: marks tests as requiring Kubernetes"
,
...
...
tests/kvbm_integration/test_determinism_agg.py
View file @
827b8c3e
...
...
@@ -182,6 +182,20 @@ class LLMServerManager:
)
self
.
server_stdout_file
.
flush
()
# Try to download the model.
model
=
os
.
environ
.
get
(
"KVBM_MODEL_ID"
,
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
)
print
(
"Attempting model download..."
)
try
:
subprocess
.
run
(
f
"pip install hf_transfer && HF_HUB_ENABLE_HF_TRANSFER=1 hf download
{
model
}
"
,
check
=
True
,
shell
=
True
,
)
except
subprocess
.
CalledProcessError
:
print
(
"Model download failed. Is this a locally stored model?"
)
# Launch
self
.
process
=
subprocess
.
Popen
(
self
.
server_cmd
,
...
...
@@ -334,7 +348,7 @@ def llm_server(request, runtime_services):
server_type
=
server_type
,
)
start_timeout
=
int
(
os
.
environ
.
get
(
"KVBM_SERVER_START_TIMEOUT"
,
"
3
00"
))
start_timeout
=
int
(
os
.
environ
.
get
(
"KVBM_SERVER_START_TIMEOUT"
,
"
6
00"
))
if
not
server_manager
.
start_server
(
timeout
=
start_timeout
):
pytest
.
fail
(
f
"Failed to start
{
server_type
}
server (cpu_blocks=
{
cpu_blocks
}
, gpu_blocks=
{
gpu_blocks
}
, port=
{
server_manager
.
port
}
)"
...
...
@@ -374,6 +388,24 @@ class TestDeterminismAgg(BaseTestDeterminism):
tester
,
llm_server
,
runtime_services
)
@
pytest
.
mark
.
parametrize
(
"llm_server"
,
[
{
"cpu_blocks"
:
int
(
os
.
environ
.
get
(
"KVBM_CPU_BLOCKS"
,
"10000"
))},
],
indirect
=
True
,
)
@
pytest
.
mark
.
kvbm_v2
def
test_determinism_agg_with_cache_reset_v2
(
self
,
tester
,
llm_server
,
runtime_services
,
monkeypatch
):
"""Test determinism across cache reset: run test with warmup, reset cache, run again without warmup."""
monkeypatch
.
setenv
(
"DYN_KVBM_USE_V2_TRANSFER_EXPERIMENTAL"
,
"1"
)
# Call the base class implementation
super
().
base_test_determinism_with_cache_reset
(
tester
,
llm_server
,
runtime_services
)
@
pytest
.
mark
.
parametrize
(
"llm_server"
,
[
...
...
tests/kvbm_integration/test_determinism_disagg.py
View file @
827b8c3e
...
...
@@ -429,7 +429,7 @@ def llm_server(request, runtime_services):
server_type
=
server_type
,
)
start_timeout
=
int
(
os
.
environ
.
get
(
"KVBM_SERVER_START_TIMEOUT"
,
"
3
00"
))
start_timeout
=
int
(
os
.
environ
.
get
(
"KVBM_SERVER_START_TIMEOUT"
,
"
6
00"
))
if
not
server_manager
.
start_server
(
timeout
=
start_timeout
):
pytest
.
fail
(
f
"Failed to start
{
server_type
}
server (cpu_blocks=
{
cpu_blocks
}
, gpu_blocks=
{
gpu_blocks
}
, port=
{
server_manager
.
port
}
)"
...
...
@@ -475,6 +475,30 @@ class TestDeterminismDisagg(BaseTestDeterminism):
success_rate_threshold
=
SUCCESS_RATE_THRESHOLD
,
)
@
pytest
.
mark
.
parametrize
(
"llm_server"
,
[
{
"cpu_blocks"
:
int
(
os
.
environ
.
get
(
"KVBM_CPU_BLOCKS"
,
"10000"
)),
"gpu_blocks"
:
int
(
os
.
environ
.
get
(
"KVBM_GPU_BLOCKS"
,
"1000"
)),
},
],
indirect
=
True
,
)
@
pytest
.
mark
.
kvbm_v2
def
test_determinism_disagg_with_cache_reset_v2
(
self
,
tester
,
llm_server
,
runtime_services
,
monkeypatch
):
"""Test determinism across cache reset: run test with warmup, reset cache, run again without warmup."""
monkeypatch
.
setenv
(
"DYN_KVBM_USE_V2_TRANSFER_EXPERIMENTAL"
,
"1"
)
# Call the base class implementation
super
().
base_test_determinism_with_cache_reset
(
tester
,
llm_server
,
runtime_services
,
success_rate_threshold
=
SUCCESS_RATE_THRESHOLD
,
)
if
__name__
==
"__main__"
:
# Allow running as script
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment