Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
9ab148dc
Unverified
Commit
9ab148dc
authored
Mar 01, 2026
by
Ryan Olson
Committed by
GitHub
Mar 01, 2026
Browse files
feat: kvbm-physical (#6490)
Signed-off-by:
Ryan Olson
<
rolson@nvidia.com
>
parent
7546c193
Changes
54
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
5722 additions
and
0 deletions
+5722
-0
lib/kvbm-physical/src/layout/physical.rs
lib/kvbm-physical/src/layout/physical.rs
+312
-0
lib/kvbm-physical/src/layout/serialize.rs
lib/kvbm-physical/src/layout/serialize.rs
+282
-0
lib/kvbm-physical/src/layout/tests.rs
lib/kvbm-physical/src/layout/tests.rs
+370
-0
lib/kvbm-physical/src/layout/validation.rs
lib/kvbm-physical/src/layout/validation.rs
+125
-0
lib/kvbm-physical/src/lib.rs
lib/kvbm-physical/src/lib.rs
+25
-0
lib/kvbm-physical/src/manager/handle.rs
lib/kvbm-physical/src/manager/handle.rs
+119
-0
lib/kvbm-physical/src/manager/local.rs
lib/kvbm-physical/src/manager/local.rs
+119
-0
lib/kvbm-physical/src/manager/metadata.rs
lib/kvbm-physical/src/manager/metadata.rs
+301
-0
lib/kvbm-physical/src/manager/mod.rs
lib/kvbm-physical/src/manager/mod.rs
+869
-0
lib/kvbm-physical/src/manager/remote.rs
lib/kvbm-physical/src/manager/remote.rs
+127
-0
lib/kvbm-physical/src/transfer/capabilities.rs
lib/kvbm-physical/src/transfer/capabilities.rs
+209
-0
lib/kvbm-physical/src/transfer/checksum.rs
lib/kvbm-physical/src/transfer/checksum.rs
+267
-0
lib/kvbm-physical/src/transfer/context.rs
lib/kvbm-physical/src/transfer/context.rs
+507
-0
lib/kvbm-physical/src/transfer/executor/cuda.rs
lib/kvbm-physical/src/transfer/executor/cuda.rs
+327
-0
lib/kvbm-physical/src/transfer/executor/memcpy.rs
lib/kvbm-physical/src/transfer/executor/memcpy.rs
+165
-0
lib/kvbm-physical/src/transfer/executor/mod.rs
lib/kvbm-physical/src/transfer/executor/mod.rs
+664
-0
lib/kvbm-physical/src/transfer/executor/nixl.rs
lib/kvbm-physical/src/transfer/executor/nixl.rs
+381
-0
lib/kvbm-physical/src/transfer/fill.rs
lib/kvbm-physical/src/transfer/fill.rs
+293
-0
lib/kvbm-physical/src/transfer/mod.rs
lib/kvbm-physical/src/transfer/mod.rs
+173
-0
lib/kvbm-physical/src/transfer/notifications/cuda_event.rs
lib/kvbm-physical/src/transfer/notifications/cuda_event.rs
+87
-0
No files found.
lib/kvbm-physical/src/layout/physical.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Physical layout types that combine abstract layouts with storage location metadata.
use
crate
::
BlockId
;
use
super
::{
FullyContiguousLayout
,
InnerShape
,
LayerSeparateLayout
,
Layout
,
MemoryRegion
,
builder
::{
PhysicalLayoutBuilder
,
PhysicalLayoutBuilderDefault
},
serialize
::{
LayoutDescriptor
,
LayoutTypeDetails
},
};
use
anyhow
::{
Result
,
anyhow
};
use
dynamo_memory
::{
Buffer
,
MemoryDescriptor
,
StorageKind
,
nixl
::{
MemType
,
NixlAgent
,
NixlDescriptor
},
};
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
any
::
Any
;
use
std
::
sync
::
Arc
;
/// An abstract [`Layout`] paired with metadata describing its backing memory.
///
/// Alongside the layout itself, a `PhysicalLayout` records the kind of storage the
/// memory lives in and the NIXL registration metadata for it. The transfer system
/// uses this to select appropriate copy strategies and to build NIXL transfer
/// descriptors.
#[derive(Debug, Clone)]
pub struct PhysicalLayout {
    /// Abstract layout that defines how the memory is organized.
    layout: Arc<dyn Layout>,

    /// Kind of storage backing the layout (System, Device, Pinned, Disk).
    location: StorageKind,

    /// NIXL registration metadata for the backing memory.
    nixl_metadata: NixlMetadata,
}
/// NIXL registration metadata: an agent name, a NIXL memory type, and a device id.
///
/// The type is serializable so it can travel inside a serialized layout description.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NixlMetadata {
    /// Name of the NIXL agent.
    agent_name: String,

    /// NIXL memory type.
    mem_type: MemType,

    /// Device identifier.
    device_id: u64,
}
impl NixlMetadata {
    /// Bundle an agent name, NIXL memory type, and device id into one record.
    ///
    /// # Arguments
    /// * `agent_name` - Name of the NIXL agent
    /// * `mem_type` - NIXL memory type
    /// * `device_id` - Device identifier
    pub fn new(agent_name: String, mem_type: MemType, device_id: u64) -> Self {
        Self {
            agent_name,
            mem_type,
            device_id,
        }
    }

    /// Borrow the agent name stored in this record.
    pub fn agent_name(&self) -> &str {
        self.agent_name.as_str()
    }

    /// Return the NIXL memory type stored in this record.
    #[inline(always)]
    pub fn mem_type(&self) -> MemType {
        self.mem_type
    }

    /// Return the device id stored in this record.
    #[inline(always)]
    pub fn device_id(&self) -> u64 {
        self.device_id
    }
}
impl PhysicalLayout {
    /// Create a typed builder that enforces NIXL registration.
    ///
    /// # Arguments
    /// * `agent` - NIXL agent handed to [`PhysicalLayoutBuilder::new`]
    pub fn builder(agent: NixlAgent) -> PhysicalLayoutBuilderDefault {
        PhysicalLayoutBuilder::new(agent)
    }

    /// Create a new local physical layout.
    ///
    /// # Arguments
    /// * `layout` - The abstract layout to wrap
    /// * `location` - Where the layout's memory resides
    /// * `nixl_metadata` - NIXL registration metadata stored alongside the layout
    pub(crate) fn new_local(
        layout: Arc<dyn Layout>,
        location: StorageKind,
        nixl_metadata: NixlMetadata,
    ) -> Self {
        Self {
            layout,
            location,
            nixl_metadata,
        }
    }

    // /// Create a new remote physical layout from a descriptor.
    // ///
    // /// # Arguments
    // /// * `layout` - The abstract layout to wrap
    // /// * `location` - Where the layout's memory resides (on remote node)
    // /// * `remote_agent` - Name of the NIXL agent on the remote node
    // pub fn new_remote(
    //     layout: Arc<dyn Layout>,
    //     location: StorageKind,
    //     remote_agent: String,
    // ) -> Self {
    //     let metadata = NixlMetadata::new(
    //         remote_agent.clone(),
    //         location.to_nixl_mem_type(),
    //         location.device_id(),
    //     );
    //     let registrations = vec![RegisteredStorageMetadata::new(
    //         metadata.agent_name().to_string(),
    //         location,
    //     )];
    //     Self {
    //         layout,
    //         location,
    //         locality: Locality::Remote(remote_agent),
    //         nixl_metadata: Some(metadata),
    //         registered: registrations,
    //     }
    // }

    /// Get the underlying layout.
    pub fn layout(&self) -> &Arc<dyn Layout> {
        &self.layout
    }

    /// Get the storage location.
    pub(crate) fn location(&self) -> StorageKind {
        self.location
    }

    /// Get the NIXL metadata.
    pub(crate) fn nixl_metadata(&self) -> &NixlMetadata {
        &self.nixl_metadata
    }

    /// Get a memory region with location information.
    ///
    /// The lookup is delegated unchanged to the wrapped layout.
    ///
    /// # Arguments
    /// * `block_id` - Block identifier
    /// * `layer_id` - Layer identifier
    /// * `outer_id` - Outer dimension identifier
    ///
    /// # Errors
    /// Returns whatever error the wrapped layout's `memory_region` reports.
    pub fn memory_region(
        &self,
        block_id: BlockId,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<MemoryRegion> {
        self.layout.memory_region(block_id, layer_id, outer_id)
    }

    /// Serialize this physical layout for transmission to remote nodes.
    ///
    /// This converts the runtime `PhysicalLayout` into a `LayoutDescriptor` that
    /// contains all information needed to reconstruct the layout on a remote node,
    /// including layout configuration, memory descriptors, NIXL metadata, and
    /// layout-type-specific details.
    ///
    /// # Returns
    /// A serializable representation of this layout. The current implementation has
    /// no failing step and always returns `Ok`.
    pub(crate) fn to_descriptor(&self) -> Result<LayoutDescriptor> {
        // Record only the address and size of every region backing the layout.
        let memory_descriptors = self
            .layout
            .memory_regions()
            .iter()
            .map(|region| MemoryRegion {
                addr: region.addr(),
                size: region.size(),
            })
            .collect();

        // The layout itself knows which type-specific details are needed to rebuild it.
        let layout_type_details = self.layout.serialization_details();

        Ok(LayoutDescriptor {
            version: LayoutDescriptor::CURRENT_VERSION,
            layout_config: self.layout.config().clone(),
            location: self.location,
            nixl_metadata: self.nixl_metadata.clone(),
            memory_descriptors,
            layout_type_details,
        })
    }

    /// Reconstruct a physical layout from serialized data received from a remote node.
    ///
    /// This creates a new `PhysicalLayout` from a `LayoutDescriptor`. The reconstructed
    /// layout will have memory descriptors that point to the remote node's memory,
    /// allowing NIXL to build RDMA descriptors for remote access.
    ///
    /// # Arguments
    /// * `serialized` - Serialized layout data from a remote node
    ///
    /// # Returns
    /// A new `PhysicalLayout` representing the remote layout
    ///
    /// # Errors
    /// * The descriptor version is greater than `LayoutDescriptor::CURRENT_VERSION`.
    /// * A fully contiguous layout does not come with exactly one memory region.
    /// * A layer-separate layout does not come with one memory region per layer.
    /// * The underlying layout constructor rejects the configuration or memory.
    ///
    /// # Note
    /// The memory regions in the reconstructed layout are not valid for local access;
    /// they represent remote memory addresses and are used to build NIXL transfer descriptors.
    pub(crate) fn from_descriptor(serialized: LayoutDescriptor) -> Result<Self> {
        // The descriptor is owned, so take it apart once and move its fields into the
        // rebuilt layout instead of cloning them.
        let LayoutDescriptor {
            version,
            layout_config,
            location,
            nixl_metadata,
            memory_descriptors,
            layout_type_details,
        } = serialized;

        // Validate version
        if version > LayoutDescriptor::CURRENT_VERSION {
            return Err(anyhow!(
                "Unsupported serialization version: {}. Maximum supported: {}",
                version,
                LayoutDescriptor::CURRENT_VERSION
            ));
        }

        // Create remote memory regions from descriptors. Each region owns its own copy
        // of the NIXL metadata so it can produce a NIXL descriptor by itself.
        let remote_regions: Vec<Arc<dyn MemoryDescriptor>> = memory_descriptors
            .iter()
            .map(|desc| {
                Arc::new(RemoteMemoryDescriptor {
                    addr: desc.addr,
                    size: desc.size,
                    storage_kind: location,
                    nixl_metadata: nixl_metadata.clone(),
                }) as Arc<dyn MemoryDescriptor>
            })
            .collect();

        // Reconstruct the layout based on type
        let layout: Arc<dyn Layout> = match layout_type_details {
            LayoutTypeDetails::FullyContiguous(details) => {
                if remote_regions.len() != 1 {
                    return Err(anyhow!(
                        "FullyContiguous layout requires exactly 1 memory region, got {}",
                        remote_regions.len()
                    ));
                }
                let layout = FullyContiguousLayout::new_with_format(
                    layout_config,
                    Buffer::from_arc(remote_regions[0].clone()),
                    details.block_format,
                    details.kv_block_layout,
                )?;
                Arc::new(layout)
            }
            LayoutTypeDetails::LayerSeparate(details) => {
                if remote_regions.len() != layout_config.num_layers {
                    return Err(anyhow!(
                        "LayerSeparate layout requires {} memory regions (one per layer), got {}",
                        layout_config.num_layers,
                        remote_regions.len()
                    ));
                }
                // Fall back to an unknown inner shape when the KV block layout has no
                // inner-shape equivalent.
                let inner_shape = details
                    .kv_block_layout
                    .to_inner_shape()
                    .unwrap_or(InnerShape::Unknown);
                let layout = LayerSeparateLayout::builder()
                    .config(layout_config)
                    .memory(remote_regions.into_iter().map(Buffer::from_arc).collect())
                    .block_dim(details.block_dim)
                    .inner_shape(inner_shape)
                    .build()?;
                Arc::new(layout)
            }
        };

        Ok(Self {
            layout,
            location,
            nixl_metadata,
        })
    }
}
/// Descriptor for a memory region that lives on a remote node.
///
/// Instances are created while rebuilding a layout from serialized data. The stored
/// address is not valid for local access; it only serves as input for building NIXL
/// transfer descriptors that target the remote memory.
#[derive(Debug)]
struct RemoteMemoryDescriptor {
    /// Address of the region.
    addr: usize,

    /// Size of the region.
    size: usize,

    /// Kind of storage backing the region.
    storage_kind: StorageKind,

    /// NIXL metadata supplying the memory type and device id of the region.
    nixl_metadata: NixlMetadata,
}
impl MemoryDescriptor for RemoteMemoryDescriptor {
    /// Address of the region.
    fn addr(&self) -> usize {
        self.addr
    }

    /// Size of the region.
    fn size(&self) -> usize {
        self.size
    }

    /// Storage kind recorded for the region.
    fn storage_kind(&self) -> StorageKind {
        self.storage_kind
    }

    /// Expose the descriptor as `Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Build a NIXL descriptor from the stored address, size, and NIXL metadata.
    ///
    /// Always returns `Some`.
    fn nixl_descriptor(&self) -> Option<NixlDescriptor> {
        let metadata = &self.nixl_metadata;
        let descriptor = NixlDescriptor {
            addr: self.addr as u64,
            size: self.size,
            mem_type: metadata.mem_type(),
            device_id: metadata.device_id(),
        };
        Some(descriptor)
    }
}
lib/kvbm-physical/src/layout/serialize.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Serialization types for physical layouts.
//!
//! This module provides types for serializing and deserializing physical layouts
//! so they can be transmitted to remote nodes and reconstructed there for RDMA operations.
use
super
::
physical
::
NixlMetadata
;
use
super
::{
BlockDimension
,
KvBlockLayout
,
LayoutConfig
};
use
anyhow
::
Result
;
use
dynamo_memory
::{
MemoryRegion
,
StorageKind
};
use
serde
::{
Deserialize
,
Serialize
};
/// Format of blocks in a fully contiguous layout.
///
/// This enum describes how the blocks are organized and formatted in memory.
/// Currently only `Operational` is supported, but future variants may include
/// different compression schemes or memory layouts.
///
/// The default format is [`BlockFormat::Operational`]; the `Default` implementation
/// is derived from the `#[default]` marker on that variant.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum BlockFormat {
    /// Standard operational format - blocks are stored in their normal, uncompressed form.
    #[default]
    Operational,
}
/// Extra information needed to rebuild a fully contiguous layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FullyContiguousDetails {
    /// How the blocks are formatted in memory.
    pub block_format: BlockFormat,

    /// Dimension ordering within blocks. When the field is missing from the
    /// serialized input, the type's default value is used.
    #[serde(default)]
    pub kv_block_layout: KvBlockLayout,
}
/// Extra information needed to rebuild a layer-separate layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerSeparateDetails {
    /// Whether the block dimension comes first or second.
    pub block_dim: BlockDimension,

    /// KV block layout of the inner tensor format (must be operational: NHD or HND).
    /// When the field is missing from the serialized input, the type's default
    /// value is used.
    #[serde(default)]
    pub kv_block_layout: KvBlockLayout,
}
/// Per-layout-type payload of a serialized layout.
///
/// Each variant carries the information that is specific to one layout type and
/// that a remote node needs in order to reconstruct that layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum LayoutTypeDetails {
    /// Details of a fully contiguous layout.
    FullyContiguous(FullyContiguousDetails),

    /// Details of a layer-separate layout.
    LayerSeparate(LayerSeparateDetails),
}
/// Serializable description of a physical layout.
///
/// A descriptor bundles everything a remote node needs to reconstruct a layout:
/// - the layout configuration (dimensions, sizes, etc.)
/// - the storage location and NIXL metadata
/// - one memory descriptor per backing region
/// - layout-type-specific details
///
/// Once serialized it can be sent over the network and used on the receiving side
/// to build NIXL transfer descriptors for remote memory access.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayoutDescriptor {
    /// Version of the serialization format (for future compatibility).
    pub version: u32,

    /// Configuration of the layout.
    pub layout_config: LayoutConfig,

    /// Storage location of the layout's memory.
    pub location: StorageKind,

    /// NIXL metadata from the source node.
    pub nixl_metadata: NixlMetadata,

    /// Descriptors of all memory regions backing the layout.
    pub memory_descriptors: Vec<MemoryRegion>,

    /// Details specific to the layout type.
    pub layout_type_details: LayoutTypeDetails,
}
impl LayoutDescriptor {
    /// Serialization format version written by this code.
    pub const CURRENT_VERSION: u32 = 1;

    /// Encode this descriptor as a JSON string.
    ///
    /// # Errors
    /// Returns an error carrying the `serde_json` failure message when encoding fails.
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string(self)
            .map_err(|err| anyhow::anyhow!("failed to serialize layout to JSON: {}", err))
    }

    /// Encode this descriptor as UTF-8 JSON bytes.
    ///
    /// # Errors
    /// Returns an error carrying the `serde_json` failure message when encoding fails.
    pub fn to_json_bytes(&self) -> Result<Vec<u8>> {
        serde_json::to_vec(self)
            .map_err(|err| anyhow::anyhow!("failed to serialize layout to JSON bytes: {}", err))
    }

    /// Decode a descriptor from a JSON string.
    ///
    /// # Arguments
    /// * `json` - JSON string representation
    ///
    /// # Errors
    /// Returns an error carrying the `serde_json` failure message when decoding fails.
    pub fn from_json(json: &str) -> Result<Self> {
        serde_json::from_str(json)
            .map_err(|err| anyhow::anyhow!("failed to deserialize layout from JSON: {}", err))
    }

    /// Decode a descriptor from JSON bytes.
    ///
    /// # Arguments
    /// * `bytes` - UTF-8 encoded JSON bytes
    ///
    /// # Errors
    /// Returns an error carrying the `serde_json` failure message when decoding fails.
    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self> {
        serde_json::from_slice(bytes).map_err(|err| {
            anyhow::anyhow!("failed to deserialize layout from JSON bytes: {}", err)
        })
    }

    /// Borrow the layout configuration.
    pub fn layout_config(&self) -> &LayoutConfig {
        &self.layout_config
    }

    /// Return the storage location.
    pub fn location(&self) -> StorageKind {
        self.location
    }

    /// Borrow the NIXL metadata from the source node.
    pub fn nixl_metadata(&self) -> &NixlMetadata {
        &self.nixl_metadata
    }

    /// Borrow the memory descriptors as a slice.
    pub fn memory_descriptors(&self) -> &[MemoryRegion] {
        self.memory_descriptors.as_slice()
    }

    /// Borrow the layout-type-specific details.
    pub fn layout_type_details(&self) -> &LayoutTypeDetails {
        &self.layout_type_details
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use dynamo_memory::nixl::MemType;

    use super::*;

    /// Layout configuration shared by the descriptor tests below.
    fn make_test_config() -> LayoutConfig {
        LayoutConfig::builder()
            .num_blocks(10)
            .num_layers(4)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap()
    }

    /// The default block format is the operational one.
    #[test]
    fn test_block_format_default() {
        assert_eq!(BlockFormat::default(), BlockFormat::Operational);
    }

    /// A fully contiguous descriptor survives a trip through a JSON string.
    #[test]
    fn test_serialized_layout_json_roundtrip() {
        let details = FullyContiguousDetails {
            block_format: BlockFormat::Operational,
            kv_block_layout: KvBlockLayout::OperationalNHD,
        };
        let layout = LayoutDescriptor {
            version: LayoutDescriptor::CURRENT_VERSION,
            layout_config: make_test_config(),
            location: StorageKind::System,
            nixl_metadata: NixlMetadata::new("test_agent".to_string(), MemType::Dram, 0),
            memory_descriptors: vec![MemoryRegion::new(0x1000, 4096)],
            layout_type_details: LayoutTypeDetails::FullyContiguous(details),
        };

        // Test to_json/from_json
        let encoded = layout.to_json().unwrap();
        let decoded = LayoutDescriptor::from_json(&encoded).unwrap();

        assert_eq!(decoded.version, layout.version);
        assert_eq!(decoded.layout_config, layout.layout_config);
        assert_eq!(decoded.location, layout.location);
        assert_eq!(
            decoded.nixl_metadata.agent_name(),
            layout.nixl_metadata.agent_name()
        );
        assert_eq!(decoded.memory_descriptors.len(), 1);
    }

    /// A layer-separate descriptor survives a trip through JSON bytes.
    #[test]
    fn test_serialized_layout_json_bytes_roundtrip() {
        let details = LayerSeparateDetails {
            block_dim: BlockDimension::BlockIsFirstDim,
            kv_block_layout: KvBlockLayout::OperationalNHD,
        };
        let layout = LayoutDescriptor {
            version: LayoutDescriptor::CURRENT_VERSION,
            layout_config: make_test_config(),
            location: StorageKind::System,
            nixl_metadata: NixlMetadata::new("test_agent".to_string(), MemType::Vram, 5),
            memory_descriptors: vec![
                MemoryRegion::new(0x1000, 2048),
                MemoryRegion::new(0x2000, 2048),
            ],
            layout_type_details: LayoutTypeDetails::LayerSeparate(details),
        };

        // Test to_json_bytes/from_json_bytes
        let encoded = layout.to_json_bytes().unwrap();
        let decoded = LayoutDescriptor::from_json_bytes(&encoded).unwrap();

        assert_eq!(decoded.version, layout.version);
        assert_eq!(decoded.nixl_metadata.device_id(), 5);
        assert_eq!(decoded.memory_descriptors.len(), 2);
    }

    /// The fully contiguous variant keeps its fields through serde_json.
    #[test]
    fn test_fully_contiguous_details_serialization() {
        let details = LayoutTypeDetails::FullyContiguous(FullyContiguousDetails {
            block_format: BlockFormat::Operational,
            kv_block_layout: KvBlockLayout::UniversalTP,
        });

        let encoded = serde_json::to_string(&details).unwrap();
        let decoded: LayoutTypeDetails = serde_json::from_str(&encoded).unwrap();

        let LayoutTypeDetails::FullyContiguous(d) = decoded else {
            panic!("Expected FullyContiguous variant")
        };
        assert_eq!(d.block_format, BlockFormat::Operational);
        assert_eq!(d.kv_block_layout, KvBlockLayout::UniversalTP);
    }

    /// The layer-separate variant keeps its fields through serde_json.
    #[test]
    fn test_layer_separate_details_serialization() {
        let details = LayoutTypeDetails::LayerSeparate(LayerSeparateDetails {
            block_dim: BlockDimension::BlockIsSecondDim,
            kv_block_layout: KvBlockLayout::OperationalHND,
        });

        let encoded = serde_json::to_string(&details).unwrap();
        let decoded: LayoutTypeDetails = serde_json::from_str(&encoded).unwrap();

        let LayoutTypeDetails::LayerSeparate(d) = decoded else {
            panic!("Expected LayerSeparate variant")
        };
        assert_eq!(d.block_dim, BlockDimension::BlockIsSecondDim);
        assert_eq!(d.kv_block_layout, KvBlockLayout::OperationalHND);
    }
}
lib/kvbm-physical/src/layout/tests.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Integration tests for layout serialization.
//!
//! These tests verify the complete serialization and deserialization flow,
//! ensuring that layouts can be transmitted to remote nodes and reconstructed
//! with all necessary metadata intact.
use
crate
::
layout
::
physical
::
PhysicalLayout
;
use
crate
::
layout
::{
BlockDimension
,
LayoutConfig
,
LayoutDescriptor
};
use
dynamo_memory
::
nixl
::{
MemType
,
NixlAgent
,
NixlDescriptor
};
use
dynamo_memory
::{
Buffer
,
MemoryDescriptor
,
MemoryRegion
,
StorageKind
};
use
std
::
any
::
Any
;
use
std
::
sync
::
Arc
;
/// Simple mock memory for testing: it only remembers an address and a size.
#[derive(Debug)]
pub struct MockMemory {
    /// Address reported by the mock.
    addr: usize,
    /// Size reported by the mock.
    size: usize,
}

impl MockMemory {
    /// Create a reference-counted mock with the given address and size.
    pub fn new(addr: usize, size: usize) -> Arc<Self> {
        let mock = Self { addr, size };
        Arc::new(mock)
    }
}
impl MemoryDescriptor for MockMemory {
    /// Address given at construction.
    fn addr(&self) -> usize {
        self.addr
    }

    /// Size given at construction.
    fn size(&self) -> usize {
        self.size
    }

    /// The mock always reports system storage.
    fn storage_kind(&self) -> StorageKind {
        StorageKind::System
    }

    /// Expose the mock as `Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// The mock never provides a NIXL descriptor.
    fn nixl_descriptor(&self) -> Option<NixlDescriptor> {
        None
    }
}
/// Mock memory region used by the serialization tests.
///
/// Unlike a plain mock it carries a pre-built NIXL descriptor.
#[derive(Debug)]
struct TestMemoryRegion {
    /// Address of the region.
    addr: usize,
    /// Size of the region.
    size: usize,
    /// Storage kind reported for the region.
    kind: StorageKind,
    /// NIXL descriptor built once at construction time.
    descriptor: NixlDescriptor,
}

impl TestMemoryRegion {
    /// Create a reference-counted test region.
    ///
    /// The NIXL descriptor mirrors `addr` and `size`; its memory type is always
    /// `MemType::Dram` and its device id is always `0`, whatever `kind` is.
    fn new(addr: usize, size: usize, kind: StorageKind) -> Arc<Self> {
        let descriptor = NixlDescriptor {
            addr: addr as u64,
            size,
            mem_type: MemType::Dram,
            device_id: 0,
        };
        Arc::new(Self {
            addr,
            size,
            kind,
            descriptor,
        })
    }
}
impl MemoryDescriptor for TestMemoryRegion {
    /// Address given at construction.
    fn addr(&self) -> usize {
        self.addr
    }

    /// Size given at construction.
    fn size(&self) -> usize {
        self.size
    }

    /// Storage kind given at construction.
    fn storage_kind(&self) -> StorageKind {
        self.kind
    }

    /// Expose the region as `Any`.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Return a copy of the NIXL descriptor built at construction; always `Some`.
    fn nixl_descriptor(&self) -> Option<NixlDescriptor> {
        let descriptor = self.descriptor.clone();
        Some(descriptor)
    }
}
/// Build the layout configuration shared by the roundtrip tests:
/// 10 blocks, 4 layers, outer dimension 2, page size 16, inner dimension 128,
/// and a dtype width of 2 bytes.
fn make_test_config() -> LayoutConfig {
    let built = LayoutConfig::builder()
        .num_blocks(10)
        .num_layers(4)
        .outer_dim(2)
        .page_size(16)
        .inner_dim(128)
        .dtype_width_bytes(2)
        .build();
    built.unwrap()
}
/// Build a fully contiguous layout, push it through descriptor -> JSON string ->
/// descriptor -> layout, and check the data at every step.
#[test]
fn test_fully_contiguous_layout_serialization_roundtrip() {
    let agent = NixlAgent::new("test-fc-serialize").expect("failed to create agent");
    let config = make_test_config();

    // A single region must hold every block, layer, and outer entry.
    let required_size = config.num_blocks
        * config.num_layers
        * config.outer_dim
        * config.page_size
        * config.inner_dim
        * config.dtype_width_bytes;

    // Create test memory region
    let memory = TestMemoryRegion::new(0x10000, required_size, StorageKind::System);
    let regions = vec![Buffer::from_arc(memory as Arc<dyn MemoryDescriptor>)];

    // Build physical layout
    let original_layout = PhysicalLayout::builder(agent)
        .with_config(config.clone())
        .fully_contiguous()
        .with_registered_regions(regions)
        .expect("failed to provide regions")
        .build()
        .expect("failed to build layout");

    // Serialize to LayoutDescriptor
    let serialized = original_layout
        .to_descriptor()
        .expect("failed to serialize layout");

    // Verify serialized data
    assert_eq!(serialized.version, LayoutDescriptor::CURRENT_VERSION);
    assert_eq!(serialized.layout_config, config);
    assert_eq!(serialized.location, StorageKind::System);
    assert_eq!(serialized.memory_descriptors.len(), 1);
    let only_region = &serialized.memory_descriptors[0];
    assert_eq!(only_region.addr, 0x10000);
    assert_eq!(only_region.size, required_size);

    // Serialize to JSON
    let json = serialized.to_json().expect("failed to serialize to JSON");
    assert!(json.contains("\"version\":1"));
    assert!(json.contains("\"num_blocks\":10"));

    // Deserialize from JSON
    let deserialized = LayoutDescriptor::from_json(&json).expect("failed to deserialize from JSON");

    // Verify deserialized matches original
    assert_eq!(deserialized.version, serialized.version);
    assert_eq!(deserialized.layout_config, serialized.layout_config);
    assert_eq!(deserialized.location, serialized.location);
    assert_eq!(
        deserialized.memory_descriptors.len(),
        serialized.memory_descriptors.len()
    );

    // Reconstruct layout from serialized data
    let reconstructed =
        PhysicalLayout::from_descriptor(deserialized).expect("failed to reconstruct layout");

    // Verify reconstructed layout has same configuration
    let rebuilt = reconstructed.layout();
    assert_eq!(rebuilt.config(), &config);
    assert_eq!(reconstructed.location(), StorageKind::System);
    assert_eq!(rebuilt.num_blocks(), 10);
    assert_eq!(rebuilt.num_layers(), 4);
    assert!(rebuilt.is_fully_contiguous());
}
/// Build a layer-separate layout, push it through descriptor -> JSON bytes ->
/// descriptor -> layout, and check the data at every step.
#[test]
fn test_layer_separate_layout_serialization_roundtrip() {
    let agent = NixlAgent::new("test-ls-serialize").expect("failed to create agent");
    let config = make_test_config();

    // Each layer gets its own region, so the layer count is not part of the size.
    let per_layer_size = config.num_blocks
        * config.outer_dim
        * config.page_size
        * config.inner_dim
        * config.dtype_width_bytes;

    // Create memory regions (one per layer), laid out back to back from 0x10000.
    let regions: Vec<Buffer> = (0..config.num_layers)
        .map(|layer| {
            let region = TestMemoryRegion::new(
                0x10000 + layer * per_layer_size,
                per_layer_size,
                StorageKind::System,
            );
            Buffer::from_arc(region as Arc<dyn MemoryDescriptor>)
        })
        .collect();

    // Build physical layout
    let original_layout = PhysicalLayout::builder(agent)
        .with_config(config.clone())
        .layer_separate(BlockDimension::BlockIsFirstDim)
        .with_registered_regions(regions)
        .expect("failed to provide regions")
        .build()
        .expect("failed to build layout");

    // Serialize to LayoutDescriptor
    let serialized = original_layout
        .to_descriptor()
        .expect("failed to serialize layout");

    // Verify serialized data
    assert_eq!(serialized.version, LayoutDescriptor::CURRENT_VERSION);
    assert_eq!(serialized.layout_config, config);
    assert_eq!(serialized.memory_descriptors.len(), 4); // One per layer

    // Verify memory descriptors
    for (layer, desc) in serialized.memory_descriptors.iter().enumerate() {
        assert_eq!(desc.addr, 0x10000 + layer * per_layer_size);
        assert_eq!(desc.size, per_layer_size);
    }

    // Serialize to JSON bytes
    let json_bytes = serialized
        .to_json_bytes()
        .expect("failed to serialize to JSON bytes");

    // Deserialize from JSON bytes
    let deserialized = LayoutDescriptor::from_json_bytes(&json_bytes)
        .expect("failed to deserialize from JSON bytes");

    // Verify deserialized matches original
    assert_eq!(deserialized.version, serialized.version);
    assert_eq!(deserialized.layout_config, serialized.layout_config);
    assert_eq!(
        deserialized.memory_descriptors.len(),
        serialized.memory_descriptors.len()
    );

    // Reconstruct layout from serialized data
    let reconstructed =
        PhysicalLayout::from_descriptor(deserialized).expect("failed to reconstruct layout");

    // Verify reconstructed layout has same configuration
    let rebuilt = reconstructed.layout();
    assert_eq!(rebuilt.config(), &config);
    assert_eq!(reconstructed.location(), StorageKind::System);
    assert_eq!(rebuilt.num_blocks(), 10);
    assert_eq!(rebuilt.num_layers(), 4);
    assert!(!rebuilt.is_fully_contiguous());
}
/// A layout rebuilt from its descriptor must compute the same region addresses and
/// sizes as the addressing scheme spelled out in this test.
#[test]
fn test_memory_region_calculation_after_deserialization() {
    let agent = NixlAgent::new("test-memory-calc").expect("failed to create agent");

    let config = LayoutConfig::builder()
        .num_blocks(2)
        .num_layers(2)
        .outer_dim(2)
        .page_size(4)
        .inner_dim(8)
        .dtype_width_bytes(2)
        .build()
        .unwrap();

    let required_size = config.num_blocks
        * config.num_layers
        * config.outer_dim
        * config.page_size
        * config.inner_dim
        * config.dtype_width_bytes;

    let memory = TestMemoryRegion::new(0x1000, required_size, StorageKind::System);
    let regions = vec![Buffer::from_arc(memory as Arc<dyn MemoryDescriptor>)];

    let original_layout = PhysicalLayout::builder(agent)
        .with_config(config.clone())
        .fully_contiguous()
        .with_registered_regions(regions)
        .expect("failed to provide regions")
        .build()
        .expect("failed to build layout");

    // Serialize and deserialize
    let serialized = original_layout
        .to_descriptor()
        .expect("failed to serialize");
    let reconstructed =
        PhysicalLayout::from_descriptor(serialized).expect("failed to reconstruct");

    // Size of one (block, layer, outer) region.
    let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes;

    // Verify memory region calculations: the very first region starts at the base address.
    let first = reconstructed
        .memory_region(0, 0, 0)
        .expect("failed to get memory region");
    assert_eq!(first.addr, 0x1000);
    assert_eq!(first.size, region_size);

    // Test different block/layer/outer indices
    let offset_region = reconstructed
        .memory_region(1, 1, 1)
        .expect("failed to get memory region");

    // Address should be: base + block_stride + layer_stride + outer_stride
    let layer_stride = config.outer_dim * region_size;
    let block_stride = config.num_layers * layer_stride;
    let expected_addr = 0x1000 + block_stride + layer_stride + region_size;
    assert_eq!(offset_region.addr, expected_addr);
}
/// A descriptor with a version newer than the supported one is rejected, and the
/// same descriptor is accepted once it carries the current version and one region.
#[test]
fn test_version_check_on_deserialization() {
    use crate::layout::physical::NixlMetadata;
    use crate::layout::{BlockFormat, FullyContiguousDetails, KvBlockLayout, LayoutTypeDetails};

    let config = make_test_config();

    // Calculate required size for fully contiguous layout
    let required_size = config.num_blocks
        * config.num_layers
        * config.outer_dim
        * config.page_size
        * config.inner_dim
        * config.dtype_width_bytes;

    let mut serialized = LayoutDescriptor {
        version: 999, // Future version
        layout_config: config.clone(),
        location: StorageKind::System,
        nixl_metadata: NixlMetadata::new("test".to_string(), MemType::Dram, 0),
        memory_descriptors: vec![],
        layout_type_details: LayoutTypeDetails::FullyContiguous(FullyContiguousDetails {
            block_format: BlockFormat::Operational,
            kv_block_layout: KvBlockLayout::OperationalNHD,
        }),
    };

    // Should fail with unsupported version
    let rejected = PhysicalLayout::from_descriptor(serialized.clone());
    assert!(rejected.is_err());
    assert!(
        rejected
            .unwrap_err()
            .to_string()
            .contains("Unsupported serialization version")
    );

    // Should succeed with supported version
    serialized.version = LayoutDescriptor::CURRENT_VERSION;
    serialized.memory_descriptors = vec![MemoryRegion {
        addr: 0x1000,
        size: required_size,
    }];

    let accepted = PhysicalLayout::from_descriptor(serialized);
    if let Err(e) = &accepted {
        eprintln!("Error during deserialization: {}", e);
    }
    assert!(
        accepted.is_ok(),
        "Expected successful deserialization, got error: {:?}",
        accepted.err()
    );

    let layout = accepted.unwrap();
    assert_eq!(
        layout.layout().block_layout(),
        KvBlockLayout::OperationalNHD
    );
}
lib/kvbm-physical/src/layout/validation.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Tensor validation utilities for layout creation.
use
anyhow
::{
Result
,
anyhow
};
use
std
::
sync
::
Arc
;
use
dynamo_memory
::
TensorDescriptor
;
/// Tensor layout format (for future TP translation).
// The variant names are the conventional layout acronyms, hence the lint exception.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TensorFormat {
    /// `[N, H, D]` ordering, where N=block_size, H=heads, D=hidden.
    NHD,

    /// `[H, N, D]` ordering, where H=heads, N=block_size, D=hidden.
    HND,

    /// The format is unknown or ambiguous.
    Unknown,
}
/// Check the strides of every tensor and guess the tensor format.
///
/// For each tensor the strides must never increase from one dimension to the next
/// (equal neighbours are accepted). Tensors with at least three shape dimensions
/// then take part in a format heuristic that compares the first two strides; the
/// last tensor for which those strides differ decides the result.
///
/// NOTE(review): by the time the heuristic runs, the stride check has already
/// rejected any tensor whose second stride is larger than its first, so the
/// `HND` outcome cannot be produced by this function as written. The intended
/// HND detection rule should be confirmed.
///
/// # Arguments
/// * `tensors` - Slice of tensors to validate
///
/// # Returns
/// The detected tensor format (NHD, HND, or Unknown)
///
/// # Errors
/// * `tensors` is empty.
/// * A tensor has fewer than two strides.
/// * A tensor has a stride larger than the stride before it.
#[expect(dead_code)]
pub fn validate_tensor_strides(tensors: &[Arc<dyn TensorDescriptor>]) -> Result<TensorFormat> {
    if tensors.is_empty() {
        return Err(anyhow!("Cannot validate empty tensor list"));
    }

    let mut detected = TensorFormat::Unknown;

    for tensor in tensors {
        let strides = tensor.stride();
        let dims = tensor.shape();

        if strides.len() < 2 {
            return Err(anyhow!(
                "Tensor must have at least 2 dimensions, got stride: {:?}",
                strides
            ));
        }

        // Walk the strides while carrying the previous one; starting from usize::MAX
        // means the first stride can never fail the comparison.
        strides.iter().enumerate().try_fold(
            usize::MAX,
            |previous, (position, &current)| -> Result<usize> {
                if current > previous {
                    return Err(anyhow!(
                        "Tensor strides must be monotonically decreasing (until inner dimension). \
                         Got stride: {:?} at position {}",
                        strides,
                        position
                    ));
                }
                Ok(current)
            },
        )?;

        // Heuristic NHD/HND detection from the first two strides; equal strides leave
        // the previously detected format untouched.
        if dims.len() >= 3 {
            match strides[0].cmp(&strides[1]) {
                std::cmp::Ordering::Less => detected = TensorFormat::HND,
                std::cmp::Ordering::Greater => detected = TensorFormat::NHD,
                std::cmp::Ordering::Equal => {}
            }
        }
    }

    Ok(detected)
}
/// Ensure every tensor reports the same shape as the first one.
///
/// # Arguments
/// * `tensors` - Slice of tensors to validate
///
/// # Returns
/// An owned copy of the shape shared by all tensors
///
/// # Errors
/// Fails when `tensors` is empty or when any tensor's shape differs from the
/// first tensor's shape.
#[expect(dead_code)]
pub fn validate_tensor_shapes(tensors: &[Arc<dyn TensorDescriptor>]) -> Result<Vec<usize>> {
    let Some((reference, others)) = tensors.split_first() else {
        return Err(anyhow!("Cannot validate empty tensor list"));
    };

    let expected = reference.shape();
    for candidate in others {
        let actual = candidate.shape();
        if actual != expected {
            return Err(anyhow!(
                "All tensors must have the same shape. Expected {:?}, got {:?}",
                expected,
                actual
            ));
        }
    }

    Ok(expected.to_vec())
}
/// Collapse a multi-dimensional shape into a single extent.
///
/// Returns the product of all entries in `shape`; an empty slice yields `1`.
#[allow(dead_code)]
pub fn determine_compressed_shape(shape: &[usize]) -> usize {
    shape.iter().copied().product()
}
#[cfg(all(test,
feature
=
"testing-kvbm"
))]
mod
tests
{
// Note: These tests would require mock TorchTensor implementations
// which we can add if needed for testing infrastructure
}
lib/kvbm-physical/src/lib.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub
mod
layout
;
pub
mod
manager
;
pub
mod
transfer
;
pub
use
manager
::
TransferManager
;
pub
use
transfer
::{
TransferConfig
,
TransferOptions
};
pub
use
kvbm_common
::
BlockId
;
pub
type
SequenceHash
=
kvbm_common
::
SequenceHash
;
/// Reminder test that runs only when the `testing-kvbm` feature is disabled.
///
/// It never fails; it just prints how to enable the functional test suite.
#[cfg(all(test, not(feature = "testing-kvbm")))]
mod sentinel {
    #[test]
    #[allow(non_snake_case)]
    fn all_functional_tests_skipped___enable_testing_kvbm() {
        eprintln!(
            "kvbm-physical functional tests require feature `testing-kvbm`. \
             Run with: cargo test -p kvbm-physical --features testing-kvbm"
        );
    }
}
lib/kvbm-physical/src/manager/handle.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Layout handle type encoding worker ID and layout ID.
use
bincode
::{
Decode
,
Encode
};
use
serde
::{
Deserialize
,
Serialize
};
/// Unique handle for a layout combining worker_id and layout_id.
///
/// The handle encodes:
/// - Bits 0-63: worker_id (u64)
/// - Bits 64-79: layout_id (u16)
/// - Bits 80-127: Reserved (48 bits, currently unused)
///
/// The whole value is stored as one `u128`. Equality and hashing are derived
/// on that raw value, so the reserved bits take part in comparisons too.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode, Serialize, Deserialize)]
pub struct LayoutHandle(u128);
impl LayoutHandle {
    /// Pack a worker id and a layout id into a single handle.
    ///
    /// # Arguments
    /// * `worker_id` - Unique identifier for the worker (stored in bits 0-63)
    /// * `layout_id` - Layout identifier within the worker (stored in bits 64-79)
    pub fn new(worker_id: u64, layout_id: u16) -> Self {
        let low = u128::from(worker_id);
        let high = u128::from(layout_id) << 64;
        Self(high | low)
    }

    /// Read back the worker id stored in bits 0-63.
    pub fn worker_id(&self) -> u64 {
        let low_bits = self.0 & u128::from(u64::MAX);
        // Lossless: everything above bit 63 was masked off.
        low_bits as u64
    }

    /// Read back the layout id stored in bits 64-79.
    pub fn layout_id(&self) -> u16 {
        let shifted = self.0 >> 64;
        // Lossless: only the low 16 bits survive the mask.
        (shifted & u128::from(u16::MAX)) as u16
    }

    /// Expose the raw `u128` representation.
    pub fn as_u128(&self) -> u128 {
        self.0
    }

    /// Rebuild a handle from a raw `u128`.
    ///
    /// No bits are masked, so reserved bits are preserved; this is the
    /// counterpart of `as_u128()` for deserialization roundtrips.
    pub fn from_u128(value: u128) -> Self {
        Self(value)
    }
}
/// Human-readable rendering showing the decoded worker and layout ids.
impl std::fmt::Display for LayoutHandle {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let worker = self.worker_id();
        let layout = self.layout_id();
        write!(f, "LayoutHandle(worker={worker}, layout={layout})")
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;

    /// Both fields survive packing into the handle.
    #[test]
    fn test_handle_encoding() {
        let expected_worker = 0x1234_5678_9ABC_DEF0u64;
        let expected_layout = 0x4242u16;

        let packed = LayoutHandle::new(expected_worker, expected_layout);

        assert_eq!(packed.worker_id(), expected_worker);
        assert_eq!(packed.layout_id(), expected_layout);
    }

    /// `as_u128` followed by `from_u128` yields an equal handle.
    #[test]
    fn test_handle_roundtrip() {
        let original = LayoutHandle::new(42, 100);
        let rebuilt = LayoutHandle::from_u128(original.as_u128());

        assert_eq!(original, rebuilt);
        assert_eq!(rebuilt.worker_id(), 42);
        assert_eq!(rebuilt.layout_id(), 100);
    }

    /// The largest representable ids do not bleed into each other.
    #[test]
    fn test_handle_max_values() {
        let packed = LayoutHandle::new(u64::MAX, u16::MAX);

        assert_eq!(packed.worker_id(), u64::MAX);
        assert_eq!(packed.layout_id(), u16::MAX);
    }

    /// Encoding with bincode and decoding again returns the same handle.
    #[test]
    fn test_handle_bincode_roundtrip() {
        let original = LayoutHandle::new(999, 42);
        let config = bincode::config::standard();

        let bytes = bincode::encode_to_vec(original, config).unwrap();
        let (restored, _consumed): (LayoutHandle, usize) =
            bincode::decode_from_slice(&bytes, config).unwrap();

        assert_eq!(original, restored);
    }

    /// The `Display` output mentions both ids.
    #[test]
    fn test_handle_display() {
        let rendered = LayoutHandle::new(123, 456).to_string();

        assert!(rendered.contains("123"));
        assert!(rendered.contains("456"));
    }
}
lib/kvbm-physical/src/manager/local.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Local layout wrapper with handle and metadata.
use
std
::
ops
::
Deref
;
use
super
::
handle
::
LayoutHandle
;
use
crate
::
layout
::
PhysicalLayout
;
/// A local physical layout with an assigned handle.
///
/// This wraps a `PhysicalLayout` that exists on the local worker,
/// associating it with a unique handle that combines the worker_id
/// and a locally-assigned layout_id.
///
/// This type is cheap to clone as `PhysicalLayout` contains `Arc` internally.
#[derive(Debug, Clone)]
pub struct LocalLayout {
    // Identity of this layout (worker_id + layout_id packed together).
    handle: LayoutHandle,
    // The wrapped physical layout.
    layout: PhysicalLayout,
}
#[allow(dead_code)]
impl
LocalLayout
{
/// Create a new local layout.
///
/// # Arguments
/// * `handle` - Unique handle for this layout
/// * `layout` - The physical layout
pub
fn
new
(
handle
:
LayoutHandle
,
layout
:
PhysicalLayout
)
->
Self
{
Self
{
handle
,
layout
}
}
/// Get the handle for this layout.
pub
fn
handle
(
&
self
)
->
LayoutHandle
{
self
.handle
}
/// Get a reference to the physical layout.
pub
fn
layout
(
&
self
)
->
&
PhysicalLayout
{
&
self
.layout
}
/// Get the worker_id from the handle.
pub
fn
worker_id
(
&
self
)
->
u64
{
self
.handle
.worker_id
()
}
/// Get the layout_id from the handle.
pub
fn
layout_id
(
&
self
)
->
u16
{
self
.handle
.layout_id
()
}
/// Consume this local layout and return the physical layout.
pub
fn
into_layout
(
self
)
->
PhysicalLayout
{
self
.layout
}
}
/// Lets a `LocalLayout` be used wherever a `&PhysicalLayout` is expected.
impl Deref for LocalLayout {
    type Target = PhysicalLayout;

    fn deref(&self) -> &PhysicalLayout {
        let Self { layout, .. } = self;
        layout
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;
    use crate::layout::{LayoutConfig, PhysicalLayout};
    use dynamo_memory::nixl::NixlAgent;

    /// Create a NIXL agent for a test, panicking if creation fails.
    fn create_test_agent(name: &str) -> NixlAgent {
        NixlAgent::new(name).expect("failed to create agent")
    }

    /// Build a small fully-contiguous layout backed by system memory.
    fn make_test_layout() -> PhysicalLayout {
        let layout_config = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(4)
            .inner_dim(8)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        PhysicalLayout::builder(create_test_agent("test-local"))
            .with_config(layout_config)
            .fully_contiguous()
            .allocate_system()
            .build()
            .unwrap()
    }

    /// The wrapper reports the handle and the ids encoded in it.
    #[test]
    fn test_local_layout_creation() {
        let handle = LayoutHandle::new(42, 100);
        let wrapped = LocalLayout::new(handle, make_test_layout());

        assert_eq!(wrapped.handle(), handle);
        assert_eq!(wrapped.worker_id(), 42);
        assert_eq!(wrapped.layout_id(), 100);
    }

    /// `into_layout` consumes the wrapper and returns the layout.
    #[test]
    fn test_local_layout_into_layout() {
        let wrapped = LocalLayout::new(LayoutHandle::new(1, 2), make_test_layout());

        // Reaching this point without a panic is the whole assertion.
        let _recovered = wrapped.into_layout();
    }
}
lib/kvbm-physical/src/manager/metadata.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Serialization types for exporting/importing layout metadata with NIXL integration.
use
super
::
handle
::
LayoutHandle
;
use
crate
::
layout
::
LayoutDescriptor
;
use
anyhow
::
Result
;
use
bincode
::{
Decode
,
Encode
};
use
serde
::{
Deserialize
,
Serialize
};
use
kvbm_common
::
LogicalLayoutHandle
;
/// Worker identification combining worker_id and NIXL agent name.
///
/// Both fields are public and the type is bincode-encodable, so it can be
/// embedded directly in exported metadata.
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)]
pub struct WorkerAddress {
    /// Unique identifier for this worker
    pub worker_id: u64,
    /// NIXL agent name on this worker
    pub nixl_agent_name: String,
}
impl
WorkerAddress
{
/// Create a new worker address.
pub
fn
new
(
worker_id
:
u64
,
nixl_agent_name
:
String
)
->
Self
{
Self
{
worker_id
,
nixl_agent_name
,
}
}
}
/// Layout descriptor with its assigned handle and logical type for RDMA metadata exchange.
///
/// This includes the logical layout type (G1, G2, G3, G4) so that remote instances
/// know which physical handle corresponds to which tier.
///
/// `logical_type` and `layout` are encoded through their Serde implementations
/// (`#[bincode(with_serde)]`); `handle` uses its native bincode encoding.
#[derive(Debug, Clone, Encode, Decode)]
pub struct LogicalLayoutDescriptor {
    /// Unique handle for this layout
    pub handle: LayoutHandle,
    /// The logical layout type (G1, G2, G3, G4)
    #[bincode(with_serde)]
    pub logical_type: LogicalLayoutHandle,
    /// Serialized layout data (uses Serde, bridged via bincode)
    #[bincode(with_serde)]
    pub layout: LayoutDescriptor,
}
impl
LogicalLayoutDescriptor
{
/// Create a new layout descriptor with handle and logical type.
pub
fn
new
(
handle
:
LayoutHandle
,
logical_type
:
LogicalLayoutHandle
,
layout
:
LayoutDescriptor
,
)
->
Self
{
Self
{
handle
,
logical_type
,
layout
,
}
}
/// Create a layout descriptor with G2 as the default logical type.
///
/// This is provided for backwards compatibility with code that doesn't
/// track logical types. G2 is used as the default since it's the most
/// common tier for RDMA transfers (GPU memory for KV cache).
///
/// For proper RDMA transfers between instances, use `new()` with the
/// correct logical type from the Worker's registered handles.
pub
fn
new_with_default_type
(
handle
:
LayoutHandle
,
layout
:
LayoutDescriptor
)
->
Self
{
Self
{
handle
,
logical_type
:
LogicalLayoutHandle
::
G2
,
layout
,
}
}
}
/// Type alias for backwards compatibility.
///
/// `LocalLayoutDescriptor` names exactly the same type as
/// [`LogicalLayoutDescriptor`].
pub type LocalLayoutDescriptor = LogicalLayoutDescriptor;
/// The set of [`LogicalLayoutDescriptor`] that are RDMA enabled. This object packages the detail
/// about the layouts and the NIXL RDMA metadata required to reconstruct the layouts and access
/// the memory via NIXL RDMA.
///
/// This is the structure that `SerializedLayout::pack` encodes and
/// `SerializedLayout::unpack` decodes.
#[derive(Debug, Encode, Decode)]
pub struct RdmaLayoutDescriptors {
    /// Worker identification
    pub worker_address: WorkerAddress,
    /// Exported NIXL metadata from nixl_sys::Agent::get_local_md()
    pub nixl_metadata: Vec<u8>,
    /// Serialized layouts (handle + logical type + layout data)
    pub layouts: Vec<LogicalLayoutDescriptor>,
}
/// Managed memory metadata package for export/import.
///
/// This is the wire format for transmitting layout metadata between workers.
/// It contains everything needed to reconstruct remote layouts and load their
/// NIXL registration data.
///
/// The wrapper holds the already-encoded bytes; with `#[serde(transparent)]`
/// Serde treats it exactly like the inner `Vec<u8>`.
#[derive(Clone, Serialize, Deserialize, Encode, Decode)]
#[serde(transparent)]
pub struct SerializedLayout(Vec<u8>);
impl
SerializedLayout
{
/// Pack metadata into a serialized form.
///
/// # Arguments
/// * `worker_address` - Worker identification
/// * `nixl_metadata` - NIXL metadata blob from get_local_md()
/// * `layouts` - Vector of layouts with handles and logical types to export
///
/// # Returns
/// Packed metadata ready for transmission
pub
fn
pack
(
worker_address
:
WorkerAddress
,
nixl_metadata
:
Vec
<
u8
>
,
layouts
:
Vec
<
LogicalLayoutDescriptor
>
,
)
->
Result
<
Self
>
{
let
inner
=
RdmaLayoutDescriptors
{
worker_address
,
nixl_metadata
,
layouts
,
};
let
bytes
=
bincode
::
encode_to_vec
(
&
inner
,
bincode
::
config
::
standard
())
.map_err
(|
e
|
anyhow
::
anyhow!
(
"failed to encode managed memory metadata: {}"
,
e
))
?
;
Ok
(
Self
(
bytes
))
}
/// Unpack metadata from serialized form.
///
/// # Returns
/// Unpacked metadata structure
pub
fn
unpack
(
&
self
)
->
Result
<
RdmaLayoutDescriptors
>
{
let
(
inner
,
_
)
=
bincode
::
decode_from_slice
(
&
self
.0
,
bincode
::
config
::
standard
())
.map_err
(|
e
|
anyhow
::
anyhow!
(
"failed to decode managed memory metadata: {}"
,
e
))
?
;
Ok
(
inner
)
}
/// Get the raw bytes.
pub
fn
as_bytes
(
&
self
)
->
&
[
u8
]
{
&
self
.0
}
/// Create from raw bytes.
pub
fn
from_bytes
(
bytes
:
Vec
<
u8
>
)
->
Self
{
Self
(
bytes
)
}
/// Get the size in bytes.
pub
fn
len
(
&
self
)
->
usize
{
self
.0
.len
()
}
/// Check if empty.
pub
fn
is_empty
(
&
self
)
->
bool
{
self
.0
.is_empty
()
}
}
/// Debug output reports only the payload size, not the raw bytes.
impl std::fmt::Debug for SerializedLayout {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let size_bytes = self.len();
        let mut out = f.debug_struct("SerializedLayout");
        out.field("size_bytes", &size_bytes);
        out.finish()
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;
    use crate::layout::{
        BlockFormat, FullyContiguousDetails, KvBlockLayout, LayoutConfig, LayoutDescriptor,
        LayoutTypeDetails, NixlMetadata,
    };
    use dynamo_memory::{MemoryRegion, StorageKind, nixl};
    use kvbm_common::LogicalLayoutHandle;

    /// Build a small fully-contiguous layout descriptor for use in the tests.
    fn make_test_serialized_layout() -> LayoutDescriptor {
        let layout_config = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(4)
            .inner_dim(8)
            .dtype_width_bytes(2)
            .build()
            .unwrap();
        let nixl_metadata = NixlMetadata::new("test".to_string(), nixl::MemType::Dram, 0);
        let region = MemoryRegion {
            addr: 0x1000,
            size: 4096,
        };
        let details = FullyContiguousDetails {
            block_format: BlockFormat::Operational,
            kv_block_layout: KvBlockLayout::OperationalNHD,
        };

        LayoutDescriptor {
            version: 1,
            layout_config,
            location: StorageKind::System,
            nixl_metadata,
            memory_descriptors: vec![region],
            layout_type_details: LayoutTypeDetails::FullyContiguous(details),
        }
    }

    /// The constructor stores both fields unchanged.
    #[test]
    fn test_worker_address() {
        let address = WorkerAddress::new(42, "test_agent".to_string());

        assert_eq!(address.worker_id, 42);
        assert_eq!(address.nixl_agent_name, "test_agent");
    }

    /// A descriptor keeps the handle and logical type it was built with.
    #[test]
    fn test_serialized_layout_with_handle() {
        let handle = LayoutHandle::new(1, 2);
        let descriptor = LogicalLayoutDescriptor::new(
            handle,
            LogicalLayoutHandle::G2,
            make_test_serialized_layout(),
        );

        assert_eq!(descriptor.handle, handle);
        assert_eq!(descriptor.logical_type, LogicalLayoutHandle::G2);
    }

    /// Packing then unpacking returns the same address, metadata and layout.
    #[test]
    fn test_metadata_pack_unpack() {
        let worker_address = WorkerAddress::new(100, "worker_100".to_string());
        let nixl_metadata = vec![1, 2, 3, 4, 5];
        let layouts = vec![LogicalLayoutDescriptor::new(
            LayoutHandle::new(100, 1),
            LogicalLayoutHandle::G2,
            make_test_serialized_layout(),
        )];

        let packed =
            SerializedLayout::pack(worker_address.clone(), nixl_metadata.clone(), layouts).unwrap();
        assert!(!packed.is_empty());

        let unpacked = packed.unpack().unwrap();
        assert_eq!(unpacked.worker_address, worker_address);
        assert_eq!(unpacked.nixl_metadata, nixl_metadata);
        assert_eq!(unpacked.layouts.len(), 1);

        let only = &unpacked.layouts[0];
        assert_eq!(only.handle.worker_id(), 100);
        assert_eq!(only.handle.layout_id(), 1);
        assert_eq!(only.logical_type, LogicalLayoutHandle::G2);
    }

    /// Several layouts keep their order, ids and logical types.
    #[test]
    fn test_metadata_multiple_layouts() {
        let worker_address = WorkerAddress::new(200, "worker_200".to_string());
        let nixl_metadata = vec![10, 20, 30];
        let layouts = vec![
            LogicalLayoutDescriptor::new(
                LayoutHandle::new(200, 1),
                LogicalLayoutHandle::G1,
                make_test_serialized_layout(),
            ),
            LogicalLayoutDescriptor::new(
                LayoutHandle::new(200, 2),
                LogicalLayoutHandle::G2,
                make_test_serialized_layout(),
            ),
            LogicalLayoutDescriptor::new(
                LayoutHandle::new(200, 3),
                LogicalLayoutHandle::G3,
                make_test_serialized_layout(),
            ),
        ];

        let unpacked = SerializedLayout::pack(worker_address, nixl_metadata, layouts)
            .unwrap()
            .unpack()
            .unwrap();
        assert_eq!(unpacked.layouts.len(), 3);

        let expected_logical_types = [
            LogicalLayoutHandle::G1,
            LogicalLayoutHandle::G2,
            LogicalLayoutHandle::G3,
        ];
        for (index, (descriptor, expected_type)) in unpacked
            .layouts
            .iter()
            .zip(expected_logical_types.iter())
            .enumerate()
        {
            assert_eq!(descriptor.handle.worker_id(), 200);
            assert_eq!(descriptor.handle.layout_id(), (index + 1) as u16);
            assert_eq!(&descriptor.logical_type, expected_type);
        }
    }

    /// Bytes taken from a packed blob can be re-wrapped and decoded.
    #[test]
    fn test_metadata_from_bytes() {
        let packed = SerializedLayout::pack(
            WorkerAddress::new(42, "test".to_string()),
            vec![1, 2, 3],
            vec![],
        )
        .unwrap();

        let raw = packed.as_bytes().to_vec();
        let unpacked = SerializedLayout::from_bytes(raw).unpack().unwrap();

        assert_eq!(unpacked.worker_address.worker_id, 42);
    }
}
lib/kvbm-physical/src/manager/mod.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Transport manager for local and remote physical layouts with transfer execution.
mod
handle
;
mod
local
;
mod
metadata
;
mod
remote
;
pub
use
handle
::
LayoutHandle
;
pub
use
metadata
::{
LogicalLayoutDescriptor
,
SerializedLayout
,
WorkerAddress
};
pub
(
crate
)
use
local
::
LocalLayout
;
pub
(
crate
)
use
metadata
::
LocalLayoutDescriptor
;
pub
(
crate
)
use
remote
::
RemoteLayout
;
use
crate
::
layout
::
PhysicalLayout
;
use
crate
::
transfer
::
BounceBufferInternal
;
use
crate
::
transfer
::
TransferContext
;
use
crate
::
transfer
::
context
::
TransferCompleteNotification
;
use
crate
::
transfer
::
executor
::
TransferOptionsInternal
;
use
crate
::
transfer
::
options
::
TransferOptions
;
use
crate
::{
BlockId
,
SequenceHash
};
use
anyhow
::{
Result
,
anyhow
,
bail
};
use
dynamo_memory
::
StorageKind
;
use
dynamo_memory
::
nixl
::
NixlAgent
;
use
kvbm_common
::
LogicalLayoutHandle
;
use
std
::
collections
::{
HashMap
,
HashSet
};
use
std
::
sync
::
atomic
::{
AtomicU16
,
Ordering
};
use
std
::
sync
::{
Arc
,
RwLock
};
/// Public entry point for layout and transfer management.
///
/// TransferManager combines layout registration/metadata management with
/// transfer execution capabilities, providing a unified API for:
/// - Registering local layouts and obtaining handles
/// - Exporting/importing layout metadata for remote workers
/// - Executing transfers between layouts using handles
/// - Managing CUDA, NIXL, and other execution resources
///
/// Both fields are `Arc`s, so cloning a manager yields another handle to the
/// same registry and transfer context.
#[derive(Clone)]
pub struct TransferManager {
    // Shared registry of local and remote layouts, guarded by a read/write lock.
    registry: Arc<RwLock<LayoutRegistry>>,
    // Shared execution context used when running transfers.
    context: Arc<TransferContext>,
}
impl
TransferManager
{
/// Create a new TransferManager builder.
///
/// The builder configures the worker ID, NIXL agent, CUDA device,
/// and other execution parameters before creating the manager.
///
/// # Example
/// ```ignore
/// let manager = TransferManager::builder()
/// .worker_id(0) // NIXL agent name defaults to "worker-0"
/// .nixl_backend("ucx") // Optional: defaults to UCX from env
/// .cuda_device_id(0)
/// .build()?;
///
/// // Or with custom agent name:
/// let manager = TransferManager::builder()
/// .worker_id(0)
/// .nixl_agent_name("custom-agent")
/// .build()?;
/// ```
pub
fn
builder
()
->
crate
::
transfer
::
context
::
TransferConfigBuilder
{
TransferContext
::
builder
()
}
/// Create a TransferManager from a built TransferContext.
///
/// This is used internally by the builder to wrap the context
/// and create the associated registry.
pub
(
crate
)
fn
from_context
(
context
:
TransferContext
)
->
Self
{
let
worker_id
=
context
.worker_id
();
let
nixl_agent
=
context
.nixl_agent
()
.clone
();
let
registry
=
Arc
::
new
(
RwLock
::
new
(
LayoutRegistry
::
new
(
nixl_agent
,
worker_id
)));
Self
{
registry
,
context
:
Arc
::
new
(
context
),
}
}
// ===== Layout Registration and Metadata Management =====
/// Register a local physical layout and return a unique handle.
///
/// This registers the layout with the embedded memory manager, assigning
/// it a unique handle that can be used for handle-based transfers.
///
/// # Arguments
/// * `layout` - Physical layout to register
///
/// # Returns
/// Unique handle for the registered layout
///
/// # Errors
/// Returns an error if layout IDs are exhausted (u16::MAX reached)
pub
fn
register_layout
(
&
self
,
layout
:
PhysicalLayout
)
->
Result
<
LayoutHandle
>
{
self
.registry
.write
()
.unwrap
()
.register_local
(
layout
)
}
/// Export layout metadata for transmission to remote workers.
///
/// This exports all registered local layouts along with NIXL metadata
/// needed for remote memory registration.
///
/// # Returns
/// Packed metadata ready for transmission to remote workers
pub
fn
export_metadata
(
&
self
)
->
Result
<
SerializedLayout
>
{
self
.registry
.read
()
.unwrap
()
.export_metadata
()
}
/// Import remote layout metadata.
///
/// This loads NIXL metadata and reconstructs physical layouts from a remote
/// worker's exported metadata.
///
/// # Arguments
/// * `metadata` - Packed metadata from remote worker
///
/// # Returns
/// Vector of handles for the imported remote layouts
///
/// # Errors
/// Returns an error if the remote worker was already loaded or if metadata
/// loading/reconstruction fails
pub
fn
import_metadata
(
&
self
,
metadata
:
SerializedLayout
)
->
Result
<
Vec
<
LayoutHandle
>>
{
self
.registry
.write
()
.unwrap
()
.import_metadata
(
metadata
)
}
/// Build a logical layout descriptor for a specific handle.
///
/// This creates a descriptor that includes the logical layout type (G1, G2, G3, G4)
/// for use in RDMA metadata exchange. The caller must provide the logical type
/// mapping since only the caller (e.g., DirectWorker) knows which handle corresponds
/// to which logical tier.
///
/// # Arguments
/// * `handle` - Handle to the local layout
/// * `logical_type` - The logical tier (G1, G2, G3, G4) this handle represents
///
/// # Returns
/// A LogicalLayoutDescriptor ready for serialization
///
/// # Errors
/// Returns an error if the handle is not found or serialization fails
pub
fn
build_logical_descriptor
(
&
self
,
handle
:
LayoutHandle
,
logical_type
:
LogicalLayoutHandle
,
)
->
Result
<
LogicalLayoutDescriptor
>
{
self
.registry
.read
()
.unwrap
()
.build_logical_descriptor
(
handle
,
logical_type
)
}
/// Get the NIXL metadata for this worker.
///
/// Returns the raw NIXL metadata bytes needed for remote registration.
pub
fn
get_nixl_metadata
(
&
self
)
->
Result
<
Vec
<
u8
>>
{
self
.registry
.read
()
.unwrap
()
.get_nixl_metadata
()
}
/// Get the worker address for this manager.
pub
fn
worker_address
(
&
self
)
->
WorkerAddress
{
self
.registry
.read
()
.unwrap
()
.worker_address
()
}
/// Get a reference to the NIXL agent.
///
/// This is useful for building layouts that need to register memory
/// with the same agent that the TransferManager uses.
pub
fn
nixl_agent
(
&
self
)
->
&
NixlAgent
{
self
.context
.nixl_agent
()
}
/// Get the layout configuration for a registered layout.
///
/// Returns a clone of the layout's configuration, which includes
/// dimensions like num_blocks, num_layers, page_size, etc.
///
/// # Arguments
/// * `handle` - Handle to a registered layout (local or remote)
///
/// # Returns
/// A clone of the layout's configuration
///
/// # Errors
/// Returns an error if the handle is not found
pub
fn
get_layout_config
(
&
self
,
handle
:
LayoutHandle
)
->
Result
<
crate
::
layout
::
LayoutConfig
>
{
let
registry
=
self
.registry
.read
()
.unwrap
();
let
physical_layout
=
registry
.get_layout
(
handle
)
.ok_or_else
(||
anyhow!
(
"invalid handle: {}"
,
handle
))
?
;
Ok
(
physical_layout
.layout
()
.config
()
.clone
())
}
// ===== Handle-Based Transfer API =====
/// Transfer complete blocks between layouts using handles.
///
/// This function copies entire blocks (all layers and outer dimensions) between
/// the source and destination layouts identified by their handles. The transfer
/// strategy (memcpy, CUDA, NIXL) is automatically selected based on storage locations.
///
/// The lock on the registry is held only briefly during layout lookup,
/// then released before executing the actual transfer.
///
/// # Arguments
/// * `src_handle` - Handle to source layout
/// * `src_blocks` - Source block IDs to transfer
/// * `dst_handle` - Handle to destination layout
/// * `dst_blocks` - Destination block IDs to transfer
///
/// # Returns
/// A notification handle that can be awaited for transfer completion
///
/// # Errors
/// Returns an error if:
/// - Either handle is invalid
/// - Block IDs are out of bounds
/// - Transfer execution fails
pub
fn
execute_transfer
(
&
self
,
src_handle
:
LayoutHandle
,
src_blocks
:
&
[
BlockId
],
dst_handle
:
LayoutHandle
,
dst_blocks
:
&
[
BlockId
],
options
:
TransferOptions
,
)
->
Result
<
TransferCompleteNotification
>
{
// Clone layouts inside the lock, then drop lock before transfer
let
(
src_layout
,
dst_layout
)
=
{
let
registry
=
self
.registry
.read
()
.unwrap
();
let
src
=
registry
.get_layout
(
src_handle
)
.ok_or_else
(||
anyhow!
(
"invalid source handle: {}"
,
src_handle
))
?
.clone
();
// Cheap: just Arc refcount bump
let
dst
=
registry
.get_layout
(
dst_handle
)
.ok_or_else
(||
anyhow!
(
"invalid destination handle: {}"
,
dst_handle
))
?
.clone
();
(
src
,
dst
)
};
// Lock released here
let
(
layer_range
,
nixl_write_notification
,
bounce_buffer
,
cuda_stream
,
src_kv_layout
,
dst_kv_layout
,
)
=
options
.dissolve
();
let
mut
internal_options
=
TransferOptionsInternal
::
builder
();
if
let
Some
(
range
)
=
layer_range
{
internal_options
=
internal_options
.layer_range
(
range
);
}
if
let
Some
(
notification
)
=
nixl_write_notification
{
internal_options
=
internal_options
.nixl_write_notification
(
notification
);
}
if
let
Some
(
bounce
)
=
bounce_buffer
{
let
(
handle
,
block_ids
)
=
bounce
.into_parts
();
let
bounce_buffer
=
self
.create_bounce_buffer
(
handle
,
block_ids
)
?
;
internal_options
=
internal_options
.bounce_buffer
(
bounce_buffer
);
}
if
let
Some
(
stream
)
=
cuda_stream
{
internal_options
=
internal_options
.cuda_stream
(
stream
);
}
if
let
Some
(
layout
)
=
src_kv_layout
{
internal_options
=
internal_options
.src_kv_layout
(
layout
);
}
if
let
Some
(
layout
)
=
dst_kv_layout
{
internal_options
=
internal_options
.dst_kv_layout
(
layout
);
}
let
options
=
internal_options
.build
()
?
;
tracing
::
debug!
(
src_handle
=
src_handle
.to_string
(),
dst_handle
=
dst_handle
.to_string
(),
"Executing transfer; src_blocks = {:?}; dst_blocks = {:?}"
,
src_blocks
,
dst_blocks
,
);
// Execute transfer with no lock held
super
::
transfer
::
executor
::
execute_transfer
(
&
src_layout
,
&
dst_layout
,
src_blocks
,
dst_blocks
,
options
,
&
self
.context
,
)
}
/// Execute a G4 offload.
///
/// Takes a LayoutHandle and a vector of block IDs for the source blocks and
/// a list of SequenceHashes for the destination blocks.
///
/// use an extension on TransferOptions to pass in the "rank/part" of the the object in a
/// multi-worker/multi-tp scenario.
pub
fn
execute_g4_offload
(
_
src_handle
:
LayoutHandle
,
_
src_blocks
:
&
[
BlockId
],
_
dst_object
:
&
[
SequenceHash
],
_
options
:
TransferOptions
,
// add rank/part to the options
)
->
Result
<
TransferCompleteNotification
>
{
// check registration cache for the remote object, if it's not found, register it with nixl
// register all non-registered blocks with nixl in parallel
// then extend super::transfer::executor to access the memory regions for the source
// and generate a nixl descriptor
todo!
(
"implement remote offload"
)
}
pub
fn
execute_g4_onboard
()
{
todo!
(
"implement remote onboard"
)
}
// ===== Query Methods =====
/// Get the worker ID for this manager.
pub
fn
worker_id
(
&
self
)
->
u64
{
self
.context
.worker_id
()
}
/// Get handles for all locally registered layouts.
pub
fn
get_local_handles
(
&
self
)
->
Vec
<
LayoutHandle
>
{
self
.registry
.read
()
.unwrap
()
.local_handles
()
}
/// Get handles for all imported remote layouts.
pub
fn
get_remote_handles
(
&
self
)
->
Vec
<
LayoutHandle
>
{
self
.registry
.read
()
.unwrap
()
.remote_handles
()
}
/// Get a clone of the physical layout for a given handle.
///
/// # Arguments
/// * `handle` - Handle to a registered layout (local or remote)
///
/// # Returns
/// A clone of the physical layout, or None if the handle is not found.
pub
fn
get_physical_layout
(
&
self
,
handle
:
LayoutHandle
)
->
Option
<
PhysicalLayout
>
{
self
.registry
.read
()
.unwrap
()
.get_layout
(
handle
)
.cloned
()
}
/// Create a bounce buffer specification from a layout handle and block IDs.
///
/// This resolves the layout handle to a physical layout and wraps it in a
/// BounceBufferSpec implementation for use in transfer options.
pub
(
crate
)
fn
create_bounce_buffer
(
&
self
,
handle
:
LayoutHandle
,
block_ids
:
Vec
<
BlockId
>
,
)
->
Result
<
BounceBufferInternal
>
{
let
layout
=
{
let
registry
=
self
.registry
.read
()
.unwrap
();
registry
.get_layout
(
handle
)
.ok_or_else
(||
anyhow!
(
"invalid bounce buffer handle: {}"
,
handle
))
?
.clone
()
};
Ok
(
BounceBufferInternal
::
from_layout
(
layout
,
block_ids
))
}
// ===== Internal Methods for Testing =====
/// Get the internal transfer context.
#[doc(hidden)]
pub
fn
context
(
&
self
)
->
&
TransferContext
{
&
self
.context
}
/// Get access to the internal layout registry.
///
/// This is primarily for testing utilities that need direct layout access
/// (e.g., fill patterns, checksum computation).
#[doc(hidden)]
pub
fn
registry
(
&
self
)
->
&
RwLock
<
LayoutRegistry
>
{
&
self
.registry
}
/// Get the H2D stream (for testing only).
#[cfg(test)]
#[allow(dead_code)]
pub
(
crate
)
fn
h2d_stream
(
&
self
)
->
&
std
::
sync
::
Arc
<
cudarc
::
driver
::
CudaStream
>
{
self
.context
.h2d_stream
()
}
/// Get the D2H stream (for testing only).
#[cfg(test)]
#[allow(dead_code)]
pub
(
crate
)
fn
d2h_stream
(
&
self
)
->
&
std
::
sync
::
Arc
<
cudarc
::
driver
::
CudaStream
>
{
self
.context
.d2h_stream
()
}
/// Get the CUDA context (for testing only).
#[cfg(test)]
#[allow(dead_code)]
pub
(
crate
)
fn
cuda_context
(
&
self
)
->
&
std
::
sync
::
Arc
<
cudarc
::
driver
::
CudaContext
>
{
self
.context
.cuda_context
()
}
/// Test-only helper: hands `event` to the transfer context's
/// `register_cuda_event` and returns the notification it produces.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn register_cuda_event(
    &self,
    event: cudarc::driver::CudaEvent,
) -> TransferCompleteNotification {
    let ctx = &self.context;
    ctx.register_cuda_event(event)
}
/// Test-only accessor: forwards to the transfer context's `cuda_pool()`.
///
/// Marked `#[expect(dead_code)]`, so the compiler reports an unfulfilled
/// expectation once this accessor gains a caller.
#[cfg(test)]
#[expect(dead_code)]
pub(crate) fn cuda_pool(&self) -> &std::sync::Arc<dynamo_memory::CudaMemPool> {
    let ctx = &self.context;
    ctx.cuda_pool()
}
}
/// Bookkeeping for every physical layout this worker knows about.
///
/// Responsibilities:
/// - assigning handles to locally registered layouts,
/// - exporting local layout metadata so other workers can address it,
/// - importing metadata from other workers and rebuilding their layouts,
/// - talking to the NIXL agent for local/remote metadata.
#[derive(Debug)]
#[doc(hidden)]
pub struct LayoutRegistry {
    /// Agent used to fetch local NIXL metadata and to load remote metadata.
    nixl_agent: NixlAgent,

    /// Identifier of the owning worker; embedded in every local handle.
    worker_id: u64,

    /// Source of the `layout_id` for the next locally registered layout.
    next_layout_id: AtomicU16,

    /// Layouts registered on this worker, keyed by handle.
    local_layouts: HashMap<LayoutHandle, LocalLayout>,

    /// Layouts reconstructed from other workers' metadata, keyed by handle.
    remote_layouts: HashMap<LayoutHandle, RemoteLayout>,

    /// `(agent_name, worker_id)` pairs already imported; used to reject a
    /// second import of the same remote worker.
    loaded_remotes: HashSet<(String, u64)>,
}
#[expect(dead_code)]
impl
LayoutRegistry
{
/// Construct an empty registry.
///
/// # Arguments
/// * `nixl_agent` - NIXL agent the registry will use for metadata operations
/// * `worker_id` - Identifier of the worker that owns this registry
///
/// The layout-id counter starts at 0 and no layouts or remotes are recorded.
pub(crate) fn new(nixl_agent: NixlAgent, worker_id: u64) -> Self {
    let next_layout_id = AtomicU16::new(0);
    let local_layouts = HashMap::new();
    let remote_layouts = HashMap::new();
    let loaded_remotes = HashSet::new();

    Self {
        nixl_agent,
        worker_id,
        next_layout_id,
        local_layouts,
        remote_layouts,
        loaded_remotes,
    }
}
/// Add `layout` to the set of local layouts and return its new handle.
///
/// The handle combines this registry's `worker_id` with the next value of
/// the internal layout-id counter.
///
/// # Errors
/// Fails with a "Layout ID overflow" error once the counter has reached
/// `u16::MAX`; nothing is registered in that case.
pub(crate) fn register_local(&mut self, layout: PhysicalLayout) -> Result<LayoutHandle> {
    // Refuse before incrementing so the counter can never wrap around.
    if self.next_layout_id.load(Ordering::SeqCst) == u16::MAX {
        bail!(
            "Layout ID overflow: maximum number of layouts ({}) reached",
            u16::MAX
        );
    }

    // `fetch_add` returns the previous value, which becomes this layout's id.
    let layout_id = self.next_layout_id.fetch_add(1, Ordering::SeqCst);
    let handle = LayoutHandle::new(self.worker_id, layout_id);

    self.local_layouts
        .insert(handle, LocalLayout::new(handle, layout));

    Ok(handle)
}
/// Package this worker's layouts for transmission to other workers.
///
/// The result bundles:
/// - the NIXL agent's local metadata,
/// - this worker's address (worker id + agent name),
/// - a descriptor for every local layout whose location is `System`,
///   `Device(_)` or `Pinned` (layouts in any other location are skipped).
///
/// # Errors
/// Fails if the NIXL metadata cannot be fetched, if any selected layout
/// cannot be turned into a descriptor, or if packing fails.
pub(crate) fn export_metadata(&self) -> Result<SerializedLayout> {
    let nixl_metadata = self
        .nixl_agent
        .get_local_md()
        .map_err(|e| anyhow!("failed to get NIXL local metadata: {:?}", e))?;

    let worker_address = WorkerAddress::new(self.worker_id, self.nixl_agent.name().to_string());

    // Select the exportable layouts and serialize each one; the first
    // serialization failure aborts the whole export.
    let serialized_layouts = self
        .local_layouts
        .iter()
        .filter(|(_, entry)| {
            matches!(
                entry.layout().location(),
                StorageKind::System | StorageKind::Device(_) | StorageKind::Pinned
            )
        })
        .map(|(handle, entry)| {
            entry
                .layout()
                .to_descriptor()
                .map(|descriptor| LocalLayoutDescriptor::new_with_default_type(*handle, descriptor))
                .map_err(|e| anyhow!("failed to serialize layout {}: {}", handle, e))
        })
        .collect::<Result<Vec<_>>>()?;

    SerializedLayout::pack(worker_address, nixl_metadata, serialized_layouts)
}
/// Import layout metadata exported by another worker.
///
/// Steps, in order:
/// 1. unpack `metadata`,
/// 2. reject the import if the `(agent name, worker id)` pair was already loaded,
/// 3. load the remote NIXL metadata into the local agent,
/// 4. check that the agent name reported by NIXL matches the one in the metadata,
/// 5. reconstruct every physical layout from its descriptor,
/// 6. store the reconstructed layouts and mark the remote worker as loaded.
///
/// Layouts are reconstructed into a staging list first and only committed to
/// the registry once *all* of them succeeded. A failure therefore leaves
/// `remote_layouts` and `loaded_remotes` exactly as they were, instead of
/// keeping a partial set of layouts for a worker that is not marked as loaded.
///
/// NOTE(review): a failure after step 3 still leaves the remote metadata
/// loaded in the NIXL agent; this function does not attempt to undo that.
///
/// # Arguments
/// * `metadata` - Packed metadata received from the remote worker
///
/// # Returns
/// The handles of the imported layouts, in the order they appear in the metadata.
///
/// # Errors
/// Returns an error if:
/// - unpacking fails,
/// - the remote worker was already loaded,
/// - loading the NIXL metadata fails,
/// - the agent name returned by NIXL differs from the one in the metadata,
/// - any layout cannot be reconstructed.
pub(crate) fn import_metadata(&mut self, metadata: SerializedLayout) -> Result<Vec<LayoutHandle>> {
    let inner = metadata.unpack()?;

    // Duplicate check keyed on (agent name, worker id).
    let remote_key = (
        inner.worker_address.nixl_agent_name.clone(),
        inner.worker_address.worker_id,
    );
    if self.loaded_remotes.contains(&remote_key) {
        bail!(
            "Remote worker already loaded: {} (worker_id={})",
            remote_key.0,
            remote_key.1
        );
    }

    let returned_agent_name = self
        .nixl_agent
        .load_remote_md(&inner.nixl_metadata)
        .map_err(|e| anyhow!("failed to load remote NIXL metadata: {:?}", e))?;

    if returned_agent_name != inner.worker_address.nixl_agent_name {
        bail!(
            "Agent name mismatch: expected '{}', got '{}'",
            inner.worker_address.nixl_agent_name,
            returned_agent_name
        );
    }

    // Stage: reconstruct everything before touching registry state.
    let mut staged = Vec::new();
    for serialized_with_handle in inner.layouts {
        let handle = serialized_with_handle.handle;
        let layout = PhysicalLayout::from_descriptor(serialized_with_handle.layout)
            .map_err(|e| anyhow!("failed to reconstruct layout {}: {}", handle, e))?;
        staged.push((handle, RemoteLayout::new(handle, layout)));
    }

    // Commit: nothing below can fail.
    let mut imported_handles = Vec::with_capacity(staged.len());
    for (handle, remote_layout) in staged {
        self.remote_layouts.insert(handle, remote_layout);
        imported_handles.push(handle);
    }
    self.loaded_remotes.insert(remote_key);

    Ok(imported_handles)
}
/// Produce a `LogicalLayoutDescriptor` for one locally registered layout.
///
/// # Arguments
/// * `handle` - Handle of a layout in the local table
/// * `logical_type` - Logical tier the handle should be advertised as
///
/// # Errors
/// Fails if `handle` is not a local layout or if the layout cannot be
/// converted into a descriptor.
pub(crate) fn build_logical_descriptor(
    &self,
    handle: LayoutHandle,
    logical_type: LogicalLayoutHandle,
) -> Result<LogicalLayoutDescriptor> {
    let Some(entry) = self.local_layouts.get(&handle) else {
        return Err(anyhow!("Layout handle not found: {:?}", handle));
    };

    let descriptor = match entry.layout().to_descriptor() {
        Ok(descriptor) => descriptor,
        Err(e) => return Err(anyhow!("failed to serialize layout {}: {}", handle, e)),
    };

    Ok(LogicalLayoutDescriptor::new(handle, logical_type, descriptor))
}
/// Fetch the local NIXL metadata bytes from the agent.
///
/// # Errors
/// Wraps any agent error as `failed to get NIXL local metadata: …`.
pub(crate) fn get_nixl_metadata(&self) -> Result<Vec<u8>> {
    match self.nixl_agent.get_local_md() {
        Ok(bytes) => Ok(bytes),
        Err(e) => Err(anyhow!("failed to get NIXL local metadata: {:?}", e)),
    }
}
/// Build the address of this worker: its id paired with the NIXL agent name.
pub(crate) fn worker_address(&self) -> WorkerAddress {
    let agent_name = self.nixl_agent.name().to_string();
    WorkerAddress::new(self.worker_id, agent_name)
}
/// Look up `handle` among the local layouts only.
///
/// Returns `None` when no local layout is registered under `handle`.
pub(crate) fn get_local(&self, handle: LayoutHandle) -> Option<&LocalLayout> {
    let table = &self.local_layouts;
    table.get(&handle)
}
/// Look up `handle` among the imported remote layouts only.
///
/// Returns `None` when no remote layout is stored under `handle`.
pub(crate) fn get_remote(&self, handle: LayoutHandle) -> Option<&RemoteLayout> {
    let table = &self.remote_layouts;
    table.get(&handle)
}
/// Resolve `handle` to its physical layout, whether local or remote.
///
/// The local table is consulted first; the remote table is only searched
/// when there is no local match.
///
/// # Returns
/// A reference to the `PhysicalLayout`, or `None` if the handle is unknown.
pub fn get_layout(&self, handle: LayoutHandle) -> Option<&PhysicalLayout> {
    if let Some(local) = self.local_layouts.get(&handle) {
        return Some(local.layout());
    }

    match self.remote_layouts.get(&handle) {
        Some(remote) => Some(remote.layout()),
        None => None,
    }
}
/// Report whether `handle` is present in the local layout table.
pub(crate) fn is_local(&self, handle: LayoutHandle) -> bool {
    let table = &self.local_layouts;
    table.contains_key(&handle)
}
/// Report whether `handle` is present in the remote layout table.
pub(crate) fn is_remote(&self, handle: LayoutHandle) -> bool {
    let table = &self.remote_layouts;
    table.contains_key(&handle)
}
/// Count the layouts registered locally.
pub(crate) fn local_count(&self) -> usize {
    let table = &self.local_layouts;
    table.len()
}
/// Count the layouts imported from remote workers.
pub(crate) fn remote_count(&self) -> usize {
    let table = &self.remote_layouts;
    table.len()
}
/// Get the worker ID for this manager.
pub
(
crate
)
fn
worker_id
(
&
self
)
->
u64
{
self
.worker_id
}
/// Collect the handles of all local layouts.
///
/// The order follows the hash map's iteration order and is therefore unspecified.
pub(crate) fn local_handles(&self) -> Vec<LayoutHandle> {
    let mut handles = Vec::with_capacity(self.local_layouts.len());
    handles.extend(self.local_layouts.keys().copied());
    handles
}
/// Collect the handles of all imported remote layouts.
///
/// The order follows the hash map's iteration order and is therefore unspecified.
pub(crate) fn remote_handles(&self) -> Vec<LayoutHandle> {
    let mut handles = Vec::with_capacity(self.remote_layouts.len());
    handles.extend(self.remote_layouts.keys().copied());
    handles
}
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;
    use crate::layout::LayoutConfig;
    use dynamo_memory::nixl::NixlAgent;

    /// Create a NIXL agent named `name`, panicking if construction fails.
    fn make_test_agent(name: &str) -> NixlAgent {
        NixlAgent::new(name).expect("failed to create agent")
    }

    /// Allocate a small fully-contiguous system-memory layout for `agent`.
    fn make_test_layout(agent: &NixlAgent) -> PhysicalLayout {
        let config = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(4)
            .inner_dim(8)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        PhysicalLayout::builder(agent.clone())
            .with_config(config)
            .fully_contiguous()
            .allocate_system()
            .build()
            .unwrap()
    }

    /// A fresh registry reports its worker id and holds no layouts.
    #[test]
    fn test_manager_creation() {
        let registry = LayoutRegistry::new(make_test_agent("test-manager"), 42);

        assert_eq!(registry.worker_id(), 42);
        assert_eq!(registry.local_count(), 0);
        assert_eq!(registry.remote_count(), 0);
    }

    /// The first registered layout gets id 0 and is classified as local.
    #[test]
    fn test_register_local() {
        let agent = make_test_agent("test-register");
        let mut registry = LayoutRegistry::new(agent.clone(), 100);

        let handle = registry.register_local(make_test_layout(&agent)).unwrap();

        assert_eq!(handle.worker_id(), 100);
        assert_eq!(handle.layout_id(), 0);
        assert_eq!(registry.local_count(), 1);
        assert!(registry.is_local(handle));
        assert!(!registry.is_remote(handle));
    }

    /// Layout ids are handed out sequentially starting at 0.
    #[test]
    fn test_register_multiple_locals() {
        let agent = make_test_agent("test-multiple");
        let mut registry = LayoutRegistry::new(agent.clone(), 1);

        let first = registry.register_local(make_test_layout(&agent)).unwrap();
        let second = registry.register_local(make_test_layout(&agent)).unwrap();
        let third = registry.register_local(make_test_layout(&agent)).unwrap();

        let ids = [first.layout_id(), second.layout_id(), third.layout_id()];
        assert_eq!(ids, [0, 1, 2]);
        assert_eq!(registry.local_count(), 3);
    }

    /// Metadata exported by one registry can be imported by another.
    #[test]
    #[ignore] // Requires actual NIXL memory registration
    fn test_export_import_roundtrip() {
        // Source side: register two layouts and export them.
        let source_agent = make_test_agent("source");
        let mut source = LayoutRegistry::new(source_agent.clone(), 1);
        let handle1 = source
            .register_local(make_test_layout(&source_agent))
            .unwrap();
        let handle2 = source
            .register_local(make_test_layout(&source_agent))
            .unwrap();

        let metadata = source.export_metadata().unwrap();
        assert!(!metadata.is_empty());

        // Destination side: import and verify.
        let mut dest = LayoutRegistry::new(make_test_agent("dest"), 2);
        let imported = dest.import_metadata(metadata).unwrap();

        assert_eq!(imported.len(), 2);
        assert_eq!(dest.remote_count(), 2);
        assert!(dest.is_remote(handle1));
        assert!(dest.is_remote(handle2));

        // Layouts are reachable through both lookup paths.
        assert!(dest.get_remote(handle1).is_some());
        assert!(dest.get_remote(handle2).is_some());
        assert!(dest.get_layout(handle1).is_some());
    }

    /// Importing the same remote worker twice is rejected.
    #[test]
    #[ignore] // Requires actual NIXL memory registration
    fn test_import_duplicate_remote_fails() {
        let source_agent = make_test_agent("source2");
        let mut source = LayoutRegistry::new(source_agent.clone(), 10);
        source
            .register_local(make_test_layout(&source_agent))
            .unwrap();
        let metadata = source.export_metadata().unwrap();

        let mut dest = LayoutRegistry::new(make_test_agent("dest2"), 20);

        // Copy the bytes up front because the first import consumes `metadata`.
        let second_copy = SerializedLayout::from_bytes(metadata.as_bytes().to_vec());

        // First import succeeds.
        dest.import_metadata(metadata).unwrap();

        // Second import should fail.
        let outcome = dest.import_metadata(second_copy);
        assert!(outcome.is_err());
        assert!(outcome.unwrap_err().to_string().contains("already loaded"));
    }

    /// `local_handles` returns every registered handle.
    #[test]
    fn test_get_layout_handles() {
        let agent = make_test_agent("test-handles");
        let mut registry = LayoutRegistry::new(agent.clone(), 5);

        let h1 = registry.register_local(make_test_layout(&agent)).unwrap();
        let h2 = registry.register_local(make_test_layout(&agent)).unwrap();

        let handles = registry.local_handles();
        assert_eq!(handles.len(), 2);
        assert!(handles.contains(&h1));
        assert!(handles.contains(&h2));
    }
}
lib/kvbm-physical/src/manager/remote.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Remote layout wrapper reconstructed from imported metadata.
use
super
::
handle
::
LayoutHandle
;
use
crate
::
layout
::
PhysicalLayout
;
/// Physical layout belonging to another worker, rebuilt from imported metadata.
///
/// Pairs the remote worker's `LayoutHandle` with the `PhysicalLayout`
/// deserialized from that worker's exported descriptor. According to the
/// original author's notes, the layout's memory regions refer to addresses on
/// the remote worker and are used to build NIXL RDMA transfer descriptors.
///
/// Derives `Clone`; the original documentation states that `PhysicalLayout`
/// holds `Arc`s internally, so clones are expected to be cheap (not
/// verifiable from this file).
#[derive(Debug, Clone)]
pub struct RemoteLayout {
    /// Handle assigned by the remote worker.
    handle: LayoutHandle,
    /// Layout reconstructed from the remote descriptor.
    layout: PhysicalLayout,
}
#[allow(dead_code)]
impl
RemoteLayout
{
/// Create a new remote layout.
///
/// # Arguments
/// * `handle` - Unique handle for this layout (from remote worker)
/// * `layout` - The reconstructed physical layout
pub
fn
new
(
handle
:
LayoutHandle
,
layout
:
PhysicalLayout
)
->
Self
{
Self
{
handle
,
layout
}
}
/// Get the handle for this layout.
pub
fn
handle
(
&
self
)
->
LayoutHandle
{
self
.handle
}
/// Get a reference to the physical layout.
pub
fn
layout
(
&
self
)
->
&
PhysicalLayout
{
&
self
.layout
}
/// Get the worker_id from the handle (identifies the remote worker).
pub
fn
worker_id
(
&
self
)
->
u64
{
self
.handle
.worker_id
()
}
/// Get the layout_id from the handle.
pub
fn
layout_id
(
&
self
)
->
u16
{
self
.handle
.layout_id
()
}
/// Consume this remote layout and return the physical layout.
pub
fn
into_layout
(
self
)
->
PhysicalLayout
{
self
.layout
}
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;
    use crate::layout::{LayoutConfig, LayoutDescriptor, NixlMetadata, PhysicalLayout};

    /// Hand-build a descriptor for a small fully-contiguous system layout
    /// whose single memory region starts at the fake address `0x1000`.
    fn make_serialized_layout() -> LayoutDescriptor {
        use crate::layout::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails};
        use dynamo_memory::{MemoryRegion, StorageKind, nixl};

        let config = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(4)
            .inner_dim(8)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        // Total bytes: the product of every dimension and the element width.
        let required_size = config.num_blocks
            * config.num_layers
            * config.outer_dim
            * config.page_size
            * config.inner_dim
            * config.dtype_width_bytes;

        let region = MemoryRegion {
            addr: 0x1000,
            size: required_size,
        };
        let details = FullyContiguousDetails {
            block_format: BlockFormat::Operational,
            kv_block_layout: crate::layout::KvBlockLayout::OperationalNHD,
        };

        LayoutDescriptor {
            version: 1,
            layout_config: config,
            location: StorageKind::System,
            nixl_metadata: NixlMetadata::new("remote_agent".to_string(), nixl::MemType::Dram, 0),
            memory_descriptors: vec![region],
            layout_type_details: LayoutTypeDetails::FullyContiguous(details),
        }
    }

    /// Accessors expose the handle parts and the wrapped layout.
    #[test]
    fn test_remote_layout_creation() {
        let handle = LayoutHandle::new(999, 42);
        let layout = PhysicalLayout::from_descriptor(make_serialized_layout()).unwrap();

        let remote = RemoteLayout::new(handle, layout);

        assert_eq!(remote.handle(), handle);
        assert_eq!(remote.worker_id(), 999);
        assert_eq!(remote.layout_id(), 42);
        assert_eq!(
            remote.layout().layout().block_layout(),
            crate::layout::KvBlockLayout::OperationalNHD
        );
    }

    /// `into_layout` consumes the wrapper and yields the layout.
    #[test]
    fn test_remote_layout_into_layout() {
        let handle = LayoutHandle::new(100, 200);
        let layout = PhysicalLayout::from_descriptor(make_serialized_layout()).unwrap();

        let remote = RemoteLayout::new(handle, layout);

        // Successfully consumed and returned the layout
        let _recovered = remote.into_layout();
    }
}
lib/kvbm-physical/src/transfer/capabilities.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Transfer capability flags for controlling direct path enablement.
//!
//! By default, the transfer system uses a conservative staging policy where:
//! - Device can only transfer to/from Host
//! - Disk can only transfer to/from Host
//! - Host can transfer to Device, Disk, or Remote
//! - Device ↔ Device is allowed (native CUDA)
//!
//! These capability flags enable optional direct paths that bypass host staging.
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
sync
::
OnceLock
;
use
crate
::{
layout
::
LayoutConfig
,
transfer
::{
PhysicalLayout
,
TransferManager
,
executor
::{
TransferOptionsInternal
,
execute_transfer
},
},
};
use
dynamo_memory
::
nixl
::
NixlAgent
;
/// Process-wide cache for the one-time GDS probe run by
/// `TransferCapabilities::with_gds_if_supported`.
static GDS_SUPPORTED: OnceLock<bool> = OnceLock::new();

/// Transfer capability flags controlling which direct paths are enabled.
///
/// # Default Policy (Conservative)
///
/// With all flags disabled (default), the system uses host staging:
/// - **Device → Remote**: Device → Host → Remote (2 hops)
/// - **Disk → Remote**: Disk → Host → Remote (2 hops)
/// - **Device ↔ Disk**: Device → Host → Disk (2 hops)
///
/// # Optional Direct Paths
///
/// - `allow_gds`: Enables GPU Direct Storage (Disk ↔ Device without host)
/// - `allow_gpu_rdma`: Enables GPU RDMA (Device → Remote without host)
///
/// # Example
///
/// ```
/// # use kvbm_physical::transfer::TransferCapabilities;
/// // Default conservative policy
/// let caps = TransferCapabilities::default();
/// assert!(!caps.allow_gds);
/// assert!(!caps.allow_gpu_rdma);
///
/// // Enable GDS for high-performance disk I/O
/// let caps = TransferCapabilities::default().with_gds(true);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct TransferCapabilities {
    /// Enable GPU Direct Storage (Disk ↔ Device without host staging).
    ///
    /// When enabled:
    /// - Disk → Device: Direct transfer (requires GDS support)
    /// - Device → Disk: Direct transfer (requires GDS support)
    ///
    /// When disabled (default):
    /// - Disk → Device: Disk → Host → Device (2 hops)
    /// - Device → Disk: Device → Host → Disk (2 hops)
    pub allow_gds: bool,

    /// Enable GPU RDMA (Device → Remote without host staging).
    ///
    /// When enabled:
    /// - Device → Remote: Direct NIXL transfer
    ///
    /// When disabled (default):
    /// - Device → Remote: Device → Host → Remote (2 hops)
    ///
    /// Note: This only affects Device → Remote. Host → Remote is always direct.
    pub allow_gpu_rdma: bool,
}
impl TransferCapabilities {
    /// Conservative defaults: every direct path disabled (same as `default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// High-performance preset: both GDS and GPU RDMA enabled.
    pub fn all_enabled() -> Self {
        Self::default().with_gds(true).with_gpu_rdma(true)
    }

    /// Return a copy with the GDS (GPU Direct Storage) flag set to `enabled`.
    pub fn with_gds(self, enabled: bool) -> Self {
        Self {
            allow_gds: enabled,
            ..self
        }
    }

    /// Probe for GDS by attempting one tiny device → disk transfer.
    ///
    /// Creates a NIXL agent with the `GDS_MT` backend, allocates a
    /// single-block layout on device 0 and one on disk, and submits a transfer
    /// of block 0 between them. Any failure along the way is returned as an error.
    ///
    /// NOTE(review): the value returned by `execute_transfer` is discarded; if
    /// it is an asynchronous completion handle, `Ok` here only shows that the
    /// transfer was accepted, not that it finished — confirm.
    fn test_gds_transfer(&self) -> anyhow::Result<()> {
        let agent = NixlAgent::with_backends("agent", &["GDS_MT"])?;

        // Smallest useful layout: one block, one layer, 4096-wide inner dimension.
        let config = LayoutConfig::builder()
            .num_blocks(1)
            .num_layers(1)
            .outer_dim(1)
            .page_size(1)
            .inner_dim(4096)
            .build()?;

        let device_side = PhysicalLayout::builder(agent.clone())
            .with_config(config.clone())
            .fully_contiguous()
            .allocate_device(0)
            .build()?;

        let disk_side = PhysicalLayout::builder(agent.clone())
            .with_config(config)
            .fully_contiguous()
            .allocate_disk(None)
            .build()?;

        let src_blocks = vec![0];
        let dst_blocks = vec![0];

        let manager = TransferManager::builder()
            .nixl_agent(agent)
            .cuda_device_id(0)
            .build()?;

        execute_transfer(
            &device_side,
            &disk_side,
            &src_blocks,
            &dst_blocks,
            TransferOptionsInternal::default(),
            manager.context(),
        )?;

        Ok(())
    }

    /// Set the GDS flag from a one-time runtime probe.
    ///
    /// The probe (`test_gds_transfer`) runs at most once per process; its
    /// outcome is cached in `GDS_SUPPORTED` and reused by later calls.
    pub fn with_gds_if_supported(mut self) -> Self {
        let supported = *GDS_SUPPORTED.get_or_init(|| self.test_gds_transfer().is_ok());
        self.allow_gds = supported;
        self
    }

    /// Return a copy with the GPU RDMA flag set to `enabled`.
    pub fn with_gpu_rdma(self, enabled: bool) -> Self {
        Self {
            allow_gpu_rdma: enabled,
            ..self
        }
    }

    /// Whether a direct Device ↔ Disk path is permitted (mirrors `allow_gds`).
    pub fn allows_device_disk_direct(&self) -> bool {
        let Self { allow_gds, .. } = *self;
        allow_gds
    }

    /// Whether a direct Device → Remote path is permitted (mirrors `allow_gpu_rdma`).
    pub fn allows_device_remote_direct(&self) -> bool {
        let Self { allow_gpu_rdma, .. } = *self;
        allow_gpu_rdma
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;

    /// Defaults leave every direct path disabled.
    #[test]
    fn test_default_capabilities() {
        let caps = TransferCapabilities::default();

        assert!(!caps.allow_gds);
        assert!(!caps.allow_gpu_rdma);
        assert!(!caps.allows_device_disk_direct());
        assert!(!caps.allows_device_remote_direct());
    }

    /// `all_enabled` turns on both flags and both derived predicates.
    #[test]
    fn test_all_enabled() {
        let caps = TransferCapabilities::all_enabled();

        assert!(caps.allow_gds);
        assert!(caps.allow_gpu_rdma);
        assert!(caps.allows_device_disk_direct());
        assert!(caps.allows_device_remote_direct());
    }

    /// Builder-style setters can be chained.
    #[test]
    fn test_builder_pattern() {
        let caps = TransferCapabilities::new()
            .with_gds(true)
            .with_gpu_rdma(false);

        assert!(caps.allow_gds);
        assert!(!caps.allow_gpu_rdma);
    }

    /// Each flag only affects its own direct path.
    #[test]
    fn test_selective_enablement() {
        // Enable only GDS
        let gds_only = TransferCapabilities::new().with_gds(true);
        assert!(gds_only.allows_device_disk_direct());
        assert!(!gds_only.allows_device_remote_direct());

        // Enable only GPU RDMA
        let rdma_only = TransferCapabilities::new().with_gpu_rdma(true);
        assert!(!rdma_only.allows_device_disk_direct());
        assert!(rdma_only.allows_device_remote_direct());
    }
}
lib/kvbm-physical/src/transfer/checksum.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Block checksum computation for verification.
//!
//! This module provides utilities to compute checksums of blocks for
//! round-trip test verification.
use
dynamo_memory
::
StorageKind
;
use
super
::
PhysicalLayout
;
use
aligned_vec
::{
AVec
,
avec
};
use
anyhow
::{
Result
,
anyhow
};
use
blake3
::
Hasher
;
use
std
::{
collections
::
HashMap
,
fs
::
File
,
io
::{
Read
,
Seek
},
mem
::
ManuallyDrop
,
ops
::
Range
,
os
::
fd
::
FromRawFd
,
};
use
cudarc
::
runtime
::
sys
::{
cudaMemcpy
,
cudaMemcpyKind
};
/// Checksum of a block, stored as the string form of a finalized BLAKE3 hash
/// (see `compute_single_block_checksum`).
pub type BlockChecksum = String;
/// Checksum every block listed in `block_ids`, covering all layers.
///
/// # Arguments
/// * `layout` - Physical layout that owns the blocks
/// * `block_ids` - Blocks to checksum; a repeated id simply overwrites its entry
///
/// # Returns
/// A map from block id to checksum.
///
/// # Errors
/// Stops at the first block that fails, e.g. because its id is out of range
/// or its memory cannot be read.
pub fn compute_block_checksums(
    layout: &PhysicalLayout,
    block_ids: &[usize],
) -> Result<HashMap<usize, BlockChecksum>> {
    block_ids
        .iter()
        .map(|&block_id| {
            compute_single_block_checksum(layout, block_id, None).map(|sum| (block_id, sum))
        })
        .collect()
}
/// Checksum a subset of layers for every block listed in `block_ids`.
///
/// # Arguments
/// * `layout` - Physical layout that owns the blocks
/// * `block_ids` - Blocks to checksum
/// * `layer_range` - Layers to include in each block's checksum
///
/// # Returns
/// A map from block id to the checksum of the selected layers.
///
/// # Errors
/// Fails up front when `layer_range.end` exceeds the layout's `num_layers`,
/// and otherwise stops at the first block that fails.
pub fn compute_layer_checksums(
    layout: &PhysicalLayout,
    block_ids: &[usize],
    layer_range: Range<usize>,
) -> Result<HashMap<usize, BlockChecksum>> {
    let num_layers = layout.layout().config().num_layers;
    if layer_range.end > num_layers {
        return Err(anyhow!(
            "Layer range {:?} exceeds num_layers {}",
            layer_range,
            num_layers
        ));
    }

    block_ids
        .iter()
        .map(|&block_id| {
            compute_single_block_checksum(layout, block_id, Some(layer_range.clone()))
                .map(|sum| (block_id, sum))
        })
        .collect()
}
/// Hash the contents of one block and return the digest as a string.
///
/// Every `(layer, outer)` memory region of the block is fed into a single
/// BLAKE3 hasher, layers in ascending order and outer indices in ascending
/// order within each layer. How a region is read depends on where the layout
/// lives:
/// - `System` / `Pinned`: read in place through a raw slice,
/// - `Device(_)`: copied to a host buffer with `cudaMemcpy` (device → host),
/// - `Disk(fd)`: read from the file descriptor at offset `region.addr()`.
///
/// # Arguments
/// * `layout` - Physical layout that owns the block
/// * `block_id` - Block to hash
/// * `layer_range` - Layers to include; `None` means all layers
///
/// # Errors
/// Fails if `block_id` is out of range, the layer range ends beyond
/// `num_layers`, a memory region cannot be resolved, the CUDA copy fails, or
/// the disk seek/read fails.
fn compute_single_block_checksum(
    layout: &PhysicalLayout,
    block_id: usize,
    layer_range: Option<Range<usize>>,
) -> Result<String> {
    let config = layout.layout().config();

    if block_id >= config.num_blocks {
        return Err(anyhow!("Block ID {} out of range", block_id));
    }

    let layers = match layer_range {
        Some(range) => range,
        None => 0..config.num_layers,
    };

    // validate layer range
    if layers.end > config.num_layers {
        return Err(anyhow!(
            "Layer range {:?} exceeds num_layers {}",
            layers,
            config.num_layers
        ));
    }

    let outer_dim = config.outer_dim;
    let mut hasher = Hasher::new();

    for layer_id in layers {
        for outer_id in 0..outer_dim {
            let region = layout.memory_region(block_id, layer_id, outer_id)?;

            match layout.location() {
                StorageKind::System | StorageKind::Pinned => {
                    // SAFETY (assumed, not checked here): for host-resident
                    // layouts `memory_region` is expected to describe
                    // `region.size()` readable bytes starting at `region.addr()`.
                    let host_bytes = unsafe {
                        std::slice::from_raw_parts(region.addr() as *const u8, region.size())
                    };
                    hasher.update(host_bytes);
                }
                StorageKind::Device(_) => {
                    // Stage the device bytes in host memory before hashing.
                    let mut staging: Vec<u8> = vec![0; region.size()];
                    // SAFETY (assumed): `staging` holds exactly `region.size()`
                    // bytes; the source is expected to be a valid device
                    // allocation of at least that size.
                    let status = unsafe {
                        cudaMemcpy(
                            staging.as_mut_ptr() as *mut std::ffi::c_void,
                            region.addr() as *const std::ffi::c_void,
                            region.size(),
                            cudaMemcpyKind::cudaMemcpyDeviceToHost,
                        )
                    };
                    if status != cudarc::runtime::sys::cudaError::cudaSuccess {
                        return Err(anyhow!("cudaMemcpy D2H failed in checksum: {:?}", status));
                    }
                    hasher.update(staging.as_slice());
                }
                StorageKind::Disk(fd) => {
                    // 4096-byte aligned buffer sized to the region.
                    let mut staging: AVec<u8, _> = avec![[4096]| 0; region.size()];
                    // `ManuallyDrop` keeps the borrowed descriptor open: the
                    // temporary `File` must not close an fd it does not own.
                    // SAFETY (assumed): `fd` is expected to be an open file
                    // descriptor owned by the layout's storage.
                    let mut file = ManuallyDrop::new(unsafe { File::from_raw_fd(fd as i32) });
                    // For disk layouts the region address is used as a file offset.
                    file.seek(std::io::SeekFrom::Start(region.addr() as u64))?;
                    file.read_exact(&mut staging)?;
                    hasher.update(staging.as_slice());
                }
            }
        }
    }

    Ok(hasher.finalize().to_string())
}
#[cfg(all(test,
feature
=
"testing-kvbm"
))]
mod
tests
{
use
super
::
super
::
tests
::
*
;
use
super
::
*
;
use
crate
::
transfer
::{
FillPattern
,
fill_blocks
};
/// Two blocks filled with the same constant hash identically, and hashing a
/// region's raw bytes matches hashing an equally sized vector of that constant.
#[test]
fn test_checksum_constant_pattern() {
    // Hash a byte slice with a fresh hasher and return the digest string.
    let digest = |bytes: &[u8]| {
        let mut hasher = Hasher::new();
        hasher.update(bytes);
        hasher.finalize().to_string()
    };

    let physical = builder(2)
        .fully_contiguous()
        .allocate_system()
        .build()
        .unwrap();

    fill_blocks(&physical, &[0, 1], FillPattern::Constant(42)).unwrap();

    let checksums = compute_block_checksums(&physical, &[0, 1]).unwrap();

    // Both blocks should have the same checksum values (same pattern)
    assert_eq!(checksums[&0], checksums[&1]);

    let memory_region = physical.memory_region(0, 0, 0).unwrap();
    // System-memory layout built above, so the region is read in place.
    let region_bytes = unsafe {
        std::slice::from_raw_parts(memory_region.addr() as *const u8, memory_region.size())
    };
    assert!(region_bytes.iter().all(|&b| b == 42));

    let checksum_mr_slice = digest(region_bytes);

    let reference = vec![42u8; memory_region.size()];
    let checksum_vec = digest(&reference);

    assert_eq!(checksum_mr_slice, checksum_vec);
}
// #[test]
// fn test_checksum_different_patterns() {
// let (layout, _memory) = create_test_layout(2);
// let physical = PhysicalLayout::new_local(layout, StorageLocation::System);
// // Fill blocks with different patterns
// fill_blocks(&physical, &[0], FillPattern::Constant(42)).unwrap();
// fill_blocks(&physical, &[1], FillPattern::Constant(100)).unwrap();
// let checksums = compute_block_checksums(&physical, &[0, 1]).unwrap();
// // Blocks should have different checksums
// assert_ne!(checksums[&0], checksums[&1]);
// }
// #[test]
// fn test_checksum_matches() {
// let (layout1, _memory1) = create_test_layout(1);
// let (layout2, _memory2) = create_test_layout(1);
// let physical1 = PhysicalLayout::new_local(layout1, StorageLocation::System);
// let physical2 = PhysicalLayout::new_local(layout2, StorageLocation::System);
// // Fill both with same pattern
// fill_blocks(&physical1, &[0], FillPattern::Sequential).unwrap();
// fill_blocks(&physical2, &[0], FillPattern::Sequential).unwrap();
// let checksum1 = compute_block_checksums(&physical1, &[0]).unwrap();
// let checksum2 = compute_block_checksums(&physical2, &[0]).unwrap();
// // Checksums should match (ignoring block_id)
// assert!(checksum1[&0].matches(&checksum2[&0]));
// }
// #[test]
// fn test_layer_checksums() {
// let (layout, _memory) = create_test_layout(1);
// let physical = PhysicalLayout::new_local(layout, StorageLocation::System);
// // Fill entire block
// fill_blocks(&physical, &[0], FillPattern::Sequential).unwrap();
// // Compute checksums for different layer ranges
// let full_checksum = compute_block_checksums(&physical, &[0]).unwrap();
// let layer0_checksum = compute_layer_checksums(&physical, &[0], 0..1).unwrap();
// let layer1_checksum = compute_layer_checksums(&physical, &[0], 1..2).unwrap();
// // Layer checksums should be different from full checksum
// assert_ne!(full_checksum[&0].byte_count, layer0_checksum[&0].byte_count);
// assert_ne!(full_checksum[&0].byte_count, layer1_checksum[&0].byte_count);
// // Layer 0 and Layer 1 should have same byte count (same size)
// assert_eq!(
// layer0_checksum[&0].byte_count,
// layer1_checksum[&0].byte_count
// );
// }
// #[test]
// fn test_checksum_remote_layout_fails() {
// let (layout, _memory) = create_test_layout(1);
// let physical =
// PhysicalLayout::new_remote(layout, StorageLocation::System, "remote".to_string());
// let result = compute_block_checksums(&physical, &[0]);
// assert!(result.is_err());
// assert!(result.unwrap_err().to_string().contains("remote"));
// }
}
lib/kvbm-physical/src/transfer/context.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Transfer context.
use
std
::
sync
::
Arc
;
use
std
::
sync
::
atomic
::{
AtomicUsize
,
Ordering
};
use
anyhow
::
Result
;
use
cudarc
::
driver
::{
CudaContext
,
CudaEvent
,
CudaStream
};
use
derive_builder
::
Builder
;
use
tokio
::
sync
::
mpsc
;
use
uuid
::
Uuid
;
use
dynamo_memory
::
CudaMemPool
;
use
dynamo_memory
::
nixl
::{
NixlAgent
,
NixlBackendConfig
,
XferRequest
};
use
velo_events
::
EventManager
;
use
crate
::
manager
::
TransferManager
;
// Notifications module is declared in ../mod.rs
// Re-export for convenience
use
super
::
TransferCapabilities
;
use
notifications
::
RegisterPollingNotification
;
pub
(
crate
)
use
super
::
notifications
;
pub
use
super
::
notifications
::
TransferCompleteNotification
;
/// Settings collected by `TransferConfigBuilder` and consumed when building a
/// `TransferManager`.
///
/// `derive_builder` generates `TransferConfigBuilder` from this struct using the
/// owned pattern. The generated build function is private and named
/// `build_internal`; the public `build` methods wrap it.
#[derive(Clone, Builder)]
#[builder(pattern = "owned", build_fn(private, name = "build_internal"), public)]
#[allow(dead_code)]
// Fields are used in build() but derive macros confuse dead code analysis
pub struct TransferConfig {
    /// Event system used for completion notifications; its `system_id()` is also
    /// used as the worker id. Defaults to a local `EventManager`.
    #[builder(default = "Arc::new(EventManager::local())")]
    event_system: Arc<EventManager>,
    /// Optional custom name for the NIXL agent. If not provided, defaults to "worker-{worker_id}"
    #[builder(default = "None", setter(strip_option))]
    nixl_agent_name: Option<String>,
    /// Backend configuration for NIXL backends to enable
    #[builder(default = "NixlBackendConfig::default()")]
    nixl_backend_config: NixlBackendConfig,
    /// Device id handed to `CudaContext::new` (default: 0).
    #[builder(default = "0")]
    cuda_device_id: usize,
    /// Tokio runtime (or handle) on which the background notification handlers
    /// are spawned. Defaults to the result of `get_tokio_runtime()`.
    #[builder(default = "get_tokio_runtime()")]
    tokio_runtime: TokioRuntime,
    /// Transfer capabilities forwarded unchanged to the `TransferContext`.
    #[builder(default = "TransferCapabilities::default()")]
    capabilities: TransferCapabilities,
    /// Size in bytes to pre-allocate for the CUDA memory pool (default: 64 MiB)
    #[builder(default = "64 * 1024 * 1024")]
    cuda_pool_reserve_size: usize,
    /// Release threshold for the CUDA memory pool (default: Some(64 MiB))
    /// Memory above this threshold is returned to the system when freed.
    /// If None, no release threshold is set.
    #[builder(default = "Some(64 * 1024 * 1024)")]
    cuda_pool_release_threshold: Option<u64>,
}
impl
TransferConfigBuilder
{
/// Initialize builder with event system and tokio handle.
///
/// This sets the event_system and tokio runtime handle, ensuring consistency
/// with Nova's event system. Use this when the runtime has already been
/// constructed and you want components to share the same event notification
/// infrastructure.
pub
fn
from_event_system_and_handle
(
self
,
event_system
:
Arc
<
EventManager
>
,
handle
:
tokio
::
runtime
::
Handle
,
)
->
Self
{
self
.event_system
(
event_system
)
.tokio_runtime
(
TokioRuntime
::
Handle
(
handle
))
}
/// Directly provide a pre-configured wrapped NIXL agent (mainly for testing).
///
/// This bypasses the agent creation and backend initialization logic,
/// using the provided agent directly. Useful for tests that need full
/// control over agent configuration.
pub
fn
nixl_agent
(
self
,
agent
:
NixlAgent
)
->
TransferConfigBuilderWithAgent
{
TransferConfigBuilderWithAgent
{
builder
:
self
,
agent
,
}
}
/// Add a NIXL backend to enable (uses default plugin parameters).
pub
fn
nixl_backend
(
mut
self
,
backend
:
impl
Into
<
String
>
)
->
Self
{
let
config
=
self
.nixl_backend_config
.get_or_insert_with
(
NixlBackendConfig
::
default
);
*
config
=
config
.clone
()
.with_backend
(
backend
);
self
}
/// Load NIXL backend configuration from environment variables.
///
/// This merges environment-based configuration with any backends already
/// configured via the builder.
pub
fn
with_env_backends
(
mut
self
)
->
Result
<
Self
>
{
let
env_config
=
NixlBackendConfig
::
from_env
()
?
;
let
config
=
self
.nixl_backend_config
.get_or_insert_with
(
NixlBackendConfig
::
default
);
*
config
=
config
.clone
()
.merge
(
env_config
);
Ok
(
self
)
}
pub
fn
build
(
self
)
->
Result
<
TransferManager
>
{
let
mut
config
=
self
.build_internal
()
?
;
let
worker_id
=
config
.event_system
.system_id
();
// Merge environment backends if not explicitly configured
if
config
.nixl_backend_config
.backends
()
.is_empty
()
{
config
.nixl_backend_config
=
NixlBackendConfig
::
from_env
()
?
;
}
// Derive agent name from worker_id if not provided
let
agent_name
=
config
.nixl_agent_name
.unwrap_or_else
(||
format!
(
"worker-{}"
,
worker_id
));
let
nixl_agent
=
NixlAgent
::
from_nixl_backend_config
(
&
agent_name
,
config
.nixl_backend_config
)
?
;
let
cuda_context
=
CudaContext
::
new
(
config
.cuda_device_id
)
?
;
let
context
=
TransferContext
::
new
(
nixl_agent
,
config
.event_system
,
cuda_context
,
config
.tokio_runtime
,
config
.capabilities
,
config
.cuda_pool_reserve_size
,
config
.cuda_pool_release_threshold
,
)
?
;
Ok
(
TransferManager
::
from_context
(
context
))
}
}
/// Builder that already has a pre-configured NIXL agent.
///
/// This is generally used for testing when you want to pass in an agent directly
/// rather than having it created by the builder. Instances are produced by
/// `TransferConfigBuilder::nixl_agent`.
pub struct TransferConfigBuilderWithAgent {
    /// The regular configuration builder; supplies every setting except the agent.
    builder: TransferConfigBuilder,
    /// Agent that is passed straight to `TransferContext::new` by `build`.
    agent: NixlAgent,
}
impl TransferConfigBuilderWithAgent {
    /// Construct the `TransferManager` around the agent supplied up front.
    ///
    /// No NIXL agent is created and no backend configuration is consulted; only
    /// the CUDA context and the `TransferContext` are built here.
    ///
    /// # Errors
    /// Propagates failures from building the config, creating the CUDA context,
    /// or creating the `TransferContext`.
    pub fn build(self) -> Result<TransferManager> {
        let Self { builder, agent } = self;

        let config = builder.build_internal()?;
        let cuda_context = CudaContext::new(config.cuda_device_id)?;

        let context = TransferContext::new(
            agent,
            config.event_system,
            cuda_context,
            config.tokio_runtime,
            config.capabilities,
            config.cuda_pool_reserve_size,
            config.cuda_pool_release_threshold,
        )?;

        Ok(TransferManager::from_context(context))
    }

    /// Select the CUDA device id used when the CUDA context is created.
    pub fn cuda_device_id(self, cuda_device_id: usize) -> Self {
        let Self { builder, agent } = self;
        Self {
            builder: builder.cuda_device_id(cuda_device_id),
            agent,
        }
    }
}
/// Pick the Tokio runtime used by default for a transfer configuration.
///
/// When called from inside a Tokio runtime, that runtime's handle is reused.
/// Otherwise a dedicated multi-threaded runtime (2 worker threads, at most 4
/// blocking threads, all drivers enabled) is created and owned by the result.
///
/// # Panics
/// Panics if the dedicated runtime cannot be built.
fn get_tokio_runtime() -> TokioRuntime {
    if let Ok(handle) = tokio::runtime::Handle::try_current() {
        return TokioRuntime::Handle(handle);
    }

    let mut builder = tokio::runtime::Builder::new_multi_thread();
    builder
        .enable_all()
        .max_blocking_threads(4)
        .worker_threads(2);

    let runtime = builder.build().expect("failed to build tokio runtime");
    TokioRuntime::Shared(Arc::new(runtime))
}
/// Either a borrowed handle to an existing Tokio runtime or a shared, owned runtime.
///
/// `get_tokio_runtime` produces `Handle` when a runtime is already running on the
/// calling thread and `Shared` when it had to create one.
#[derive(Debug, Clone)]
#[doc(hidden)]
pub enum TokioRuntime {
    /// Handle to a runtime owned elsewhere.
    Handle(tokio::runtime::Handle),
    /// Runtime owned (via `Arc`) by this value and its clones.
    Shared(Arc<tokio::runtime::Runtime>),
}
impl TokioRuntime {
    /// Borrow the runtime handle, regardless of which variant holds it.
    pub fn handle(&self) -> &tokio::runtime::Handle {
        match self {
            Self::Shared(runtime) => runtime.handle(),
            Self::Handle(handle) => handle,
        }
    }
}
/// Shared state needed to execute transfers: the NIXL agent, CUDA context,
/// stream pools, memory pool, event system and the channels feeding the
/// background completion handlers.
///
/// Cloning is cheap-ish by construction: CUDA objects, counters and the event
/// system are held behind `Arc`, so clones share them.
#[derive(Clone)]
#[doc(hidden)]
pub struct TransferContext {
    /// Worker id, taken from `event_system.system_id()` at construction.
    worker_id: u64,
    /// NIXL agent used for NIXL transfers and status checks.
    nixl_agent: NixlAgent,
    /// CUDA context that owns the streams below.
    #[allow(dead_code)]
    cuda_context: Arc<CudaContext>,
    /// Fixed device-to-host stream (the first entry of `d2h_streams`).
    d2h_stream: Arc<CudaStream>,
    /// Fixed host-to-device stream (the first entry of `h2d_streams`).
    h2d_stream: Arc<CudaStream>,
    /// Pool of device-to-host streams handed out round-robin.
    d2h_streams: Vec<Arc<CudaStream>>,
    /// Pool of host-to-device streams handed out round-robin.
    h2d_streams: Vec<Arc<CudaStream>>,
    /// Round-robin counter for `d2h_streams`, shared between clones.
    current_d2h_stream: Arc<AtomicUsize>,
    /// Round-robin counter for `h2d_streams`, shared between clones.
    current_h2d_stream: Arc<AtomicUsize>,
    /// Runtime on which the background handlers were spawned.
    #[allow(dead_code)]
    tokio_runtime: TokioRuntime,
    /// Transfer capabilities supplied at construction.
    capabilities: TransferCapabilities,
    /// Event system used to allocate and await completion events.
    event_system: Arc<EventManager>,
    // CUDA memory pool for kernel allocations
    cuda_pool: Arc<CudaMemPool>,
    // Channels for background notification handlers
    /// Queue of NIXL transfers to be completed by status polling.
    tx_nixl_status: mpsc::Sender<RegisterPollingNotification<notifications::NixlStatusChecker>>,
    /// Queue of CUDA events to be completed by polling.
    tx_cuda_event: mpsc::Sender<RegisterPollingNotification<notifications::CudaEventChecker>>,
    /// Queue of NIXL transfers to be completed by NIXL notification events.
    #[allow(dead_code)]
    tx_nixl_events: mpsc::Sender<notifications::RegisterNixlNotification>,
}
impl
TransferContext
{
pub
fn
builder
()
->
TransferConfigBuilder
{
TransferConfigBuilder
::
default
()
}
pub
(
crate
)
fn
new
(
nixl_agent
:
NixlAgent
,
event_system
:
Arc
<
EventManager
>
,
cuda_context
:
Arc
<
CudaContext
>
,
tokio_runtime
:
TokioRuntime
,
capabilities
:
TransferCapabilities
,
cuda_pool_reserve_size
:
usize
,
cuda_pool_release_threshold
:
Option
<
u64
>
,
)
->
Result
<
Self
>
{
unsafe
{
cuda_context
.disable_event_tracking
()
};
// Create CUDA memory pool for kernel allocations
let
mut
pool_builder
=
CudaMemPool
::
builder
(
cuda_context
.clone
(),
cuda_pool_reserve_size
);
if
let
Some
(
threshold
)
=
cuda_pool_release_threshold
{
pool_builder
=
pool_builder
.release_threshold
(
threshold
);
}
let
cuda_pool
=
Arc
::
new
(
pool_builder
.build
()
?
);
// Create channels for background notification handlers
let
(
tx_nixl_status
,
rx_nixl_status
)
=
mpsc
::
channel
(
64
);
let
(
tx_cuda_event
,
rx_cuda_event
)
=
mpsc
::
channel
(
64
);
let
(
tx_nixl_events
,
rx_nixl_events
)
=
mpsc
::
channel
(
64
);
// Spawn background handlers
let
handle
=
tokio_runtime
.handle
();
// Spawn NIXL status polling handler
handle
.spawn
(
notifications
::
process_polling_notifications
(
rx_nixl_status
,
event_system
.clone
(),
));
// Spawn CUDA event polling handler
handle
.spawn
(
notifications
::
process_polling_notifications
(
rx_cuda_event
,
event_system
.clone
(),
));
// Spawn NIXL notification events handler
handle
.spawn
(
notifications
::
process_nixl_notification_events
(
nixl_agent
.raw_agent
()
.clone
(),
rx_nixl_events
,
event_system
.clone
(),
));
let
d2h_streams
:
Vec
<
Arc
<
CudaStream
>>
=
(
0
..
4
)
.map
(|
_
|
cuda_context
.new_stream
())
.collect
::
<
Result
<
Vec
<
_
>
,
_
>>
()
?
;
let
h2d_streams
:
Vec
<
Arc
<
CudaStream
>>
=
(
0
..
4
)
.map
(|
_
|
cuda_context
.new_stream
())
.collect
::
<
Result
<
Vec
<
_
>
,
_
>>
()
?
;
let
d2h_stream
=
d2h_streams
[
0
]
.clone
();
let
h2d_stream
=
h2d_streams
[
0
]
.clone
();
let
current_d2h_stream
=
Arc
::
new
(
AtomicUsize
::
new
(
0
));
let
current_h2d_stream
=
Arc
::
new
(
AtomicUsize
::
new
(
0
));
Ok
(
Self
{
worker_id
:
event_system
.system_id
(),
nixl_agent
,
cuda_context
:
cuda_context
.clone
(),
d2h_stream
,
h2d_stream
,
d2h_streams
,
h2d_streams
,
current_d2h_stream
,
current_h2d_stream
,
tokio_runtime
,
capabilities
,
event_system
,
cuda_pool
,
tx_nixl_status
,
tx_cuda_event
,
tx_nixl_events
,
})
}
pub
(
crate
)
fn
nixl_agent
(
&
self
)
->
&
NixlAgent
{
&
self
.nixl_agent
}
#[allow(dead_code)]
pub
(
crate
)
fn
cuda_context
(
&
self
)
->
&
Arc
<
CudaContext
>
{
&
self
.cuda_context
}
// Provides the same d2h stream per invocation
#[allow(dead_code)]
pub
(
crate
)
fn
d2h_stream
(
&
self
)
->
&
Arc
<
CudaStream
>
{
&
self
.d2h_stream
}
// Provides the same h2d stream per invocation
#[allow(dead_code)]
pub
(
crate
)
fn
h2d_stream
(
&
self
)
->
&
Arc
<
CudaStream
>
{
&
self
.h2d_stream
}
// Provides the next d2h stream in a round-robin fashion
pub
(
crate
)
fn
next_d2h_streams
(
&
self
)
->
Arc
<
CudaStream
>
{
let
current_d2h_stream
=
self
.current_d2h_stream
.fetch_add
(
1
,
Ordering
::
Relaxed
);
self
.d2h_streams
[
current_d2h_stream
%
self
.d2h_streams
.len
()]
.clone
()
}
// Provides the next h2d stream in a round-robin fashion
pub
(
crate
)
fn
next_h2d_streams
(
&
self
)
->
Arc
<
CudaStream
>
{
let
current_h2d_stream
=
self
.current_h2d_stream
.fetch_add
(
1
,
Ordering
::
Relaxed
);
self
.h2d_streams
[
current_h2d_stream
%
self
.h2d_streams
.len
()]
.clone
()
}
/// Acquire an H2D stream for use by caller.
///
/// This returns a stream from the pool that the caller can use for multiple
/// sequential operations. The caller is responsible for all synchronization
/// (e.g., recording events after operations).
///
/// Used for layer-wise transfers where all layers must execute on the same stream.
pub
fn
acquire_h2d_stream
(
&
self
)
->
Arc
<
CudaStream
>
{
self
.next_h2d_streams
()
}
/// Acquire a D2H stream for use by caller.
///
/// This returns a stream from the pool that the caller can use for multiple
/// sequential operations. The caller is responsible for all synchronization
/// (e.g., recording events after operations).
///
/// Used for layer-wise transfers where all layers must execute on the same stream.
pub
fn
acquire_d2h_stream
(
&
self
)
->
Arc
<
CudaStream
>
{
self
.next_d2h_streams
()
}
#[allow(dead_code)]
#[doc(hidden)]
pub
fn
tokio
(
&
self
)
->
&
tokio
::
runtime
::
Handle
{
self
.tokio_runtime
.handle
()
}
pub
(
crate
)
fn
capabilities
(
&
self
)
->
&
TransferCapabilities
{
&
self
.capabilities
}
#[doc(hidden)]
pub
fn
event_system
(
&
self
)
->
&
Arc
<
EventManager
>
{
&
self
.event_system
}
/// Get the CUDA memory pool for kernel allocations.
pub
(
crate
)
fn
cuda_pool
(
&
self
)
->
&
Arc
<
CudaMemPool
>
{
&
self
.cuda_pool
}
/// Register a NIXL transfer request for status polling completion.
///
/// This method enqueues the transfer request to be polled for completion
/// using `agent.get_xfer_status()`. Returns a notification object that
/// can be awaited for completion.
pub
(
crate
)
fn
register_nixl_status
(
&
self
,
xfer_req
:
XferRequest
,
)
->
TransferCompleteNotification
{
let
event
=
self
.event_system
.new_event
()
.expect
(
"Failed to allocate event"
);
let
handle
=
event
.into_handle
();
let
awaiter
=
self
.event_system
.awaiter
(
handle
)
.expect
(
"Failed to get awaiter"
);
let
notification
=
notifications
::
RegisterPollingNotification
{
uuid
:
Uuid
::
new_v4
(),
checker
:
notifications
::
NixlStatusChecker
::
new
(
self
.nixl_agent
.raw_agent
()
.clone
(),
xfer_req
,
),
event_handle
:
handle
,
};
// Send to background handler — log error if channel is full or closed
if
let
Err
(
e
)
=
self
.tx_nixl_status
.try_send
(
notification
)
{
tracing
::
error!
(
"Failed to enqueue NIXL status notification: channel full or closed: {}"
,
e
);
}
TransferCompleteNotification
::
from_awaiter
(
awaiter
)
}
/// Register a CUDA event for polling completion.
///
/// This method enqueues the CUDA event to be polled for completion.
/// Returns a notification object that can be awaited for completion.
pub
(
crate
)
fn
register_cuda_event
(
&
self
,
event
:
CudaEvent
)
->
TransferCompleteNotification
{
let
new_event
=
self
.event_system
.new_event
()
.expect
(
"Failed to allocate event"
);
let
handle
=
new_event
.into_handle
();
let
awaiter
=
self
.event_system
.awaiter
(
handle
)
.expect
(
"Failed to get awaiter"
);
let
notification
=
notifications
::
RegisterPollingNotification
{
uuid
:
Uuid
::
new_v4
(),
checker
:
notifications
::
CudaEventChecker
::
new
(
event
),
event_handle
:
handle
,
};
// Send to background handler — log error if channel is full or closed
if
let
Err
(
e
)
=
self
.tx_cuda_event
.try_send
(
notification
)
{
tracing
::
error!
(
"Failed to enqueue CUDA event notification: channel full or closed: {}"
,
e
);
}
TransferCompleteNotification
::
from_awaiter
(
awaiter
)
}
/// Register a NIXL transfer request for notification-based completion.
///
/// This method enqueues the transfer request to be completed via NIXL
/// notification events. Returns a notification object that can be awaited
/// for completion.
#[allow(dead_code)]
pub
(
crate
)
fn
register_nixl_event
(
&
self
,
xfer_req
:
XferRequest
,
)
->
TransferCompleteNotification
{
let
event
=
self
.event_system
.new_event
()
.expect
(
"Failed to allocate event"
);
let
handle
=
event
.into_handle
();
let
awaiter
=
self
.event_system
.awaiter
(
handle
)
.expect
(
"Failed to get awaiter"
);
let
notification
=
notifications
::
RegisterNixlNotification
{
uuid
:
Uuid
::
new_v4
(),
xfer_req
,
event_handle
:
handle
,
};
// Send to background handler — log error if channel is full or closed
if
let
Err
(
e
)
=
self
.tx_nixl_events
.try_send
(
notification
)
{
tracing
::
error!
(
"Failed to enqueue NIXL event notification: channel full or closed: {}"
,
e
);
}
TransferCompleteNotification
::
from_awaiter
(
awaiter
)
}
/// Get the worker ID for this context.
pub
(
crate
)
fn
worker_id
(
&
self
)
->
u64
{
self
.worker_id
}
}
lib/kvbm-physical/src/transfer/executor/cuda.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! CUDA executor for GPU memory transfers.
use
super
::
TransferContext
;
use
super
::{
PhysicalLayout
,
TransferStrategy
};
use
crate
::
BlockId
;
use
crate
::
transfer
::
context
::
TransferCompleteNotification
;
use
crate
::
transfer
::{
can_use_whole_block_transfer
,
validate_layout_compatibility
};
use
anyhow
::{
Result
,
anyhow
};
use
cudarc
::
driver
::{
CudaStream
,
result
as
cuda_result
};
use
cudarc
::
runtime
::
sys
::
cudaStream_t
;
use
dynamo_memory
::
CudaMemPool
;
use
kvbm_kernels
::
MemcpyBatchMode
;
use
std
::
ffi
::
c_void
;
use
std
::
ops
::
Range
;
use
std
::
sync
::
Arc
;
// #[cfg(test)]
// mod cuda_kernel_tests;
/// Execute a CUDA transfer between host and device memory.
///
/// This executor handles transfers involving GPU memory using CUDA APIs. Only the
/// asynchronous strategies (`CudaAsyncH2D`, `CudaAsyncD2H`, `CudaAsyncD2D`) are
/// accepted; the copies are enqueued on a CUDA stream and completion is reported
/// through the returned notification.
///
/// # Arguments
/// * `src` - Source physical layout
/// * `dst` - Destination physical layout
/// * `src_block_ids` - Source block IDs to transfer
/// * `dst_block_ids` - Destination block IDs to transfer (paired by index with
///   `src_block_ids`, so both slices must have the same length)
/// * `layer_range` - Optional range of layers to transfer (None = all layers)
/// * `strategy` - CUDA transfer strategy (async H2D, D2H or D2D)
/// * `cuda_stream` - Optional caller-provided stream. If provided, use this stream
///   and skip event recording (caller manages sync). Returns completed() immediately.
/// * `ctx` - Transfer context providing the stream pools and the CUDA memory pool
///
/// # Errors
/// Returns an error if the block ID slices differ in length, if the layouts are
/// incompatible, if `strategy` is not a CUDA strategy, or if any CUDA operation
/// fails.
#[allow(clippy::too_many_arguments)]
pub fn execute_cuda_transfer(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
    src_block_ids: &[BlockId],
    dst_block_ids: &[BlockId],
    layer_range: Option<Range<usize>>,
    strategy: TransferStrategy,
    cuda_stream: Option<Arc<CudaStream>>,
    ctx: &TransferContext,
) -> Result<TransferCompleteNotification> {
    // Blocks are paired by index. The copy helpers size their work from the
    // source list, so a shorter destination list must be rejected up front.
    if src_block_ids.len() != dst_block_ids.len() {
        return Err(anyhow!(
            "Block ID slice length mismatch: src={}, dst={}",
            src_block_ids.len(),
            dst_block_ids.len()
        ));
    }

    // Validate layouts
    let src_layout = src.layout();
    let dst_layout = dst.layout();

    if src_layout.num_layers() != dst_layout.num_layers() {
        return Err(anyhow!(
            "Layouts have incompatible layer counts: src={}, dst={}",
            src_layout.num_layers(),
            dst_layout.num_layers()
        ));
    }

    if src_layout.outer_dim() != dst_layout.outer_dim() {
        return Err(anyhow!(
            "Layouts have incompatible outer dimensions: src={}, dst={}",
            src_layout.outer_dim(),
            dst_layout.outer_dim()
        ));
    }

    // Validate layout compatibility (errors if transform would be needed)
    validate_layout_compatibility(src, dst)?;

    // Reject non-CUDA strategies before touching any stream, and pick the
    // direction name used for logging at the same time.
    let strategy_name = match strategy {
        TransferStrategy::CudaAsyncH2D => "H2D",
        TransferStrategy::CudaAsyncD2H => "D2H",
        TransferStrategy::CudaAsyncD2D => "D2D",
        _ => {
            return Err(anyhow!("Invalid CUDA transfer strategy: {:?}", strategy));
        }
    };

    // Determine layer range
    let layers = layer_range.clone().unwrap_or(0..src_layout.num_layers());

    // Check if we can use optimized whole-block transfer
    let use_whole_block = can_use_whole_block_transfer(src, dst, layer_range.as_ref());

    // Track whether caller provided stream (affects event recording)
    let caller_manages_sync = cuda_stream.is_some();

    // Get appropriate CUDA stream - use caller-provided or acquire from pool
    let stream = match cuda_stream {
        Some(provided) => provided,
        None => match strategy {
            TransferStrategy::CudaAsyncD2H => ctx.next_d2h_streams(),
            // H2D and D2D use the h2d stream pool
            _ => ctx.next_h2d_streams(),
        },
    };

    if use_whole_block {
        // FC→FC: Use unified whole-block path with batched memcpy
        // Direction auto-detected by cudaMemcpyDefault
        tracing::debug!(
            strategy = strategy_name,
            num_blocks = src_block_ids.len(),
            bytes_per_block = src_layout.config().bytes_per_block(),
            "Using whole-block transfer (auto direction)"
        );
        execute_whole_block_cuda(src, dst, src_block_ids, dst_block_ids, stream.as_ref())?;
    } else {
        // FC↔LW: Use vectorized_copy kernel directly
        tracing::debug!(
            strategy = strategy_name,
            num_blocks = src_block_ids.len(),
            num_layers = layers.len(),
            "Using vectorized_copy for FC↔LW transfer"
        );
        execute_fc_lw_vectorized(
            src,
            dst,
            src_block_ids,
            dst_block_ids,
            layers,
            stream.as_ref(),
            ctx.cuda_pool(),
        )?;
    }

    // If caller provided the stream, they manage synchronization - return completed immediately
    if caller_manages_sync {
        return Ok(TransferCompleteNotification::completed());
    }

    // Every accepted strategy is asynchronous: record an event behind the
    // enqueued work and register it for completion tracking.
    let event = stream.record_event(None)?;
    Ok(ctx.register_cuda_event(event))
}
// ============================================================================
// Whole-Block Transfer Functions (FC→FC optimization)
// ============================================================================
/// Unified whole-block transfer using batched memcpy.
///
/// NO device pointer allocation needed. Direction is auto-detected by CUDA
/// from pointer types using cudaMemcpyDefault.
///
/// Uses cudaMemcpyBatchAsync when available (CUDA 12.9+), falling back to
/// individual cudaMemcpyAsync calls on older CUDA versions.
///
/// Source and destination blocks are paired by index; each pair is copied as
/// one span of `bytes_per_block` bytes (taken from the source layout) starting
/// at the block's (layer 0, outer 0) address.
///
/// # Errors
/// Returns an error if the block ID slices differ in length, if a memory region
/// cannot be resolved, or if `memcpy_batch` reports a CUDA error.
fn execute_whole_block_cuda(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
    src_block_ids: &[BlockId],
    dst_block_ids: &[BlockId],
    stream: &cudarc::driver::CudaStream,
) -> Result<()> {
    // The copy count handed to CUDA must match the pointer tables exactly; a
    // shorter destination list would otherwise make CUDA read past their end.
    if src_block_ids.len() != dst_block_ids.len() {
        return Err(anyhow!(
            "Block ID slice length mismatch: src={}, dst={}",
            src_block_ids.len(),
            dst_block_ids.len()
        ));
    }

    let bytes_per_block = src.layout().config().bytes_per_block();
    let num_blocks = src_block_ids.len();

    if num_blocks == 0 {
        return Ok(());
    }

    // Build host pointer arrays
    let mut src_ptrs: Vec<*const std::ffi::c_void> = Vec::with_capacity(num_blocks);
    let mut dst_ptrs: Vec<*mut std::ffi::c_void> = Vec::with_capacity(num_blocks);

    for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
        let src_region = src.memory_region(src_block_id, 0, 0)?;
        let dst_region = dst.memory_region(dst_block_id, 0, 0)?;
        src_ptrs.push(src_region.addr() as *const std::ffi::c_void);
        dst_ptrs.push(dst_region.addr() as *mut std::ffi::c_void);
    }
    debug_assert_eq!(src_ptrs.len(), dst_ptrs.len());

    // Use batched memcpy - handles CUDA 12.9+ batch API with automatic fallback
    //
    // SAFETY: both tables hold exactly `src_ptrs.len()` entries and stay alive
    // for the duration of the call. NOTE(review): that each address is valid
    // for `bytes_per_block` bytes is assumed from the whole-block layout check
    // done by the caller — confirm.
    let status = unsafe {
        kvbm_kernels::memcpy_batch(
            src_ptrs.as_ptr(),
            dst_ptrs.as_ptr(),
            bytes_per_block,
            src_ptrs.len(),
            MemcpyBatchMode::BatchedWithFallback,
            stream.cu_stream() as cudarc::runtime::sys::cudaStream_t,
        )
    };

    if status != cudarc::runtime::sys::cudaError::cudaSuccess {
        return Err(anyhow!("memcpy_batch failed: {:?}", status));
    }

    tracing::debug!(
        num_blocks,
        bytes_per_block,
        batch_available = kvbm_kernels::is_memcpy_batch_available(),
        "Whole-block transfer completed"
    );

    Ok(())
}
// ============================================================================
// FC↔LW Transfer using vectorized_copy kernel
// ============================================================================
/// Execute FC↔LW transfer using vectorized_copy kernel.
///
/// This function builds flat (src, dst) pointer arrays for all chunks across all blocks,
/// uploads them to device memory, and calls the vectorized_copy kernel directly.
///
/// Benefits over the old operational_copy approach:
/// - Simpler: One kernel, no backend selection logic
/// - Faster: 16-byte (int4) loads when aligned (vs 8-byte in operational_copy_vectorized)
/// - All offset math on host: Kernel just copies bytes
/// - Handles any alignment: Falls back gracefully to 8/4/1-byte copies
fn
execute_fc_lw_vectorized
(
src
:
&
PhysicalLayout
,
dst
:
&
PhysicalLayout
,
src_block_ids
:
&
[
BlockId
],
dst_block_ids
:
&
[
BlockId
],
layers
:
Range
<
usize
>
,
stream
:
&
CudaStream
,
pool
:
&
CudaMemPool
,
)
->
Result
<
()
>
{
// Bind CUDA context to current thread before any CUDA operations.
stream
.context
()
.bind_to_thread
()
?
;
let
src_layout
=
src
.layout
();
let
nl
=
layers
.len
();
let
no
=
src_layout
.outer_dim
();
let
chunk_size
=
src_layout
.page_size
()
*
src_layout
.inner_dim
()
*
src_layout
.dtype_width_bytes
();
let
num_blocks
=
src_block_ids
.len
();
let
total_chunks
=
num_blocks
*
nl
*
no
;
if
total_chunks
==
0
{
return
Ok
(());
}
// Build flat pointer arrays on host
let
mut
src_ptrs
:
Vec
<
usize
>
=
Vec
::
with_capacity
(
total_chunks
);
let
mut
dst_ptrs
:
Vec
<
usize
>
=
Vec
::
with_capacity
(
total_chunks
);
for
(
&
src_block_id
,
&
dst_block_id
)
in
src_block_ids
.iter
()
.zip
(
dst_block_ids
.iter
())
{
for
layer_id
in
layers
.clone
()
{
for
outer_id
in
0
..
no
{
let
src_region
=
src
.memory_region
(
src_block_id
,
layer_id
,
outer_id
)
?
;
let
dst_region
=
dst
.memory_region
(
dst_block_id
,
layer_id
,
outer_id
)
?
;
src_ptrs
.push
(
src_region
.addr
());
dst_ptrs
.push
(
dst_region
.addr
());
}
}
}
// Allocate device memory for pointer arrays
let
src_ptrs_device
=
pool
.alloc_async
(
total_chunks
*
std
::
mem
::
size_of
::
<
usize
>
(),
stream
)
?
;
let
dst_ptrs_device
=
pool
.alloc_async
(
total_chunks
*
std
::
mem
::
size_of
::
<
usize
>
(),
stream
)
?
;
// Upload pointer arrays to device
unsafe
{
cuda_result
::
memcpy_htod_async
(
src_ptrs_device
,
std
::
slice
::
from_raw_parts
(
src_ptrs
.as_ptr
()
as
*
const
u8
,
total_chunks
*
std
::
mem
::
size_of
::
<
usize
>
(),
),
stream
.cu_stream
(),
)
?
;
cuda_result
::
memcpy_htod_async
(
dst_ptrs_device
,
std
::
slice
::
from_raw_parts
(
dst_ptrs
.as_ptr
()
as
*
const
u8
,
total_chunks
*
std
::
mem
::
size_of
::
<
usize
>
(),
),
stream
.cu_stream
(),
)
?
;
}
let
pointers_transfered_event
=
stream
.record_event
(
None
)
?
;
// Call vectorized_copy kernel
let
status
=
unsafe
{
kvbm_kernels
::
vectorized_copy
(
src_ptrs_device
as
*
mut
*
mut
c_void
,
dst_ptrs_device
as
*
mut
*
mut
c_void
,
chunk_size
,
total_chunks
as
i32
,
stream
.cu_stream
()
as
cudaStream_t
,
)
};
// Free device allocations back to the pool
pool
.free_async
(
src_ptrs_device
,
stream
)
?
;
pool
.free_async
(
dst_ptrs_device
,
stream
)
?
;
if
status
!=
cudarc
::
runtime
::
sys
::
cudaError
::
cudaSuccess
{
return
Err
(
anyhow!
(
"vectorized_copy failed: {:?}"
,
status
));
}
tracing
::
debug!
(
total_chunks
,
chunk_size
,
"FC↔LW vectorized_copy transfer completed"
);
pointers_transfered_event
.synchronize
()
?
;
Ok
(())
}
lib/kvbm-physical/src/transfer/executor/memcpy.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Memcpy executor for host-to-host transfers.
use
crate
::
BlockId
;
use
crate
::
transfer
::
PhysicalLayout
;
use
crate
::
transfer
::
TransferContext
;
use
crate
::
transfer
::
context
::
TransferCompleteNotification
;
use
crate
::
transfer
::{
can_use_whole_block_transfer
,
validate_layout_compatibility
};
use
anyhow
::
Result
;
use
std
::
ops
::
Range
;
/// Execute a memcpy transfer between host memory locations.
///
/// This executor handles transfers between System and Pinned memory using
/// standard CPU memcpy operations. The transfer is synchronous and blocking.
///
/// For FC→FC transfers with compatible layouts and full-block transfers,
/// this uses an optimized whole-block copy path (single memcpy per block).
/// Otherwise, falls back to layer-wise copying.
///
/// # Arguments
/// * `src` - Source physical layout
/// * `dst` - Destination physical layout
/// * `src_block_ids` - Source block IDs to transfer
/// * `dst_block_ids` - Destination block IDs to transfer
/// * `layer_range` - Optional range of layers to transfer (None = all layers)
/// * `_ctx` - Transfer context (unused for memcpy, kept for API consistency)
pub
fn
execute_memcpy_transfer
(
src
:
&
PhysicalLayout
,
dst
:
&
PhysicalLayout
,
src_block_ids
:
&
[
BlockId
],
dst_block_ids
:
&
[
BlockId
],
layer_range
:
Option
<
Range
<
usize
>>
,
_
ctx
:
&
TransferContext
,
)
->
Result
<
TransferCompleteNotification
>
{
if
src_block_ids
.len
()
!=
dst_block_ids
.len
()
{
return
Err
(
anyhow
::
anyhow!
(
"Block ID slice length mismatch: src={}, dst={}"
,
src_block_ids
.len
(),
dst_block_ids
.len
()
));
}
// Validate layouts have compatible structure
let
src_layout
=
src
.layout
();
let
dst_layout
=
dst
.layout
();
if
src_layout
.num_layers
()
!=
dst_layout
.num_layers
()
{
return
Err
(
anyhow
::
anyhow!
(
"Layouts have incompatible layer counts: src={}, dst={}"
,
src_layout
.num_layers
(),
dst_layout
.num_layers
()
));
}
if
src_layout
.outer_dim
()
!=
dst_layout
.outer_dim
()
{
return
Err
(
anyhow
::
anyhow!
(
"Layouts have incompatible outer dimensions: src={}, dst={}"
,
src_layout
.outer_dim
(),
dst_layout
.outer_dim
()
));
}
// Validate layout compatibility (errors if transform would be needed)
validate_layout_compatibility
(
src
,
dst
)
?
;
let
layers
=
layer_range
.clone
()
.unwrap_or
(
0
..
src_layout
.num_layers
());
// Try whole-block path for FC→FC transfers with compatible layouts
if
can_use_whole_block_transfer
(
src
,
dst
,
layer_range
.as_ref
())
{
tracing
::
debug!
(
num_blocks
=
src_block_ids
.len
(),
bytes_per_block
=
src_layout
.config
()
.bytes_per_block
(),
"Using whole-block memcpy path"
);
execute_whole_block_memcpy
(
src
,
dst
,
src_block_ids
,
dst_block_ids
)
?
;
}
else
{
tracing
::
debug!
(
num_blocks
=
src_block_ids
.len
(),
layer_range
=
?
layers
,
src_fc
=
src_layout
.is_fully_contiguous
(),
dst_fc
=
dst_layout
.is_fully_contiguous
(),
"Using layer-wise memcpy path"
);
execute_layer_wise_memcpy
(
src
,
dst
,
src_block_ids
,
dst_block_ids
,
layers
)
?
;
}
// Memcpy is synchronous, so return already-completed notification
Ok
(
TransferCompleteNotification
::
completed
())
}
/// Copy whole blocks, one memcpy per block, for FC→FC transfers with
/// compatible layouts.
///
/// Source and destination blocks are paired by index. Each pair is copied as a
/// single span of `bytes_per_block` bytes (from the source layout), relying on
/// the fully contiguous memory layout.
///
/// # Errors
/// Fails if a block's memory region cannot be resolved.
fn execute_whole_block_memcpy(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
    src_block_ids: &[BlockId],
    dst_block_ids: &[BlockId],
) -> Result<()> {
    let block_len = src.layout().config().bytes_per_block();

    let block_pairs = src_block_ids.iter().zip(dst_block_ids);
    for (src_id, dst_id) in block_pairs {
        // For an FC layout, (layer 0, outer 0) is the base address of the block.
        let from = src.memory_region(*src_id, 0, 0)?.addr() as *const u8;
        let to = dst.memory_region(*dst_id, 0, 0)?.addr() as *mut u8;

        // NOTE(review): soundness relies on both addresses being valid for
        // `block_len` bytes and on the two spans not overlapping; neither is
        // checked here — presumably guaranteed by the caller's layout checks.
        unsafe {
            std::ptr::copy_nonoverlapping(from, to, block_len);
        }
    }

    Ok(())
}
/// Copy blocks one (layer, outer) region at a time.
///
/// Used for FC→LW, LW→FC, LW→LW, or partial layer transfers. Source and
/// destination blocks are paired by index; for every layer in `layers` and
/// every outer index of the source layout, the matching regions are copied
/// with a single memcpy.
///
/// # Errors
/// Fails if a memory region cannot be resolved or if a source region and its
/// destination region differ in size.
fn execute_layer_wise_memcpy(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
    src_block_ids: &[BlockId],
    dst_block_ids: &[BlockId],
    layers: Range<usize>,
) -> Result<()> {
    let outer_count = src.layout().outer_dim();

    for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids) {
        let (src_block_id, dst_block_id) = (*src_id, *dst_id);

        for layer_id in layers.clone() {
            for outer_id in 0..outer_count {
                let src_region = src.memory_region(src_block_id, layer_id, outer_id)?;
                let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?;

                // Never copy between regions of different size.
                let src_len = src_region.size();
                let dst_len = dst_region.size();
                if src_len != dst_len {
                    return Err(anyhow::anyhow!(
                        "Memory region size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}",
                        src_block_id,
                        dst_block_id,
                        layer_id,
                        outer_id,
                        src_len,
                        dst_len
                    ));
                }

                let from = src_region.addr() as *const u8;
                let to = dst_region.addr() as *mut u8;

                // NOTE(review): assumes both regions are valid for `src_len`
                // bytes and do not overlap — not checked here.
                unsafe {
                    std::ptr::copy_nonoverlapping(from, to, src_len);
                }
            }
        }
    }

    Ok(())
}
lib/kvbm-physical/src/transfer/executor/mod.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Transfer executors for different copy strategies.
pub
(
super
)
mod
cuda
;
mod
memcpy
;
mod
nixl
;
use
super
::
strategy
::
select_strategy
;
use
super
::
strategy
::{
TransferPlan
,
TransferStrategy
};
use
super
::
validation
::
validate_block_transfer
;
use
super
::{
PhysicalLayout
,
TransferContext
};
use
crate
::
BlockId
;
use
crate
::
layout
::
KvBlockLayout
;
use
crate
::
transfer
::
BounceBufferInternal
;
use
crate
::
transfer
::{
StorageKind
,
context
::
TransferCompleteNotification
};
use
anyhow
::
Result
;
use
cudarc
::
driver
::
CudaStream
;
use
std
::
ops
::
Range
;
use
std
::
sync
::
Arc
;
use
tokio
::
sync
::
Mutex
;
// Re-export the NIXL transfer builder for public use
pub
use
nixl
::
NixlTransferBuilder
;
/// Kind of data rearrangement needed to move a block between two [`KvBlockLayout`]s.
///
/// Values of this type are produced by [`select_transform_kernel`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum TransformKernel {
    /// The layouts are compatible, so a plain copy is sufficient.
    None,
    /// Convert from an operational (NHD/HND) layout into a universal format.
    BlockToUniversal {
        /// Operational layout the source blocks are stored in.
        src_layout: KvBlockLayout,
    },
    /// Convert from a universal format into an operational (NHD/HND) layout.
    UniversalToBlock {
        /// Operational layout the destination blocks are stored in.
        dst_layout: KvBlockLayout,
    },
    /// Transpose between the two operational layouts (NHD <-> HND).
    OperationalTranspose,
    /// The layouts differ and no kernel exists for this combination.
    Unsupported,
}
/// Pick the transformation kernel required to go from `src_layout` to `dst_layout`.
///
/// * `TransformKernel::None` when `src_layout.requires_transform(&dst_layout)` is false
///   (a copy is sufficient).
/// * `TransformKernel::Unsupported` when either side is `Unknown` or `Custom`, for
///   universal-to-universal conversions, and for any other unhandled pair.
/// * Otherwise the kernel matching the operational/universal direction, or the
///   NHD <-> HND transpose.
#[allow(dead_code)]
pub(crate) fn select_transform_kernel(
    src_layout: KvBlockLayout,
    dst_layout: KvBlockLayout,
) -> TransformKernel {
    // Compatible layouts short-circuit before any pair-wise classification.
    if !src_layout.requires_transform(&dst_layout) {
        return TransformKernel::None;
    }

    match (src_layout, dst_layout) {
        // An unknown layout on either side can never be transformed.
        (KvBlockLayout::Unknown, _) | (_, KvBlockLayout::Unknown) => TransformKernel::Unsupported,

        // Operational -> universal
        (
            KvBlockLayout::OperationalNHD | KvBlockLayout::OperationalHND,
            KvBlockLayout::UniversalTP | KvBlockLayout::UniversalPP,
        ) => TransformKernel::BlockToUniversal { src_layout },

        // Universal -> operational
        (
            KvBlockLayout::UniversalTP | KvBlockLayout::UniversalPP,
            KvBlockLayout::OperationalNHD | KvBlockLayout::OperationalHND,
        ) => TransformKernel::UniversalToBlock { dst_layout },

        // Operational NHD <-> HND
        (KvBlockLayout::OperationalNHD, KvBlockLayout::OperationalHND)
        | (KvBlockLayout::OperationalHND, KvBlockLayout::OperationalNHD) => {
            TransformKernel::OperationalTranspose
        }

        // Custom layouts need explicit handling that does not exist yet.
        (KvBlockLayout::Custom(_), _) | (_, KvBlockLayout::Custom(_)) => {
            TransformKernel::Unsupported
        }

        // TODO: Add direct universal-to-universal kernel
        (KvBlockLayout::UniversalTP, KvBlockLayout::UniversalPP)
        | (KvBlockLayout::UniversalPP, KvBlockLayout::UniversalTP) => TransformKernel::Unsupported,

        // Anything not listed above has no kernel.
        _ => TransformKernel::Unsupported,
    }
}
/// Resolve the block layout to assume for the source side.
///
/// Returns `override_layout` when it is `Some`; otherwise falls back to
/// `src.layout().block_layout()`, which is only evaluated in that case.
#[expect(dead_code)]
pub(crate) fn effective_src_layout(
    src: &PhysicalLayout,
    override_layout: Option<KvBlockLayout>,
) -> KvBlockLayout {
    match override_layout {
        Some(forced) => forced,
        None => src.layout().block_layout(),
    }
}
/// Resolve the block layout to assume for the destination side.
///
/// Returns `override_layout` when it is `Some`; otherwise falls back to
/// `dst.layout().block_layout()`, which is only evaluated in that case.
#[expect(dead_code)]
pub(crate) fn effective_dst_layout(
    dst: &PhysicalLayout,
    override_layout: Option<KvBlockLayout>,
) -> KvBlockLayout {
    match override_layout {
        Some(forced) => forced,
        None => dst.layout().block_layout(),
    }
}
/// Optional knobs for a single call to [`execute_transfer`].
///
/// Build values through [`TransferOptionsInternal::builder`]. Every field is an
/// `Option`, so the derived `Default` leaves all of them unset.
#[derive(Default)]
#[expect(dead_code)]
pub(crate) struct TransferOptionsInternal {
    /// Layers to transfer; forwarded unchanged to the selected executor.
    layer_range: Option<Range<usize>>,

    /// Notification value for NIXL writes.
    /// NOTE(review): not read by any executor visible in this module.
    nixl_write_notification: Option<u64>,

    /// Staging layout and block IDs; required when the selected plan is two-hop.
    bounce_buffer: Option<BounceBufferInternal>,

    /// If provided, use this stream instead of acquiring from pool.
    /// Caller manages synchronization - no event is recorded by the executor.
    pub(crate) cuda_stream: Option<Arc<CudaStream>>,

    /// Override source block layout interpretation.
    /// If None, uses the layout's block_layout() method.
    pub(crate) src_kv_layout: Option<KvBlockLayout>,

    /// Override destination block layout interpretation.
    /// If None, uses the layout's block_layout() method.
    pub(crate) dst_kv_layout: Option<KvBlockLayout>,
}
impl
TransferOptionsInternal
{
pub
(
crate
)
fn
builder
()
->
TransferOptionsInternalBuilder
{
TransferOptionsInternalBuilder
::
default
()
}
}
/// Builder for [`TransferOptionsInternal`].
///
/// Mirrors the fields of the options struct one-to-one; all start as `None`.
#[derive(Default)]
pub(crate) struct TransferOptionsInternalBuilder {
    /// Pending value for `TransferOptionsInternal::layer_range`.
    layer_range: Option<Range<usize>>,
    /// Pending value for `TransferOptionsInternal::nixl_write_notification`.
    nixl_write_notification: Option<u64>,
    /// Pending value for `TransferOptionsInternal::bounce_buffer`.
    bounce_buffer: Option<BounceBufferInternal>,
    /// Pending value for `TransferOptionsInternal::cuda_stream`.
    cuda_stream: Option<Arc<CudaStream>>,
    /// Pending value for `TransferOptionsInternal::src_kv_layout`.
    src_kv_layout: Option<KvBlockLayout>,
    /// Pending value for `TransferOptionsInternal::dst_kv_layout`.
    dst_kv_layout: Option<KvBlockLayout>,
}
impl TransferOptionsInternalBuilder {
    /// Restrict the transfer to the given range of layers.
    pub(crate) fn layer_range(self, range: Range<usize>) -> Self {
        Self {
            layer_range: Some(range),
            ..self
        }
    }

    /// Record the notification value to attach to a NIXL write.
    pub(crate) fn nixl_write_notification(self, notification: u64) -> Self {
        Self {
            nixl_write_notification: Some(notification),
            ..self
        }
    }

    /// Provide the bounce buffer used by two-hop transfers.
    pub(crate) fn bounce_buffer(self, bounce_buffer: BounceBufferInternal) -> Self {
        Self {
            bounce_buffer: Some(bounce_buffer),
            ..self
        }
    }

    /// Set a specific CUDA stream to use for this transfer.
    ///
    /// When provided, the executor will use this stream instead of acquiring
    /// one from the pool. The caller is responsible for synchronization -
    /// no event is recorded by the executor.
    ///
    /// This is useful for layer-wise transfers where all layers must execute
    /// on the same stream to allow proper event sequencing.
    pub(crate) fn cuda_stream(self, stream: Arc<CudaStream>) -> Self {
        Self {
            cuda_stream: Some(stream),
            ..self
        }
    }

    /// Override the source block layout interpretation.
    ///
    /// When set, the transfer executor will treat source blocks as having
    /// this layout instead of the layout's default block_layout().
    /// This enables transferring blocks that are stored in one format
    /// but should be interpreted as another.
    pub(crate) fn src_kv_layout(self, layout: KvBlockLayout) -> Self {
        Self {
            src_kv_layout: Some(layout),
            ..self
        }
    }

    /// Override the destination block layout interpretation.
    ///
    /// When set, the transfer executor will treat destination blocks as having
    /// this layout instead of the layout's default block_layout().
    /// This enables writing blocks in a different format than the destination
    /// layout's native format.
    pub(crate) fn dst_kv_layout(self, layout: KvBlockLayout) -> Self {
        Self {
            dst_kv_layout: Some(layout),
            ..self
        }
    }

    /// Finish the builder.
    ///
    /// Copies every pending field into a [`TransferOptionsInternal`]; the current
    /// implementation always returns `Ok`.
    pub(crate) fn build(self) -> Result<TransferOptionsInternal> {
        let Self {
            layer_range,
            nixl_write_notification,
            bounce_buffer,
            cuda_stream,
            src_kv_layout,
            dst_kv_layout,
        } = self;

        Ok(TransferOptionsInternal {
            layer_range,
            nixl_write_notification,
            bounce_buffer,
            cuda_stream,
            src_kv_layout,
            dst_kv_layout,
        })
    }
}
/// Execute a transfer between two physical layouts.
///
/// This is an internal entry point for all transfer operations called by TransportManager.
/// It selects the appropriate strategy and dispatches to the corresponding executor.
///
/// # Arguments
/// * `src` - Source physical layout
/// * `dst` - Destination physical layout
/// * `src_block_ids` - Source block IDs to transfer
/// * `dst_block_ids` - Destination block IDs to transfer
/// * `options` - Transfer options
/// * `ctx` - Transfer context with CUDA stream and NIXL agent
pub
(
crate
)
fn
execute_transfer
(
src
:
&
PhysicalLayout
,
dst
:
&
PhysicalLayout
,
src_block_ids
:
&
[
BlockId
],
dst_block_ids
:
&
[
BlockId
],
options
:
TransferOptionsInternal
,
ctx
:
&
TransferContext
,
)
->
Result
<
TransferCompleteNotification
>
{
// Validate block IDs
validate_block_transfer
(
src_block_ids
,
dst_block_ids
,
None
,
src
,
dst
,
None
)
?
;
// Select transfer plan based on locations and capabilities
let
plan
=
select_strategy
(
src
,
dst
,
ctx
)
?
;
// Dispatch based on plan type
match
plan
{
TransferPlan
::
Direct
(
strategy
)
=>
execute_direct_transfer
(
src
,
dst
,
src_block_ids
,
dst_block_ids
,
options
.layer_range
,
strategy
,
options
.cuda_stream
,
ctx
,
),
TransferPlan
::
TwoHop
{
first
,
bounce_location
,
second
,
}
=>
execute_two_hop_transfer
(
TwoHopTransferParams
{
src
,
dst
,
src_block_ids
,
dst_block_ids
,
first_strategy
:
first
,
bounce_location
,
second_strategy
:
second
,
options
,
ctx
,
}),
}
}
/// Run a single-hop transfer with an already selected `strategy`.
///
/// * `Memcpy` and the NIXL strategies reject a caller-supplied `cuda_stream`.
/// * The CUDA strategies forward `cuda_stream` to the CUDA executor.
/// * `Invalid` always fails, reporting both layout locations.
#[allow(clippy::too_many_arguments)]
fn execute_direct_transfer(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
    src_block_ids: &[BlockId],
    dst_block_ids: &[BlockId],
    layer_range: Option<Range<usize>>,
    strategy: TransferStrategy,
    cuda_stream: Option<Arc<CudaStream>>,
    ctx: &TransferContext,
) -> Result<TransferCompleteNotification> {
    match strategy {
        TransferStrategy::Memcpy if cuda_stream.is_some() => Err(anyhow::anyhow!(
            "cuda_stream option is not supported for Memcpy strategy"
        )),
        TransferStrategy::Memcpy => memcpy::execute_memcpy_transfer(
            src,
            dst,
            src_block_ids,
            dst_block_ids,
            layer_range,
            ctx,
        ),

        TransferStrategy::CudaAsyncH2D
        | TransferStrategy::CudaAsyncD2H
        | TransferStrategy::CudaAsyncD2D => Ok(cuda::execute_cuda_transfer(
            src,
            dst,
            src_block_ids,
            dst_block_ids,
            layer_range,
            strategy,
            cuda_stream,
            ctx,
        )?),

        TransferStrategy::NixlRead
        | TransferStrategy::NixlWrite
        | TransferStrategy::NixlReadFlipped
        | TransferStrategy::NixlWriteFlipped
            if cuda_stream.is_some() =>
        {
            Err(anyhow::anyhow!(
                "cuda_stream option is not supported for NIXL strategies"
            ))
        }
        TransferStrategy::NixlRead
        | TransferStrategy::NixlWrite
        | TransferStrategy::NixlReadFlipped
        | TransferStrategy::NixlWriteFlipped => {
            let request = NixlTransferBuilder::new()
                .src(src)
                .dst(dst)
                .src_blocks(src_block_ids)
                .dst_blocks(dst_block_ids)
                .strategy(strategy);

            // The layer range is optional on the NIXL builder; only set it when given.
            let request = match layer_range {
                Some(range) => request.layer_range(range),
                None => request,
            };
            request.execute(ctx)
        }

        TransferStrategy::Invalid => Err(anyhow::anyhow!(
            "Invalid transfer strategy for src={:?}, dst={:?}",
            src.location(),
            dst.location()
        )),
    }
}
/// Work-stealing bounce buffer transfer using two parallel tasks.
///
/// This function implements a work-stealing approach where two tasks each take
/// batches from a shared iterator and execute complete two-hop transfers.
/// This is simpler to maintain than double-buffering while still providing
/// good throughput through task parallelism.
///
/// # Algorithm
/// 1. Split bounce buffer into two groups (group 0 and group 1)
/// 2. Create a shared iterator over (src_block_id, dst_block_id) pairs
/// 3. Two parallel tasks each:
/// - Lock the iterator, take a batch of pairs
/// - Execute the complete two-hop transfer for that batch
/// - Repeat until iterator is exhausted
#[allow(clippy::too_many_arguments)]
async
fn
handle_buffered_transfer
(
src
:
&
PhysicalLayout
,
bounce_layout
:
&
PhysicalLayout
,
dst
:
&
PhysicalLayout
,
src_block_ids
:
&
[
BlockId
],
bounce_block_ids
:
&
[
BlockId
],
dst_block_ids
:
&
[
BlockId
],
first_strategy
:
TransferStrategy
,
second_strategy
:
TransferStrategy
,
layer_range
:
&
Option
<
Range
<
usize
>>
,
ctx
:
&
TransferContext
,
)
->
Result
<
()
>
{
let
bounce_groups
=
&
bounce_block_ids
[
0
..
std
::
cmp
::
min
(
src_block_ids
.len
(),
bounce_block_ids
.len
())];
let
(
bounce_group_0
,
bounce_group_1
)
=
bounce_groups
.split_at
(
bounce_groups
.len
()
/
2
);
let
bounce_group_0
=
bounce_group_0
.to_vec
();
let
bounce_group_1
=
bounce_group_1
.to_vec
();
let
src_dst_iter
=
Arc
::
new
(
Mutex
::
new
(
src_block_ids
.iter
()
.zip
(
dst_block_ids
.iter
())));
let
transfer_task
=
async
move
|
bounce_group
:
&
[
BlockId
]|
->
Result
<
()
>
{
loop
{
let
(
src_ids
,
dst_ids
):
(
Vec
<
BlockId
>
,
Vec
<
BlockId
>
);
{
let
mut
x
=
src_dst_iter
.lock
()
.await
;
(
src_ids
,
dst_ids
)
=
x
.by_ref
()
.take
(
bounce_group
.len
())
.map
(|(
&
s
,
&
d
)|
(
s
,
d
))
.unzip
();
if
src_ids
.is_empty
()
{
break
;
}
}
execute_two_hop_transfer_chunk
(
src
,
bounce_layout
,
dst
,
&
src_ids
,
&
bounce_group
[
0
..
src_ids
.len
()],
&
dst_ids
,
first_strategy
,
second_strategy
,
layer_range
,
ctx
,
)
.await
?
;
}
Ok
(())
};
let
transfer_0
=
transfer_task
(
&
bounce_group_0
);
let
transfer_1
=
transfer_task
(
&
bounce_group_1
);
futures
::
future
::
try_join
(
transfer_0
,
transfer_1
)
.await
?
;
Ok
(())
}
/// Execute a single chunk of a two-hop transfer sequentially.
///
/// Used when bounce buffer has only a single block or as a fallback.
/// Performs src→bounce followed by bounce→dst sequentially.
#[allow(clippy::too_many_arguments)]
async
fn
execute_two_hop_transfer_chunk
(
src
:
&
PhysicalLayout
,
bounce_layout
:
&
PhysicalLayout
,
dst
:
&
PhysicalLayout
,
src_block_ids
:
&
[
BlockId
],
bounce_block_ids
:
&
[
BlockId
],
dst_block_ids
:
&
[
BlockId
],
first_strategy
:
TransferStrategy
,
second_strategy
:
TransferStrategy
,
layer_range
:
&
Option
<
Range
<
usize
>>
,
ctx
:
&
TransferContext
,
)
->
Result
<
()
>
{
let
bounce_ids_to_use
=
&
bounce_block_ids
[
..
src_block_ids
.len
()];
execute_direct_transfer
(
src
,
bounce_layout
,
src_block_ids
,
bounce_ids_to_use
,
layer_range
.clone
(),
first_strategy
,
None
,
// Two-hop transfers don't support caller-provided streams
ctx
,
)
?
.await
?
;
execute_direct_transfer
(
bounce_layout
,
dst
,
bounce_ids_to_use
,
dst_block_ids
,
layer_range
.clone
(),
second_strategy
,
None
,
// Two-hop transfers don't support caller-provided streams
ctx
,
)
?
.await
?
;
Ok
(())
}
/// Argument bundle for [`execute_two_hop_transfer`].
///
/// Groups everything `execute_transfer` knows about a two-hop plan so the
/// executor takes a single parameter.
struct TwoHopTransferParams<'a> {
    /// Layout the data is read from.
    src: &'a PhysicalLayout,
    /// Layout the data is written to.
    dst: &'a PhysicalLayout,
    /// Source block IDs, paired positionally with `dst_block_ids`.
    src_block_ids: &'a [BlockId],
    /// Destination block IDs, paired positionally with `src_block_ids`.
    dst_block_ids: &'a [BlockId],
    /// Strategy for the src -> bounce hop.
    first_strategy: TransferStrategy,
    /// Storage kind the plan expects the bounce buffer layout to report.
    bounce_location: StorageKind,
    /// Strategy for the bounce -> dst hop.
    second_strategy: TransferStrategy,
    /// Caller options; the bounce buffer and layer range are read from here.
    options: TransferOptionsInternal,
    /// Transfer context used for events, task spawning and the individual hops.
    ctx: &'a TransferContext,
}
/// Start a two-hop transfer in the background and return its completion notification.
///
/// A new event is created on the context's event system and an awaiter for it is
/// wrapped into the returned notification. The actual work runs in a task spawned
/// on `ctx.tokio()`, which triggers the event on success and poisons it with the
/// error text otherwise. The event is poisoned without transferring anything when
/// no bounce buffer was supplied or its layout location differs from
/// `bounce_location`.
///
/// A bounce buffer with exactly one block moves the blocks one at a time; any
/// other block count goes through `handle_buffered_transfer`.
///
/// # Errors
/// Fails only if the event or its awaiter cannot be created; transfer failures
/// are reported through the notification.
fn execute_two_hop_transfer(params: TwoHopTransferParams) -> Result<TransferCompleteNotification> {
    let TwoHopTransferParams {
        src,
        dst,
        src_block_ids,
        dst_block_ids,
        first_strategy,
        bounce_location,
        second_strategy,
        options,
        ctx,
    } = params;

    let handle = ctx.event_system().new_event()?.into_handle();
    let awaiter = ctx.event_system().awaiter(handle)?;
    let system = ctx.event_system().clone();

    // TODO: Cloning all this stuff is not ideal.
    // The spawned task cannot borrow from the caller, so it gets owned copies.
    let src_clone = src.clone();
    let dst_clone = dst.clone();
    let src_block_ids = src_block_ids.to_vec();
    let dst_block_ids = dst_block_ids.to_vec();
    let ctx_clone = ctx.clone();

    ctx.tokio().spawn(async move {
        let Some(bounce) = options.bounce_buffer.as_ref() else {
            let _ = system.poison(
                handle,
                "Two-hop transfers require a bounce buffer.".to_string(),
            );
            return;
        };

        if bounce.layout.location() != bounce_location {
            let _ = system.poison(
                handle,
                "Bounce buffer layout does not match bounce location.".to_string(),
            );
            return;
        }

        let outcome: Result<()> = if bounce.block_ids.len() == 1 {
            // Single bounce block: use sequential processing for each block
            let bounce_block = bounce.block_ids[0];
            let mut status = Ok(());
            for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
                status = execute_two_hop_transfer_chunk(
                    &src_clone,
                    &bounce.layout,
                    &dst_clone,
                    &[src_block_id],
                    &[bounce_block],
                    &[dst_block_id],
                    first_strategy,
                    second_strategy,
                    &options.layer_range,
                    &ctx_clone,
                )
                .await;
                // Stop at the first failing block; later blocks are not attempted.
                if status.is_err() {
                    break;
                }
            }
            status
        } else {
            // Multiple bounce blocks: use work-stealing parallel transfer
            handle_buffered_transfer(
                &src_clone,
                &bounce.layout,
                &dst_clone,
                &src_block_ids,
                &bounce.block_ids,
                &dst_block_ids,
                first_strategy,
                second_strategy,
                &options.layer_range,
                &ctx_clone,
            )
            .await
        };

        // Resolve the event exactly once; failures to signal are deliberately ignored.
        match outcome {
            Ok(()) => {
                let _ = system.trigger(handle);
            }
            Err(e) => {
                let _ = system.poison(handle, e.to_string());
            }
        }
    });

    Ok(TransferCompleteNotification::from_awaiter(awaiter))
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;

    /// Identical layouts never need a kernel.
    #[test]
    fn test_select_transform_kernel_same_layout() {
        for layout in [KvBlockLayout::OperationalNHD, KvBlockLayout::UniversalTP] {
            assert_eq!(
                select_transform_kernel(layout, layout),
                TransformKernel::None
            );
        }
    }

    /// Operational -> universal keeps track of the source layout.
    #[test]
    fn test_select_transform_kernel_block_to_universal() {
        for src_layout in [KvBlockLayout::OperationalNHD, KvBlockLayout::OperationalHND] {
            assert_eq!(
                select_transform_kernel(src_layout, KvBlockLayout::UniversalTP),
                TransformKernel::BlockToUniversal { src_layout }
            );
        }
    }

    /// Universal -> operational keeps track of the destination layout.
    #[test]
    fn test_select_transform_kernel_universal_to_block() {
        for dst_layout in [KvBlockLayout::OperationalNHD, KvBlockLayout::OperationalHND] {
            assert_eq!(
                select_transform_kernel(KvBlockLayout::UniversalTP, dst_layout),
                TransformKernel::UniversalToBlock { dst_layout }
            );
        }
    }

    /// NHD <-> HND is a transpose in both directions.
    #[test]
    fn test_select_transform_kernel_operational_transpose() {
        let directions = [
            (KvBlockLayout::OperationalNHD, KvBlockLayout::OperationalHND),
            (KvBlockLayout::OperationalHND, KvBlockLayout::OperationalNHD),
        ];
        for (from, to) in directions {
            assert_eq!(
                select_transform_kernel(from, to),
                TransformKernel::OperationalTranspose
            );
        }
    }

    /// Unknown on either side is always unsupported.
    #[test]
    fn test_select_transform_kernel_unknown_unsupported() {
        let pairs = [
            (KvBlockLayout::Unknown, KvBlockLayout::OperationalNHD),
            (KvBlockLayout::OperationalNHD, KvBlockLayout::Unknown),
        ];
        for (from, to) in pairs {
            assert_eq!(
                select_transform_kernel(from, to),
                TransformKernel::Unsupported
            );
        }
    }

    /// Custom layouts are unsupported (for now).
    #[test]
    fn test_select_transform_kernel_custom_unsupported() {
        use crate::layout::BlockDim;

        let custom = KvBlockLayout::Custom([
            BlockDim::Head,
            BlockDim::Layer,
            BlockDim::Outer,
            BlockDim::Page,
        ]);
        assert_eq!(
            select_transform_kernel(custom, KvBlockLayout::OperationalNHD),
            TransformKernel::Unsupported
        );
    }
}
lib/kvbm-physical/src/transfer/executor/nixl.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Typestate builder for NIXL transfers.
//!
//! This module provides a compile-time safe builder for NIXL transfers that ensures
//! all required parameters are set before execution.
use
super
::{
PhysicalLayout
,
TransferContext
,
TransferStrategy
};
use
crate
::
BlockId
;
use
crate
::
transfer
::
context
::
TransferCompleteNotification
;
use
crate
::
transfer
::{
can_use_whole_block_transfer
,
validate_layout_compatibility
};
use
anyhow
::{
Result
,
anyhow
};
use
dynamo_memory
::
nixl
::{
XferDescList
,
XferOp
};
use
std
::
marker
::
PhantomData
;
use
std
::
ops
::
Range
;
/// Marker type for unset builder fields.
///
/// Zero-sized; it only ever appears as a type parameter of the typestate builder.
#[derive(Debug, Clone, Copy, Default)]
pub struct Unset;

/// Marker type for set builder fields.
///
/// Zero-sized; it only ever appears as a type parameter of the typestate builder.
#[derive(Debug, Clone, Copy, Default)]
pub struct Set;
/// Typestate builder for NIXL transfers.
///
/// This builder uses the typestate pattern to ensure all required parameters are set
/// at compile time. The type parameters track which fields have been set:
/// - `TSrc`: Source layout state
/// - `TDst`: Destination layout state
/// - `TSrcBlocks`: Source block IDs state
/// - `TDstBlocks`: Destination block IDs state
/// - `TStrategy`: Transfer strategy state
pub
struct
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
{
src
:
Option
<&
'a
PhysicalLayout
>
,
dst
:
Option
<&
'a
PhysicalLayout
>
,
src_block_ids
:
Option
<&
'a
[
BlockId
]
>
,
dst_block_ids
:
Option
<&
'a
[
BlockId
]
>
,
strategy
:
Option
<
TransferStrategy
>
,
layer_range
:
Option
<
Range
<
usize
>>
,
write_notif
:
Option
<
uuid
::
Uuid
>
,
_
phantom
:
PhantomData
<
(
TSrc
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
)
>
,
}
impl
<
'a
>
NixlTransferBuilder
<
'a
,
Unset
,
Unset
,
Unset
,
Unset
,
Unset
>
{
/// Creates a new NIXL transfer builder with all fields unset.
pub
fn
new
()
->
Self
{
Self
{
src
:
None
,
dst
:
None
,
src_block_ids
:
None
,
dst_block_ids
:
None
,
strategy
:
None
,
layer_range
:
None
,
write_notif
:
None
,
_
phantom
:
PhantomData
,
}
}
}
impl
<
'a
>
Default
for
NixlTransferBuilder
<
'a
,
Unset
,
Unset
,
Unset
,
Unset
,
Unset
>
{
fn
default
()
->
Self
{
Self
::
new
()
}
}
// Required field setters - these consume self and return a new builder with the field marked as Set
impl
<
'a
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
NixlTransferBuilder
<
'a
,
Unset
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
{
/// Sets the source physical layout.
pub
fn
src
(
self
,
src
:
&
'a
PhysicalLayout
,
)
->
NixlTransferBuilder
<
'a
,
Set
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
{
NixlTransferBuilder
{
src
:
Some
(
src
),
dst
:
self
.dst
,
src_block_ids
:
self
.src_block_ids
,
dst_block_ids
:
self
.dst_block_ids
,
strategy
:
self
.strategy
,
layer_range
:
self
.layer_range
,
write_notif
:
self
.write_notif
,
_
phantom
:
PhantomData
,
}
}
}
impl
<
'a
,
TSrc
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
NixlTransferBuilder
<
'a
,
TSrc
,
Unset
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
{
/// Sets the destination physical layout.
pub
fn
dst
(
self
,
dst
:
&
'a
PhysicalLayout
,
)
->
NixlTransferBuilder
<
'a
,
TSrc
,
Set
,
TSrcBlocks
,
TDstBlocks
,
TStrategy
>
{
NixlTransferBuilder
{
src
:
self
.src
,
dst
:
Some
(
dst
),
src_block_ids
:
self
.src_block_ids
,
dst_block_ids
:
self
.dst_block_ids
,
strategy
:
self
.strategy
,
layer_range
:
self
.layer_range
,
write_notif
:
self
.write_notif
,
_
phantom
:
PhantomData
,
}
}
}
impl
<
'a
,
TSrc
,
TDst
,
TDstBlocks
,
TStrategy
>
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
Unset
,
TDstBlocks
,
TStrategy
>
{
/// Sets the source block IDs to transfer.
pub
fn
src_blocks
(
self
,
src_block_ids
:
&
'a
[
BlockId
],
)
->
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
Set
,
TDstBlocks
,
TStrategy
>
{
NixlTransferBuilder
{
src
:
self
.src
,
dst
:
self
.dst
,
src_block_ids
:
Some
(
src_block_ids
),
dst_block_ids
:
self
.dst_block_ids
,
strategy
:
self
.strategy
,
layer_range
:
self
.layer_range
,
write_notif
:
self
.write_notif
,
_
phantom
:
PhantomData
,
}
}
}
impl
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
TStrategy
>
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
Unset
,
TStrategy
>
{
/// Sets the destination block IDs to transfer.
pub
fn
dst_blocks
(
self
,
dst_block_ids
:
&
'a
[
BlockId
],
)
->
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
Set
,
TStrategy
>
{
NixlTransferBuilder
{
src
:
self
.src
,
dst
:
self
.dst
,
src_block_ids
:
self
.src_block_ids
,
dst_block_ids
:
Some
(
dst_block_ids
),
strategy
:
self
.strategy
,
layer_range
:
self
.layer_range
,
write_notif
:
self
.write_notif
,
_
phantom
:
PhantomData
,
}
}
}
impl
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
TDstBlocks
>
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
Unset
>
{
/// Sets the NIXL transfer strategy (Read or Write).
pub
fn
strategy
(
self
,
strategy
:
TransferStrategy
,
)
->
NixlTransferBuilder
<
'a
,
TSrc
,
TDst
,
TSrcBlocks
,
TDstBlocks
,
Set
>
{
NixlTransferBuilder
{
src
:
self
.src
,
dst
:
self
.dst
,
src_block_ids
:
self
.src_block_ids
,
dst_block_ids
:
self
.dst_block_ids
,
strategy
:
Some
(
strategy
),
layer_range
:
self
.layer_range
,
write_notif
:
self
.write_notif
,
_
phantom
:
PhantomData
,
}
}
}
// Optional field setters: available in every typestate and leave it unchanged.
impl<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy>
    NixlTransferBuilder<'a, TSrc, TDst, TSrcBlocks, TDstBlocks, TStrategy>
{
    /// Sets an optional range of layers to transfer.
    /// If not called, all layers will be transferred.
    pub fn layer_range(self, layer_range: Range<usize>) -> Self {
        Self {
            layer_range: Some(layer_range),
            ..self
        }
    }

    /// Sets an optional write notification UUID.
    #[expect(dead_code)]
    pub fn write_notif(self, write_notif: uuid::Uuid) -> Self {
        Self {
            write_notif: Some(write_notif),
            ..self
        }
    }
}
// `execute` exists only on the fully configured builder (all five parameters `Set`).
impl<'a> NixlTransferBuilder<'a, Set, Set, Set, Set, Set> {
    /// Builds the NIXL descriptor lists for the configured blocks and posts the transfer.
    ///
    /// Callable only after every required setter has run, which the typestate
    /// parameters enforce at compile time.
    ///
    /// Returns a notification registered for status polling when NIXL reports the
    /// request as still pending, or an already completed one otherwise.
    ///
    /// # Errors
    /// Fails when the layouts disagree on layer count or outer dimension, when
    /// `validate_layout_compatibility` rejects the pair, when `strategy` is not a
    /// NIXL strategy, when a memory region lookup fails or paired regions differ in
    /// size, or when descriptor-list creation, request creation or posting fails.
    ///
    /// # Panics
    /// Panics if the local NIXL agent does not own the source of a write or the
    /// destination of a read.
    pub(crate) fn execute(self, ctx: &TransferContext) -> Result<TransferCompleteNotification> {
        let Self {
            src,
            dst,
            src_block_ids,
            dst_block_ids,
            strategy,
            layer_range,
            write_notif: _write_notif,
            ..
        } = self;

        // The typestate guarantees every required field was populated.
        let src = src.unwrap();
        let dst = dst.unwrap();
        let src_block_ids = src_block_ids.unwrap();
        let dst_block_ids = dst_block_ids.unwrap();
        let strategy = strategy.unwrap();

        // Both layouts must agree on their layer count and outer dimension.
        let src_layout = src.layout();
        let dst_layout = dst.layout();

        if src_layout.num_layers() != dst_layout.num_layers() {
            return Err(anyhow!(
                "Layouts have incompatible layer counts: src={}, dst={}",
                src_layout.num_layers(),
                dst_layout.num_layers()
            ));
        }
        if src_layout.outer_dim() != dst_layout.outer_dim() {
            return Err(anyhow!(
                "Layouts have incompatible outer dimensions: src={}, dst={}",
                src_layout.outer_dim(),
                dst_layout.outer_dim()
            ));
        }

        // Errors if a layout transform would be needed.
        validate_layout_compatibility(src, dst)?;

        let nixl_agent = ctx.nixl_agent();

        // No explicit range means every layer of the source layout.
        let layers = layer_range.clone().unwrap_or(0..src_layout.num_layers());

        // Whole-block descriptors are used whenever the layouts allow it.
        let use_whole_block = can_use_whole_block_transfer(src, dst, layer_range.as_ref());

        // The flipped strategies map onto the same NIXL operation as the plain ones.
        let xfer_op = match strategy {
            TransferStrategy::NixlRead | TransferStrategy::NixlReadFlipped => XferOp::Read,
            TransferStrategy::NixlWrite | TransferStrategy::NixlWriteFlipped => XferOp::Write,
            _ => return Err(anyhow!("Invalid NIXL transfer strategy: {:?}", strategy)),
        };

        // Locality constraints per operation type:
        // - Write (push): the source must be local, data goes FROM local TO remote
        // - Read (pull): the destination must be local, data comes FROM remote INTO local
        let src_is_local = nixl_agent.name() == src.nixl_metadata().agent_name();
        let dst_is_local = nixl_agent.name() == dst.nixl_metadata().agent_name();

        // Invariant checks: a violation means a bug in `select_strategy`, not a
        // user error, because strategy selection guarantees these constraints.
        match xfer_op {
            XferOp::Write => {
                assert!(
                    src_is_local,
                    "For NIXL Write (push), the source must be local. src_agent='{}', local_agent='{}'",
                    src.nixl_metadata().agent_name(),
                    nixl_agent.name()
                );
            }
            XferOp::Read => {
                assert!(
                    dst_is_local,
                    "For NIXL Read (pull), the destination must be local. dst_agent='{}', local_agent='{}'",
                    dst.nixl_metadata().agent_name(),
                    nixl_agent.name()
                );
            }
        }

        // NIXL metadata supplies the memory type and device ID for each side.
        let src_metadata = src.nixl_metadata();
        let dst_metadata = dst.nixl_metadata();
        let src_device_id = src_metadata.device_id();
        let dst_device_id = dst_metadata.device_id();

        let mut src_dl = XferDescList::new(src_metadata.mem_type())?;
        let mut dst_dl = XferDescList::new(dst_metadata.mem_type())?;

        if use_whole_block {
            // One descriptor per block, starting at the block's first region.
            let bytes_per_block = src_layout.config().bytes_per_block();

            tracing::debug!(
                num_blocks = src_block_ids.len(),
                bytes_per_block,
                "Building whole-block NIXL descriptors"
            );

            for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
                let src_start = src.memory_region(src_block_id, 0, 0)?;
                let dst_start = dst.memory_region(dst_block_id, 0, 0)?;

                src_dl.add_desc(src_start.addr(), bytes_per_block, src_device_id);
                dst_dl.add_desc(dst_start.addr(), bytes_per_block, dst_device_id);
            }
        } else {
            // One descriptor per (block, layer, outer) region.
            tracing::debug!(
                num_blocks = src_block_ids.len(),
                layer_range = ?layers,
                src_fc = src_layout.is_fully_contiguous(),
                dst_fc = dst_layout.is_fully_contiguous(),
                "Building layer-wise NIXL descriptors"
            );

            for (&src_block_id, &dst_block_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
                for layer_id in layers.clone() {
                    for outer_id in 0..src_layout.outer_dim() {
                        let src_region = src.memory_region(src_block_id, layer_id, outer_id)?;
                        let dst_region = dst.memory_region(dst_block_id, layer_id, outer_id)?;

                        if src_region.size() != dst_region.size() {
                            return Err(anyhow!(
                                "Size mismatch at block=({},{}), layer={}, outer={}: src={}, dst={}",
                                src_block_id,
                                dst_block_id,
                                layer_id,
                                outer_id,
                                src_region.size(),
                                dst_region.size()
                            ));
                        }

                        src_dl.add_desc(src_region.addr(), src_region.size(), src_device_id);
                        dst_dl.add_desc(dst_region.addr(), dst_region.size(), dst_device_id);
                    }
                }
            }
        }

        // Note: Overlap detection was removed from nixl-sys 0.6.1
        // The NIXL library now handles overlap detection internally

        // Flipped strategies hand the two descriptor lists to NIXL in swapped order.
        let flipped = matches!(
            strategy,
            TransferStrategy::NixlReadFlipped | TransferStrategy::NixlWriteFlipped
        );
        if flipped {
            std::mem::swap(&mut src_dl, &mut dst_dl);
        }

        // The remote agent depends on operation type:
        // - Write (push): remote is the destination
        // - Read (pull): remote is the source
        let remote_agent = match xfer_op {
            XferOp::Write => dst_metadata.agent_name(),
            XferOp::Read => src_metadata.agent_name(),
        };

        let xfer_req = nixl_agent.create_xfer_req(
            xfer_op,
            &src_dl,
            &dst_dl,
            remote_agent,
            None, // opt_args
        )?;

        // Note: Notification handling via OptArgs can be added later if needed
        let still_pending = nixl_agent.post_xfer_req(&xfer_req, None)?;

        Ok(if still_pending {
            // Completion is observed asynchronously via status polling.
            ctx.register_nixl_status(xfer_req)
        } else {
            // The transfer already finished while being posted.
            TransferCompleteNotification::completed()
        })
    }
}
lib/kvbm-physical/src/transfer/fill.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Block filling operations for testing.
//!
//! This module provides utilities to populate blocks with specific patterns
//! for verification in round-trip tests.
use
crate
::
BlockId
;
use
super
::
PhysicalLayout
;
use
aligned_vec
::{
AVec
,
avec
};
use
anyhow
::{
Result
,
anyhow
};
use
cudarc
::
runtime
::
sys
::{
cudaMemcpy
,
cudaMemcpyKind
};
use
dynamo_memory
::
StorageKind
;
use
std
::{
fs
::
File
,
io
::{
Seek
,
Write
},
mem
::
ManuallyDrop
,
ops
::
Range
,
os
::
fd
::
FromRawFd
,
};
/// How `fill_blocks` populates block memory.
#[derive(Debug, Clone, Copy)]
pub enum FillPattern {
    /// Every byte is set to the wrapped value.
    Constant(u8),
    /// Sequential pattern: block_id + layer_id + offset % 256.
    Sequential,
}
/// Fill blocks in a physical layout with a specific pattern.
///
/// Every `(layer, outer)` memory region of each requested block is written.
/// How the bytes are delivered depends on [`PhysicalLayout::location`]:
/// - `System` / `Pinned`: written in place through the region's address.
/// - `Device`: the pattern is built in a host staging buffer and copied with
///   `cudaMemcpy` (host to device).
/// - `Disk`: the pattern is built in a 4096-byte-aligned staging buffer and
///   written to the backing file descriptor at the region's offset, then
///   flushed and synced.
///
/// This operation directly writes to memory and should only be used on
/// local layouts. Note that this function performs no explicit "is remote"
/// check of its own; it relies on `memory_region` / `location` for that.
///
/// # Arguments
/// * `layout` - The physical layout containing the blocks
/// * `block_ids` - List of block IDs to fill
/// * `pattern` - Fill pattern to use
///
/// # Errors
/// Returns an error if:
/// - A block ID is `>= num_blocks` (blocks listed before it have already been
///   filled at that point)
/// - Looking up a memory region fails
/// - The host-to-device copy fails
/// - Seeking, writing, flushing or syncing the backing file fails
pub fn fill_blocks(
    layout: &PhysicalLayout,
    block_ids: &[BlockId],
    pattern: FillPattern,
) -> Result<()> {
    let config = layout.layout().config();
    let num_layers = config.num_layers;
    let outer_dim = config.outer_dim;

    for &block_id in block_ids {
        if block_id >= config.num_blocks as BlockId {
            return Err(anyhow!("Block ID {} out of range", block_id));
        }

        // Fill all layers and outer dimensions for this block
        for layer_id in 0..num_layers {
            for outer_id in 0..outer_dim {
                let region = layout.memory_region(block_id, layer_id, outer_id)?;

                match layout.location() {
                    StorageKind::System | StorageKind::Pinned => {
                        // Host-addressable memory: write the pattern in place.
                        fill_memory_region(
                            region.addr(),
                            region.size(),
                            block_id,
                            layer_id,
                            pattern,
                        )?;
                    }
                    StorageKind::Device(_) => {
                        // Build the pattern on the host first. The buffer must be
                        // `mut` and accessed through `as_mut_ptr()`: writing through
                        // a pointer obtained from `as_ptr()` is undefined behaviour.
                        let mut system_region: Vec<u8> = vec![0; region.size()];
                        fill_memory_region(
                            system_region.as_mut_ptr() as usize,
                            system_region.len(),
                            block_id,
                            layer_id,
                            pattern,
                        )?;

                        // SAFETY: the source is a live host buffer of exactly
                        // `region.size()` bytes; the destination address and size
                        // come from the layout's own region for a device layout.
                        let err = unsafe {
                            cudaMemcpy(
                                region.addr() as *mut std::ffi::c_void,
                                system_region.as_ptr() as *const std::ffi::c_void,
                                region.size(),
                                cudaMemcpyKind::cudaMemcpyHostToDevice,
                            )
                        };
                        if err != cudarc::runtime::sys::cudaError::cudaSuccess {
                            return Err(anyhow!("cudaMemcpy H2D failed in fill_blocks: {:?}", err));
                        }
                    }
                    StorageKind::Disk(fd) => {
                        // 4096-byte aligned staging buffer, mutable for the same
                        // reason as the device branch above.
                        let mut system_region: AVec<u8, _> = avec![[4096]| 0; region.size()];
                        fill_memory_region(
                            system_region.as_mut_ptr() as usize,
                            system_region.len(),
                            block_id,
                            layer_id,
                            pattern,
                        )?;

                        // The layout owns the descriptor; `ManuallyDrop` keeps this
                        // temporary `File` from closing it when it goes out of scope.
                        // SAFETY: `fd` is the descriptor the layout reports for its
                        // disk storage and is only borrowed for these writes.
                        let mut file = ManuallyDrop::new(unsafe { File::from_raw_fd(fd as i32) });

                        // For disk layouts the region "address" is used as the byte
                        // offset within the backing file.
                        file.seek(std::io::SeekFrom::Start(region.addr() as u64))?;
                        file.write_all(&system_region)?;
                        // Flush user-space state first, then push the data to the device.
                        file.flush()?;
                        file.sync_all()?;
                    }
                }
            }
        }
    }

    Ok(())
}
/// Fill a subset of layers in blocks with a specific pattern.
///
/// Only host-accessible layouts (`System` / `Pinned`) can be filled this way;
/// the storage kind is checked per visited region, so a call that visits no
/// region (empty `block_ids`, empty `layer_range`, or zero outer dimension)
/// succeeds regardless of where the layout lives.
///
/// # Arguments
/// * `layout` - The physical layout containing the blocks
/// * `block_ids` - List of block IDs to fill
/// * `layer_range` - Range of layers to fill
/// * `pattern` - Fill pattern to use
///
/// # Errors
/// Returns an error if:
/// - `layer_range.end` exceeds the layout's layer count (checked before any write)
/// - A block ID is `>= num_blocks` (earlier blocks have already been filled)
/// - Looking up a memory region fails
/// - A visited region belongs to `Device` or `Disk` storage
pub fn fill_layers(
    layout: &PhysicalLayout,
    block_ids: &[usize],
    layer_range: Range<usize>,
    pattern: FillPattern,
) -> Result<()> {
    let cfg = layout.layout().config();
    let layer_count = cfg.num_layers;
    let outer_count = cfg.outer_dim;

    // Reject an over-long layer range up front, before touching any memory.
    if layer_range.end > layer_count {
        return Err(anyhow!(
            "Layer range {:?} exceeds num_layers {}",
            layer_range,
            layer_count
        ));
    }

    for &block in block_ids {
        if block >= cfg.num_blocks {
            return Err(anyhow!("Block ID {} out of range", block));
        }

        // Visit the requested layers, and every outer dimension within each layer.
        let regions = layer_range
            .clone()
            .flat_map(|layer| (0..outer_count).map(move |outer| (layer, outer)));

        for (layer, outer) in regions {
            let region = layout.memory_region(block, layer, outer)?;

            let host_accessible = matches!(
                layout.location(),
                StorageKind::System | StorageKind::Pinned
            );
            if !host_accessible {
                return Err(anyhow!(
                    "fill_layers only supports host-accessible storage (System/Pinned)"
                ));
            }

            fill_memory_region(region.addr(), region.size(), block, layer, pattern)?;
        }
    }

    Ok(())
}
/// Fill a memory region with the specified pattern.
///
/// # Safety
/// This function performs unsafe memory writes. The caller must ensure:
/// - The memory region is valid and accessible
/// - No other references exist to this memory
fn
fill_memory_region
(
addr
:
usize
,
size
:
usize
,
block_id
:
BlockId
,
layer_id
:
usize
,
pattern
:
FillPattern
,
)
->
Result
<
()
>
{
unsafe
{
let
ptr
=
addr
as
*
mut
u8
;
match
pattern
{
FillPattern
::
Constant
(
value
)
=>
{
std
::
ptr
::
write_bytes
(
ptr
,
value
,
size
);
}
FillPattern
::
Sequential
=>
{
for
offset
in
0
..
size
{
let
value
=
((
block_id
+
layer_id
+
offset
)
%
256
)
as
u8
;
ptr
.add
(
offset
)
.write
(
value
);
}
}
}
}
Ok
(())
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::super::tests::*;
    use super::*;

    /// A constant fill must reach every byte of every region of the filled blocks.
    #[test]
    fn test_fill_blocks_constant() {
        let physical = builder(2)
            .fully_contiguous()
            .allocate_system()
            .build()
            .unwrap();

        fill_blocks(&physical, &[0, 1], FillPattern::Constant(42)).unwrap();

        // Verify all bytes are set to 42, in every (block, layer, outer) region.
        let config = physical.layout().config();
        for block_id in 0..2 {
            for layer_id in 0..config.num_layers {
                for outer_id in 0..config.outer_dim {
                    let region = physical.memory_region(block_id, layer_id, outer_id).unwrap();
                    let bytes = unsafe { region.as_slice().unwrap() };
                    assert!(
                        bytes.iter().all(|&b| b == 42),
                        "region (block {}, layer {}, outer {}) not fully filled",
                        block_id,
                        layer_id,
                        outer_id
                    );
                }
            }
        }
    }

    /// A sequential fill must follow `(block_id + layer_id + offset) % 256`.
    #[test]
    fn test_fill_blocks_sequential() {
        let physical = builder(2)
            .fully_contiguous()
            .allocate_system()
            .build()
            .unwrap();

        fill_blocks(&physical, &[0, 1], FillPattern::Sequential).unwrap();

        // (block, layer) pairs to inspect; the first byte is block + layer.
        let cases: [(usize, usize); 2] = [(0, 0), (1, 1)];
        for (block_id, layer_id) in cases {
            let mr = physical.memory_region(block_id, layer_id, 0).unwrap();
            let mr_slice = unsafe { mr.as_slice().unwrap() };

            assert_eq!(mr_slice[0], (block_id + layer_id) as u8);
            assert_eq!(mr_slice[1], mr_slice[0].wrapping_add(1));

            // Check the whole region against the documented formula.
            for (offset, &byte) in mr_slice.iter().enumerate() {
                let expected = ((block_id + layer_id + offset) % 256) as u8;
                assert_eq!(
                    byte, expected,
                    "unexpected byte at offset {} of block {}, layer {}",
                    offset, block_id, layer_id
                );
            }
        }
    }

    /// Filling single layers must not disturb the other layers or blocks.
    #[test]
    fn test_fill_layers() {
        let physical = builder(2)
            .fully_contiguous()
            .allocate_system()
            .build()
            .unwrap();

        // (block, layer, constant) triples: each layer of each block gets its own value.
        let cases: [(usize, usize, u8); 4] = [(0, 0, 0), (0, 1, 1), (1, 0, 100), (1, 1, 101)];

        // Perform every fill first, so that a later fill clobbering an earlier
        // one would be caught by the checks below.
        for &(block_id, layer_id, value) in &cases {
            fill_layers(
                &physical,
                &[block_id],
                layer_id..layer_id + 1,
                FillPattern::Constant(value),
            )
            .unwrap();
        }

        for &(block_id, layer_id, value) in &cases {
            let first_byte = unsafe {
                physical
                    .memory_region(block_id, layer_id, 0)
                    .unwrap()
                    .as_slice()
                    .unwrap()[0]
            };
            assert_eq!(first_byte, value);
        }
    }
}
lib/kvbm-physical/src/transfer/mod.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Transfer module for copying blocks between layouts with different storage locations.
//!
//! This module provides functionality for transferring KV cache blocks between layouts
//! that may be backed by different storage types (GPU memory, pinned host memory, disk, etc.)
//! and potentially across NIXL-connected remote nodes.
//!
//! # Core Concepts
//!
//! - [`PhysicalLayout`]: Wraps a layout with its physical storage location and NIXL metadata
//! - [`LayoutDescriptor`]: Serializable representation for cross-node communication
//! - Transfer strategies: memcpy, CUDA, NIXL based on source/destination locations
//! - Block-wise and layer-wise transfer operations
//!
//! # Usage
//!
//! ```rust,ignore
//! use dynamo_kvbm::v2::transfer::{PhysicalLayout, transfer_blocks};
//!
//! // Create local physical layout with NIXL registration
//! let src = PhysicalLayout::new_local(src_layout, StorageKind::Device(0))
//! .with_nixl_registration("local_agent".to_string())?;
//!
//! // Create remote physical layout
//! let dst = PhysicalLayout::new_remote(
//! dst_layout,
//! StorageKind::Pinned,
//! "remote_agent".to_string()
//! );
//!
//! // Transfer blocks from local to remote
//! let src_block_ids = [0, 1, 2];
//! let dst_block_ids = [0, 1, 2];
//! let future = transfer_blocks(&src, &dst, &src_block_ids, &dst_block_ids, &ctx)?;
//! future.await?;
//! ```
pub
(
crate
)
mod
capabilities
;
pub
(
crate
)
mod
checksum
;
pub
mod
context
;
pub
(
crate
)
mod
executor
;
pub
(
crate
)
mod
fill
;
pub
(
crate
)
mod
notifications
;
pub
(
crate
)
mod
options
;
pub
(
crate
)
mod
preferences
;
pub
(
crate
)
mod
strategy
;
pub
(
crate
)
mod
validation
;
#[cfg(all(test,
feature
=
"testing-kvbm"
))]
mod
tests
;
// Re-export StorageKind
pub
use
dynamo_memory
::
StorageKind
;
pub
use
capabilities
::
TransferCapabilities
;
pub
use
checksum
::{
BlockChecksum
,
compute_block_checksums
,
compute_layer_checksums
};
pub
use
context
::{
TransferCompleteNotification
,
TransferConfig
};
pub
use
dynamo_memory
::
nixl
::
NixlAgent
;
pub
use
fill
::{
FillPattern
,
fill_blocks
,
fill_layers
};
pub
use
options
::{
TransferOptions
,
TransferOptionsBuilder
};
// TransferContext - managed by TransferManager
#[doc(hidden)]
pub
use
context
::
TransferContext
;
use
crate
::
BlockId
;
pub
use
crate
::
layout
::
PhysicalLayout
;
// Re-export manager types - TransferManager is the primary public API
pub
use
crate
::
manager
::{
LayoutHandle
,
SerializedLayout
,
TransferManager
,
WorkerAddress
};
// #[cfg(test)]
// pub use testing::{RoundTripTest, RoundTripTestResult};
// /// Specification for bounce buffer in multi-hop transfers.
// ///
// /// This structure provides the layout and block IDs to use as an intermediate
// /// staging area when direct transfers are not allowed.
// #[deprecated(since = "2025-11-25", note = "use TransferOptions instead")]
// pub trait BounceBufferSpec: Send + Sync {
// fn layout(&self) -> &PhysicalLayout;
// fn block_ids(&self) -> &[BlockId];
// }
/// Bounce-buffer description expressed in terms of a [`LayoutHandle`].
///
/// Pairs a layout handle with a list of block IDs.
// NOTE(review): presumably these blocks act as the intermediate staging area for
// multi-hop transfers, as the retired `BounceBufferSpec` notes suggest — confirm.
#[derive(Clone)]
pub struct BounceBuffer {
    /// Handle of the layout the block IDs refer to.
    layout: LayoutHandle,
    /// Block IDs associated with `layout`.
    block_ids: Vec<BlockId>,
}
/// Bounce-buffer description holding a concrete [`PhysicalLayout`] rather than
/// a [`LayoutHandle`].
// NOTE(review): presumably the resolved counterpart of `BounceBuffer`, produced
// once a handle has been looked up — confirm against the call sites.
#[derive(Clone)]
pub struct BounceBufferInternal {
    /// Physical layout the block IDs refer to.
    layout: PhysicalLayout,
    /// Block IDs associated with `layout`.
    block_ids: Vec<BlockId>,
}
impl
BounceBuffer
{
pub
fn
from_handle
(
layout
:
LayoutHandle
,
block_ids
:
Vec
<
BlockId
>
)
->
Self
{
Self
{
layout
,
block_ids
}
}
#[doc(hidden)]
pub
fn
into_parts
(
self
)
->
(
LayoutHandle
,
Vec
<
BlockId
>
)
{
(
self
.layout
,
self
.block_ids
)
}
}
impl
BounceBufferInternal
{
pub
fn
from_layout
(
layout
:
PhysicalLayout
,
block_ids
:
Vec
<
BlockId
>
)
->
Self
{
Self
{
layout
,
block_ids
}
}
}
// ============================================================================
// Layout Compatibility Helpers
// ============================================================================
use
anyhow
::
anyhow
;
use
std
::
ops
::
Range
;
/// Validate that layouts are compatible for transfer.
///
/// Compares the block layouts of `src` and `dst` via `requires_transform`.
/// Transfers that would need a layout transformation are not supported yet,
/// so this is meant to be called early in transfer execution to fail fast.
///
/// # Errors
/// Returns an error naming both block layouts when a transformation would be
/// required.
pub(crate) fn validate_layout_compatibility(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
) -> anyhow::Result<()> {
    let (from, to) = (src.layout(), dst.layout());

    let needs_transform = from.block_layout().requires_transform(&to.block_layout());
    if !needs_transform {
        return Ok(());
    }

    Err(anyhow!(
        "Layout transformation not supported: src={:?}, dst={:?}",
        from.block_layout(),
        to.block_layout()
    ))
}
/// Check if layouts support whole-block transfers.
///
/// Returns `true` only when both conditions hold:
/// - The transfer is full-block: `layer_range` is `None`, or it starts at 0 and
///   ends at the *source* layout's layer count.
/// - Both `src` and `dst` are fully contiguous.
///
/// Note: Caller must have already validated layout compatibility via
/// [`validate_layout_compatibility`].
pub(crate) fn can_use_whole_block_transfer(
    src: &PhysicalLayout,
    dst: &PhysicalLayout,
    layer_range: Option<&Range<usize>>,
) -> bool {
    // A missing range means "all layers"; an explicit one must span them exactly.
    let covers_all_layers = layer_range.map_or(true, |layers| {
        layers.start == 0 && layers.end == src.layout().num_layers()
    });

    // Short-circuits in the same order as the checks are listed above.
    covers_all_layers
        && src.layout().is_fully_contiguous()
        && dst.layout().is_fully_contiguous()
}
lib/kvbm-physical/src/transfer/notifications/cuda_event.rs
0 → 100644
View file @
9ab148dc
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! CUDA event polling-based completion checker.
use
anyhow
::
Result
;
use
cudarc
::
driver
::{
CudaEvent
,
DriverError
,
result
as
cuda_result
,
sys
::
CUresult
};
use
super
::
CompletionChecker
;
/// Completion checker that polls CUDA event status.
///
/// Owns the [`CudaEvent`] it was constructed with and queries it each time
/// completion is checked.
pub struct CudaEventChecker {
    /// Event whose status is queried.
    event: CudaEvent,
}
impl
CudaEventChecker
{
pub
fn
new
(
event
:
CudaEvent
)
->
Self
{
Self
{
event
}
}
}
impl CompletionChecker for CudaEventChecker {
    /// Poll the wrapped event once, without blocking.
    ///
    /// Returns `Ok(true)` when the query succeeds, `Ok(false)` when the driver
    /// reports `CUDA_ERROR_NOT_READY`, and an error for any other driver result.
    fn is_complete(&self) -> Result<bool> {
        // SAFETY: the event handle comes from the `CudaEvent` owned by `self`,
        // which is alive for the duration of this call.
        let status = unsafe { cuda_result::event::query(self.event.cu_event()) };

        match status {
            // Work captured by the event has not finished yet.
            Err(DriverError(CUresult::CUDA_ERROR_NOT_READY)) => Ok(false),
            // Any other driver error is a genuine failure.
            Err(e) => Err(anyhow::anyhow!("CUDA event query failed: {:?}", e)),
            // Event is complete.
            Ok(()) => Ok(true),
        }
    }
}
#[cfg(all(test,
feature
=
"testing-kvbm"
))]
mod
tests
{
use
crate
::
manager
::
TransferManager
;
use
crate
::
transfer
::
tests
::
CudaSleep
;
use
dynamo_memory
::
nixl
::
NixlAgent
;
use
std
::
time
::{
Duration
,
Instant
};
#[tokio::test]
async
fn
test_cuda_event_delayed_notification
()
{
let
agent
=
NixlAgent
::
new
(
"test_agent"
)
.unwrap
();
let
manager
=
TransferManager
::
builder
()
.cuda_device_id
(
0
)
.nixl_agent
(
agent
)
.build
()
.unwrap
();
let
stream
=
manager
.h2d_stream
();
let
cuda_ctx
=
manager
.cuda_context
();
// Get or create the CudaSleep utility (compiles kernel and calibrates on first use)
let
cuda_sleep
=
CudaSleep
::
for_context
(
cuda_ctx
)
.unwrap
();
// Test 1: Launch sleep and wait via async notification
let
t0_queue_start
=
Instant
::
now
();
cuda_sleep
.launch
(
Duration
::
from_millis
(
600
),
stream
)
.unwrap
();
let
queue_time
=
t0_queue_start
.elapsed
();
let
event
=
stream
.record_event
(
None
)
.unwrap
();
let
notification
=
manager
.register_cuda_event
(
event
);
notification
.await
.unwrap
();
let
wait_time
=
t0_queue_start
.elapsed
()
-
queue_time
;
println!
(
"GPU sleep test: queue {:?}, wait {:?}"
,
queue_time
,
wait_time
);
assert
!
(
queue_time
<
Duration
::
from_millis
(
10
),
"launching the sleep kernel should be fast: {:?}"
,
queue_time
);
assert
!
(
wait_time
>=
Duration
::
from_millis
(
500
),
"wait time should reflect >=500ms of GPU work: {:?}"
,
wait_time
);
}
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment