"docs/mocker/mocker.md" did not exist on "39d645e58647d6adb074650e46be5de25f3f3bc6"
Unverified Commit 9ab148dc authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: kvbm-physical (#6490)


Signed-off-by: Ryan Olson <rolson@nvidia.com>
parent 7546c193
......@@ -154,6 +154,8 @@ jobs:
cargo fmt -- --check && \
cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
cargo clippy -p kvbm-physical --no-deps --all-targets -- -D warnings && \
cargo test --locked -p kvbm-physical --features testing-kvbm -- --nocapture --test-threads=4 && \
/workspace/container/use-sccache.sh show-stats "Rust Checks"'
test-parallel:
......
......@@ -3877,6 +3877,14 @@ dependencies = [
"tracing",
]
[[package]]
name = "kvbm-common"
version = "1.0.0"
dependencies = [
"dynamo-tokens",
"serde",
]
[[package]]
name = "kvbm-kernels"
version = "1.0.0"
......@@ -3894,7 +3902,6 @@ version = "1.0.0"
dependencies = [
"anyhow",
"async-stream",
"bincode 2.0.1",
"bytes",
"derive_builder",
"dynamo-tokens",
......@@ -3914,6 +3921,33 @@ dependencies = [
"xxhash-rust",
]
[[package]]
name = "kvbm-physical"
version = "1.0.0"
dependencies = [
"aligned-vec",
"anyhow",
"bincode 2.0.1",
"blake3",
"cudarc",
"derive-getters",
"derive_builder",
"dynamo-memory",
"futures",
"kvbm-common",
"kvbm-kernels",
"parking_lot",
"rstest 0.26.1",
"serde",
"serde_json",
"thiserror 2.0.18",
"tokio",
"tracing",
"uuid",
"validator",
"velo-events",
]
[[package]]
name = "lalrpop-util"
version = "0.20.2"
......@@ -3999,12 +4033,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]]
name = "libredox"
version = "0.1.12"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [
"bitflags 2.11.0",
"libc",
"plain",
"redox_syscall 0.7.3",
]
......@@ -5474,6 +5509,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]]
name = "plotters"
version = "0.3.7"
......
......@@ -10,8 +10,10 @@ members = [
"lib/mocker",
"lib/kv-router",
"lib/memory",
"lib/kvbm-common",
"lib/kvbm-kernels",
"lib/kvbm-logical",
"lib/kvbm-physical",
"lib/async-openai",
"lib/parsers",
"lib/bench",
......@@ -45,11 +47,13 @@ dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features =
dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }
# kvbm
kvbm-common = { path = "lib/kvbm-common", version = "1.0.0" }
kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" }
kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" }
kvbm-physical = { path = "lib/kvbm-physical", version = "1.0.0" }
# velo
velo-events = { path = "lib/velo-events", version = "0.9.0" }
velo-events = { path = "lib/velo-events", version = "1.0.0" }
# External dependencies
anyhow = { version = "1" }
......@@ -66,7 +70,7 @@ chrono = { version = "0.4", default-features = false, features = [
"now",
"serde",
] }
cudarc = { version = "0.19.2", features = ["cuda-12020"] }
cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"] }
dashmap = { version = "6.1" }
derive_builder = { version = "0.20" }
derive-getters = { version = "0.5" }
......
......@@ -3015,9 +3015,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.90"
version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
dependencies = [
"once_cell",
"wasm-bindgen",
......@@ -3329,12 +3329,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]]
name = "libredox"
version = "0.1.12"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [
"bitflags 2.11.0",
"libc",
"plain",
"redox_syscall 0.7.3",
]
......@@ -4701,6 +4702,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]]
name = "png"
version = "0.18.1"
......@@ -7422,9 +7429,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
dependencies = [
"cfg-if 1.0.4",
"once_cell",
......@@ -7435,9 +7442,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.63"
version = "0.4.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
dependencies = [
"cfg-if 1.0.4",
"futures-util",
......@@ -7449,9 +7456,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
......@@ -7459,9 +7466,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
dependencies = [
"bumpalo",
"proc-macro2",
......@@ -7472,9 +7479,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
dependencies = [
"unicode-ident",
]
......@@ -7528,9 +7535,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.90"
version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
dependencies = [
"js-sys",
"wasm-bindgen",
......
......@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features =
] }
dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true }
cudarc = { version = "0.19.2", features = ["cuda-12020"], optional = true }
cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"], optional = true }
[dev-dependencies]
......@@ -3075,9 +3075,9 @@ dependencies = [
[[package]]
name = "js-sys"
version = "0.3.90"
version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
dependencies = [
"once_cell",
"wasm-bindgen",
......@@ -3368,12 +3368,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]]
name = "libredox"
version = "0.1.12"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [
"bitflags 2.11.0",
"libc",
"plain",
"redox_syscall 0.7.3",
]
......@@ -4749,6 +4750,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]]
name = "png"
version = "0.18.1"
......@@ -7497,9 +7504,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
dependencies = [
"cfg-if 1.0.4",
"once_cell",
......@@ -7510,9 +7517,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.63"
version = "0.4.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
dependencies = [
"cfg-if 1.0.4",
"futures-util",
......@@ -7524,9 +7531,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
......@@ -7534,9 +7541,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
dependencies = [
"bumpalo",
"proc-macro2",
......@@ -7547,9 +7554,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.113"
version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
dependencies = [
"unicode-ident",
]
......@@ -7603,9 +7610,9 @@ dependencies = [
[[package]]
name = "web-sys"
version = "0.3.90"
version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
dependencies = [
"js-sys",
"wasm-bindgen",
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "kvbm-common"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-tokens = { workspace = true }
serde = { workspace = true }
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use serde::{Deserialize, Serialize};
/// Identifier of a block; a plain `usize`.
pub type BlockId = usize;
/// Sequence hash type; an alias for [`dynamo_tokens::PositionalLineageHash`].
pub type SequenceHash = dynamo_tokens::PositionalLineageHash;
/// Re-export of the `dynamo_tokens` crate under the name `tokens`.
pub use dynamo_tokens as tokens;
/// Logical layout handle type encoding the layout ID.
///
/// Each variant names one storage tier. KVBM manages G1, G2 and G3 layouts
/// directly. G4 is managed by an external service.
///
/// NOTE(review): the variant order presumably determines the serialized
/// representation for non-self-describing serde formats — confirm before
/// reordering or inserting variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LogicalLayoutHandle {
    /// Representation of GPU / Device Memory.
    ///
    /// G1 is fixed sized and managed by either the framework or the local instance of KVBM.
    G1,
    /// Representation of CPU / Host Memory.
    ///
    /// G2 is fixed sized and managed by the local instance of KVBM.
    G2,
    /// Representation of Disk Storage (local or attached storage).
    ///
    /// G3 is fixed sized and managed by the local instance of KVBM.
    G3,
    /// Representation of blocks held in an external service
    /// outside the control of the KVBM system.
    G4,
}
......@@ -20,10 +20,9 @@ use cudarc::runtime::sys as cuda_runtime;
use kvbm_kernels::{MemcpyBatchMode, is_memcpy_batch_available, is_using_stubs, memcpy_batch};
// Direct FFI for cudaMallocHost / cudaFreeHost.
// We bypass cudarc's runtime::sys because cudarc eagerly resolves ALL runtime
// symbols on first use, and CUDA 13.x removed `cudaGetDeviceProperties_v2`
// which causes a panic. Our test binary links against libcudart directly
// (through kvbm-kernels' build.rs), so these symbols are always available.
// We link against libcudart directly (through kvbm-kernels' build.rs),
// so these symbols are always available without going through cudarc's
// dynamic loader.
unsafe extern "C" {
fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> u32;
fn cudaFreeHost(ptr: *mut c_void) -> u32;
......
......@@ -15,7 +15,6 @@ dynamo-tokens = { workspace = true }
anyhow = { workspace = true }
async-stream = "0.3"
bytes = "1.10"
bincode = { version = "2.0.1", features = ["serde", "derive"] }
derive_builder = { workspace = true }
futures = { workspace = true }
lru = "0.16"
......
......@@ -15,9 +15,6 @@ pub mod tinylfu;
#[cfg(any(test, feature = "testing"))]
pub mod testing;
use bincode::{Decode, Encode};
use serde::{Deserialize, Serialize};
// Re-export common types and traits
pub use blocks::{
BlockError, BlockMetadata, CompleteBlock, ImmutableBlock, MutableBlock, WeakBlock,
......@@ -37,19 +34,3 @@ impl KvbmSequenceHashProvider for dynamo_tokens::TokenBlock {
self.positional_lineage_hash()
}
}
/// Logical layout handle type encoding the layout ID.
///
/// Each variant names one storage tier. KVBM manages G1, G2 and G3 layouts
/// directly. G4 is managed by an external service.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode, Serialize, Deserialize)]
pub enum LogicalLayoutHandle {
    /// Representation of GPU / Device Memory.
    G1,
    /// Representation of CPU / Host Memory.
    G2,
    /// Representation of Disk Storage.
    G3,
    /// Representation of blocks held in an external service
    /// outside the control of the KVBM system.
    G4,
}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "kvbm-physical"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
[dependencies]
dynamo-memory = { workspace = true }
kvbm-common = { workspace = true }
kvbm-kernels = { workspace = true }
velo-events = { workspace = true }
aligned-vec = "0.6.4"
anyhow = { workspace = true }
bincode = { version = "2.0.0", features = ["serde", "derive"] }
blake3 = { version = "1" }
cudarc = { workspace = true }
derive_builder = { workspace = true }
futures = { workspace = true }
derive-getters = { version = "0.5" }
parking_lot = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
validator = { workspace = true }
[features]
default = []
collectives = []
testing-kvbm = []
testing-nixl-gds = []
[dev-dependencies]
dynamo-memory = { workspace = true, features = ["unsafe-slices"] }
rstest = "0.26"
# kvbm-physical
Physical layout and transfer management for KV cache block storage.
`kvbm-physical` provides the low-level building blocks for mapping KV cache blocks to memory, registering them for RDMA transfers via NIXL, and executing transfers between heterogeneous storage tiers (GPU, host, disk, remote).
## Modules
### `layout` — Block-to-memory mapping
Abstractions for how KV cache blocks are organized in memory.
- **`Layout` trait** — Core abstraction mapping `(block_id, layer_id, outer_id)` to a `MemoryRegion`. Implementations include fully contiguous (single allocation) and layer-separate (one allocation per layer) variants.
- **`KvBlockLayout`** — Describes dimension ordering within a block. Five named formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`, `Custom`) plus `Unknown`. Provides `requires_transform()`, `is_operational()`, and `is_universal()` for kernel selection.
- **`PhysicalLayout`** — Wraps a `Layout` with its physical storage location (`StorageKind`) and NIXL registration metadata (`NixlMetadata`). Constructed via a type-state builder: Config &rarr; Layout type &rarr; Memory allocation &rarr; `build()`.
- **`LayoutConfig`** — Block dimensions: `num_blocks`, `num_layers`, `outer_dim`, `page_size`, `inner_dim`, `dtype_width_bytes`, optional `num_heads`.
- **`KvBlocks`** — Groups block IDs with a shared `PhysicalLayout` and optional `KvBlockLayout` override for cross-format transfers.
### `manager` — Layout registration and transfer orchestration
- **`TransferManager`** — Primary API. Registers layouts, exports/imports RDMA metadata between workers, and executes transfers by handle.
- **`LayoutHandle`** — Compact `u128` encoding `(worker_id, layout_id)`. Identifies a registered layout within a specific worker; not symmetric across workers.
- **`LogicalLayoutDescriptor`** — Bridges a `LayoutHandle` to a `LogicalLayoutHandle` (G1/G2/G3/G4 tier). Enables callers to say "copy from G1 to G2" while `TransferManager` resolves worker-specific physical handles.
- **`SerializedLayout`** — Wire format for RDMA metadata exchange. Packs worker address, NIXL metadata, and layout descriptors into a bincode blob.
- **`WorkerAddress`** — `(worker_id, nixl_agent_name)` pair identifying a worker on the network.
### `transfer` — Transfer configuration and execution
- **`TransferConfig` / builder** — Configures event system, NIXL backends, CUDA device, capabilities, and memory pool before building a `TransferManager`.
- **`TransferOptions`** — Per-transfer configuration: `layer_range`, `nixl_write_notification`, `bounce_buffer`, caller-provided `cuda_stream`, and src/dst `kv_layout` overrides.
- **`TransferPreferences`** — Strategy hints via `NativeVsNixlPolicy` (PreferNative / PreferNixl / Automatic).
- **`TransferCompleteNotification`** — `Either<Ready, EventAwaiter>` implementing `IntoFuture`. Zero-cost for synchronous completions. `aggregate()` composes multiple notifications. `could_yield()` checks if awaiting will suspend.
- **`BounceBuffer`** — Staging area for two-hop transfers (e.g., Device &rarr; Host &rarr; Remote).
- **Checksum utilities** — BLAKE3 block/layer checksums for transfer verification.
- **Fill utilities** — Constant/sequential patterns for testing and initialization.
## Quick Start
```rust,ignore
use kvbm_physical::{TransferManager, TransferOptions};
use kvbm_physical::layout::{LayoutConfig, PhysicalLayout};
// 1. Build the TransferManager (creates NIXL agent, CUDA streams, event system)
let manager = TransferManager::builder()
.nixl_backend("ucx")
.cuda_device_id(0)
.build()?;
// 2. Configure a layout
let config = LayoutConfig::builder()
.num_blocks(64)
.num_layers(32)
.outer_dim(2)
.page_size(16)
.inner_dim(128)
.dtype_width_bytes(2)
.build()?;
// 3. Build a physical layout (type-state builder: config -> layout type -> memory -> build)
let gpu_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
.with_config(config.clone())
.fully_contiguous()
.allocate_device(0)
.build()?;
let host_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
.with_config(config)
.fully_contiguous()
.allocate_pinned(Some(0))
.build()?;
// 4. Register layouts to get handles
let gpu_handle = manager.register_layout(gpu_layout)?;
let host_handle = manager.register_layout(host_layout)?;
// 5. Execute a transfer and await completion
let notification = manager.execute_transfer(
gpu_handle,
&[0, 1, 2, 3], // source block IDs
host_handle,
&[0, 1, 2, 3], // destination block IDs
TransferOptions::new(),
)?;
notification.await?;
```
## Testing
All functional tests in `kvbm-physical` require a real NIXL installation and a CUDA GPU. They are gated behind a single feature flag:
- **`testing-kvbm`** — enables tests requiring NIXL and CUDA (creates NixlAgent instances and allocates device memory / launches kernels)
### Running tests
```bash
# Without GPU/NIXL — only the sentinel test runs (confirms skipping)
cargo test -p kvbm-physical
# With GPU + NIXL available
cargo test -p kvbm-physical --features testing-kvbm
```
When `testing-kvbm` is not enabled, a single **sentinel test** runs and prints a reminder message. This ensures `cargo test` never silently passes with zero tests.
### What the sentinel test looks like
```
running 1 test
test sentinel::all_functional_tests_skipped___enable_testing_nixl_and_testing_cuda ... ok
```
The `test_version_check_on_deserialization` test in `layout::tests` is the only functional test that runs without feature flags, as it does not require NIXL or CUDA.
## Documentation
- [v1 Migration Guide](docs/v1_migration.md) — Migration from `dynamo-llm::block_manager` to `kvbm-physical`
# Migration Guide: block_manager to kvbm-physical
Guide for migrating from `dynamo-llm::block_manager` (v1) to `kvbm-physical`.
## Overview
`kvbm-physical` is a ground-up rewrite of the physical transfer layer from `lib/llm/src/block_manager/`. The core data flow is the same (register layouts, exchange metadata, execute transfers), but `kvbm-physical` adds block format awareness, richer transfer options, and a cleaner separation between logical tiers and physical handles.
Both implementations use the same `vectorized_copy` CUDA kernel. The original embeds it in a `.fatbin` (`lib/llm/src/block_manager/block/transfer/kernels/vectorized_copy.fatbin`) loaded via `cuModuleLoadData`. `kvbm-physical` wraps the same kernel via the `kvbm-kernels` crate with explicit Rust FFI for transparency and testability.
## Type mapping table
| Original (block_manager) | kvbm-physical | Notes |
|--------------------------|---------------|-------|
| `TransportManager` | `TransferManager` | Same role, richer API |
| `LayoutHandle` | `LayoutHandle` | Same concept; encoding changed — see LayoutHandle docs for details |
| `PhysicalLayout` + builder | `PhysicalLayout` + builder | Same pattern; adds `with_external_device_regions()` |
| `LayoutConfig` | `LayoutConfig` | Same fields + optional `num_heads` |
| `TransferOptions` | `TransferOptions` | Adds `cuda_stream`, `src_kv_layout`, `dst_kv_layout` |
| `TransferCapabilities` | `TransferCapabilities` | Same |
| `TransferPreferences` | `TransferPreferences` | Same |
| `SerializedLayout` | `SerializedLayout` | Same wire format concept |
| `WorkerAddress` | `WorkerAddress` | Same |
| `TransferCompleteNotification` (oneshot) | `TransferCompleteNotification` (`Either`/`EventAwaiter`) | Zero-cost sync path |
| `BounceBufferSpec` (trait object) | `BounceBuffer` (concrete struct) | Simpler, no heap allocation |
| N/A | `LogicalLayoutDescriptor` | **New** — tier bridging |
| N/A | `KvBlockLayout` | **New** — block format awareness |
| N/A | `KvBlocks` | **New** — grouped blocks with layout override |
| `CudaBlockingH2D` / `CudaBlockingD2H` | Removed | Async-only; `.await` for sync behavior |
| `OperationalCopyBackend` | Removed | Replaced by `kvbm_kernels` direct FFI |
## What kvbm-physical adds
### LogicalLayoutDescriptor
Bridges `LayoutHandle` (physical) to `LogicalLayoutHandle` (G1/G2/G3/G4 tier). This is the key new abstraction for multi-worker coordination: callers say "copy from G1 to G2" while `TransferManager` resolves worker-specific handles.
```rust,ignore
// Build descriptor for RDMA exchange
let descriptor = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
```
### KvBlockLayout
Four named block formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`) plus `Custom` and `Unknown`. Enables type-driven kernel selection for transfers between different dimension orderings.
```rust,ignore
let needs_permute = src_layout.requires_transform(&dst_layout);
```
### kvbm-kernels FFI
The `kvbm_kernels` crate provides `memcpy_batch` using CUDA 12.9+ batch API with automatic fallback to individual copies. This replaces the fatbin-loading approach with direct Rust FFI.
### Stream pooling
4 H2D + 4 D2H streams with round-robin selection, replacing the original 1+1 stream pair. Reduces contention for concurrent transfers.
### Caller-provided CUDA stream
`TransferOptions::cuda_stream` lets the caller pass in a stream. The executor skips event recording; the caller manages synchronization. Useful for layer-wise transfers where all layers must execute on the same stream.
```rust,ignore
let stream = manager.context().acquire_h2d_stream();
let options = TransferOptions::builder()
.cuda_stream(stream.clone())
.build()?;
```
### CudaMemPool
Device memory pool for kernel temporary allocations (permute buffers, etc.). Configured via `TransferConfig`:
```rust,ignore
TransferManager::builder()
.cuda_pool_reserve_size(64 * 1024 * 1024) // 64 MiB pre-allocated
.cuda_pool_release_threshold(Some(64 * 1024 * 1024)) // free above this
.build()?;
```
### TransferCompleteNotification::aggregate()
Compose multiple transfer notifications into one that completes when all are done. Optimizes away the aggregation when all inputs are already complete.
```rust,ignore
let combined = TransferCompleteNotification::aggregate(
vec![n1, n2, n3],
manager.context().event_system(),
&tokio::runtime::Handle::current(),
)?;
combined.await?;
```
### src/dst kv_layout overrides
`TransferOptions` now supports overriding the source and destination block layout interpretation, enabling cross-format transfers without modifying the registered layout.
```rust,ignore
let options = TransferOptions::builder()
.src_kv_layout(KvBlockLayout::OperationalNHD)
.dst_kv_layout(KvBlockLayout::UniversalTP)
.build()?;
```
## What was intentionally removed
### Blocking CUDA strategies
`CudaBlockingH2D` and `CudaBlockingD2H` are removed. All transfers are async. For synchronous behavior, just `.await` immediately:
```rust,ignore
// v1 (blocking)
let result = blocking_h2d_transfer(...);
// kvbm-physical (async, but can be used synchronously)
let notification = manager.execute_transfer(...)?;
notification.await?;
```
### OperationalCopyBackend enum
The `OperationalCopyBackend` enum (which selected between different kernel loading strategies) is removed. `kvbm-physical` uses `kvbm_kernels` direct FFI exclusively, making kernel dispatch transparent.
### Trait object bounce buffer
`BounceBufferSpec` (a trait object requiring heap allocation) is replaced by `BounceBuffer`, a concrete struct wrapping a `LayoutHandle` + block IDs:
```rust,ignore
// v1
struct MyBounce { layout: PhysicalLayout, blocks: Vec<BlockId> }
impl BounceBufferSpec for MyBounce { ... }
// kvbm-physical
let bounce = BounceBuffer::from_handle(host_handle, vec![0, 1, 2, 3]);
```
## Migration steps
### 1. Replace TransportManager with TransferManager
The builder pattern is the same. `TransferManager::builder()` returns the same kind of fluent builder.
```rust,ignore
// v1
let manager = TransportManager::builder()
.worker_id(0)
.nixl_backend("ucx")
.cuda_device_id(0)
.build()?;
// kvbm-physical
let manager = TransferManager::builder()
.nixl_backend("ucx")
.cuda_device_id(0)
.build()?;
// worker_id is now derived from the event system
```
### 2. Replace TransferOptions
Add new fields as needed. Existing `layer_range` and `nixl_write_notification` work the same way.
```rust,ignore
// v1
let options = TransferOptions::builder()
.layer_range(0..16)
.build()?;
// kvbm-physical (same, with optional new fields)
let options = TransferOptions::builder()
.layer_range(0..16)
.cuda_stream(stream) // new: caller-managed stream
.src_kv_layout(layout) // new: format override
.build()?;
```
### 3. Replace BounceBufferSpec with BounceBuffer
```rust,ignore
// v1 — trait object
let spec: Box<dyn BounceBufferSpec> = Box::new(MyBounce::new(layout, blocks));
options.bounce_buffer(spec);
// kvbm-physical — concrete type
let bounce = BounceBuffer::from_handle(host_handle, block_ids);
let options = TransferOptions::builder()
.bounce_buffer(bounce)
.build()?;
```
### 4. Replace TransferCompleteNotification await pattern
The notification now implements `IntoFuture` directly instead of wrapping a oneshot channel.
```rust,ignore
// v1
let notification = manager.execute_transfer(...)?;
notification.recv().await??;
// kvbm-physical
let notification = manager.execute_transfer(...)?;
notification.await?;
```
### 5. Add LogicalLayoutDescriptor for multi-worker tier resolution
If you coordinate transfers across multiple workers by tier name (G1, G2, etc.), use `LogicalLayoutDescriptor`:
```rust,ignore
// Build descriptors that include tier information
let g1_desc = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
let g2_desc = manager.build_logical_descriptor(host_handle, LogicalLayoutHandle::G2)?;
// Remote workers can now resolve "copy G1 to G2" to the correct physical handles
```
### 6. Consider KvBlockLayout annotations for cross-format transfers
If your transfers involve blocks stored in different dimension orderings (e.g., operational NHD from the engine vs. universal TP for storage), annotate with `KvBlockLayout`:
```rust,ignore
let options = TransferOptions::builder()
.src_kv_layout(KvBlockLayout::OperationalNHD)
.dst_kv_layout(KvBlockLayout::UniversalTP)
.build()?;
```
This tells the executor to select a permute kernel instead of a direct copy.
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
use validator::{Validate, ValidationError};
/// Configuration for block layouts.
///
/// Holds the number of blocks plus the per-block extents (`num_layers`,
/// `outer_dim`, `page_size`, `inner_dim`, `dtype_width_bytes`); the product of
/// the per-block extents is the size of one block in bytes.
///
/// The `#[validate]` attributes on fields are checked during layout construction
/// (e.g., `FullyContiguousLayout::new_internal()`, `LayerSeparateLayout::new_internal()`),
/// not at builder `.build()` time.
#[derive(Debug, Clone, Builder, Validate, Serialize, Deserialize, PartialEq, Eq)]
pub struct LayoutConfig {
    /// Number of blocks. Validation requires at least 1.
    #[validate(range(min = 1))]
    pub num_blocks: usize,
    /// Number of layers. Validation requires at least 1.
    #[validate(range(min = 1))]
    pub num_layers: usize,
    /// Number of outer dimensions. Validation accepts only 1 or 2.
    #[validate(range(min = 1, max = 2))]
    pub outer_dim: usize,
    /// Page size. Validation requires at least 1.
    #[validate(range(min = 1))]
    pub page_size: usize,
    /// Inner dimension. Validation requires at least 1.
    #[validate(range(min = 1))]
    pub inner_dim: usize,
    /// Alignment, checked by `validate_power_of_2`. The builder defaults it to 1.
    // NOTE(review): presumably expressed in bytes — confirm against the layout code.
    #[validate(custom(function = "validate_power_of_2"))]
    #[builder(default = "1")]
    pub alignment: usize,
    /// Width of the data type in bytes, checked by `validate_dtype_width_bytes`.
    /// The builder defaults it to 2.
    #[validate(custom(function = "validate_dtype_width_bytes"))]
    #[builder(default = "2")]
    pub dtype_width_bytes: usize,
    /// Number of attention heads (optional).
    ///
    /// When provided, enables KvBlockLayout support for universal formats.
    /// The head dimension can be computed as: `inner_dim / (page_size * num_heads)`.
    ///
    /// Required for:
    /// - Universal layout transformations
    /// - Per-head memory region access
    ///
    /// The builder defaults this to `None`, and `#[serde(default)]` lets the
    /// field be absent from serialized input.
    #[builder(default = "None")]
    #[serde(default)]
    pub num_heads: Option<usize>,
}
impl LayoutConfig {
    /// Create a [`LayoutConfigBuilder`] in its default state.
    pub fn builder() -> LayoutConfigBuilder {
        LayoutConfigBuilder::default()
    }

    /// Get the total number of bytes needed for all blocks.
    ///
    /// This is `num_blocks * bytes_per_block()`. Every multiplication saturates
    /// at `usize::MAX` instead of overflowing.
    pub fn required_bytes(&self) -> usize {
        self.num_blocks.saturating_mul(self.bytes_per_block())
    }

    /// Get the number of bytes per block.
    ///
    /// This is the total size of a single block across all layers and outer dimensions.
    /// Every multiplication saturates at `usize::MAX` instead of overflowing.
    pub fn bytes_per_block(&self) -> usize {
        self.num_layers
            .saturating_mul(self.outer_dim)
            .saturating_mul(self.page_size)
            .saturating_mul(self.inner_dim)
            .saturating_mul(self.dtype_width_bytes)
    }

    /// Compute `page_size * num_heads` as a divisor for `inner_dim`.
    ///
    /// Returns `None` when the product is zero or does not fit in `usize`.
    /// Neither case can divide `inner_dim` meaningfully, so callers treat
    /// both as "no valid head split".
    fn kv_block_divisor(&self, num_heads: usize) -> Option<usize> {
        self.page_size
            .checked_mul(num_heads)
            .filter(|&divisor| divisor > 0)
    }

    /// Get the head dimension if `num_heads` is specified.
    ///
    /// Computes `inner_dim / (page_size * num_heads)`.
    ///
    /// # Returns
    /// `Some(head_dim)` if `num_heads` is set, `None` otherwise. The value is
    /// `Some(0)` when `page_size * num_heads` is zero or overflows `usize`.
    pub fn head_dim(&self) -> Option<usize> {
        self.num_heads.map(|nh| {
            self.kv_block_divisor(nh)
                .map_or(0, |divisor| self.inner_dim / divisor)
        })
    }

    /// Check if this config supports KvBlockLayout operations.
    ///
    /// Returns `true` if `num_heads` is set and the dimensions are valid
    /// (`page_size * num_heads` is non-zero, fits in `usize`, and evenly
    /// divides `inner_dim`).
    pub fn supports_kv_block_layout(&self) -> bool {
        self.num_heads
            .and_then(|nh| self.kv_block_divisor(nh))
            .is_some_and(|divisor| self.inner_dim.is_multiple_of(divisor))
    }

    /// Validate that this config supports KvBlockLayout operations.
    ///
    /// Succeeds exactly when [`supports_kv_block_layout`](Self::supports_kv_block_layout)
    /// returns `true`.
    ///
    /// # Returns
    /// `Ok(())` if valid, `Err` with details otherwise.
    ///
    /// # Errors
    /// - `num_heads_required_for_kv_block_layout` when `num_heads` is `None`.
    /// - `num_heads_must_be_positive` when `num_heads` is `Some(0)`.
    /// - `inner_dim_must_be_divisible_by_page_size_times_num_heads` when
    ///   `page_size * num_heads` is zero, overflows `usize`, or does not evenly
    ///   divide `inner_dim`.
    pub fn validate_for_kv_block_layout(&self) -> Result<(), ValidationError> {
        let Some(nh) = self.num_heads else {
            return Err(ValidationError::new(
                "num_heads_required_for_kv_block_layout",
            ));
        };

        if nh == 0 {
            return Err(ValidationError::new("num_heads_must_be_positive"));
        }

        // A zero divisor must be rejected explicitly: `0.is_multiple_of(0)` is
        // true, which would otherwise accept `page_size == 0 && inner_dim == 0`.
        let divisible = self
            .kv_block_divisor(nh)
            .is_some_and(|divisor| self.inner_dim.is_multiple_of(divisor));
        if !divisible {
            return Err(ValidationError::new(
                "inner_dim_must_be_divisible_by_page_size_times_num_heads",
            ));
        }

        Ok(())
    }
}
/// Identifies which of the first two tensor dimensions, `shape[0]` or `shape[1]`, is the
/// block dimension; the other one is the outer dimension.
///
/// The outer dimension is typically:
/// - 1: MLA or K and V stored together,
/// - 2: K and V stored separately,
///
/// The block dimension tells us the number of blocks.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BlockDimension {
/// The block dimension is the first dimension of the tensor, `[n_blocks, outer_dim, inner_dim]`
BlockIsFirstDim,
/// The block dimension is the second dimension of the tensor, `[outer_dim, n_blocks, inner_dim]`.
/// This replaces v1's `outer_contiguous = true`.
BlockIsSecondDim,
}
/// Validator for an alignment value: accepts only powers of two.
///
/// Zero is rejected, since `usize::is_power_of_two` returns `false` for it.
///
/// # Errors
/// Returns a `ValidationError` with code `alignment_must_be_power_of_2`
/// when `alignment` is not a power of two.
pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> {
    if alignment.is_power_of_two() {
        Ok(())
    } else {
        Err(validator::ValidationError::new(
            "alignment_must_be_power_of_2",
        ))
    }
}
/// Validator for an element width in bytes: accepts exactly 2, 4, or 8.
///
/// These are the powers of two inside the inclusive range `2..=8`. Note that
/// the error code says "less than 8 bytes", but a width of 8 is accepted and
/// a width of 1 is rejected.
///
/// # Errors
/// Returns a `ValidationError` with code
/// `dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes` otherwise.
pub fn validate_dtype_width_bytes(dtype_width_bytes: usize) -> Result<(), ValidationError> {
    if matches!(dtype_width_bytes, 2 | 4 | 8) {
        return Ok(());
    }
    Err(validator::ValidationError::new(
        "dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes",
    ))
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Fully contiguous layout implementation.
//!
//! This layout stores all blocks in a single contiguous memory allocation
//! with the shape: [num_blocks, num_layers, outer_dim, page_size, inner_dim].
use anyhow::{Result, anyhow};
use validator::Validate;
use super::serialize::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails};
use super::{Buffer, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor, MemoryRegion};
/// Fully contiguous layout where all blocks are in a single allocation.
///
/// Regions are addressed as
/// `base_addr + block_id * block_stride + layer_id * layer_stride + outer_id * outer_stride`;
/// all strides are computed once at construction from the [`LayoutConfig`].
#[derive(Debug)]
pub struct FullyContiguousLayout {
/// Dimensions this layout was built from
config: LayoutConfig,
/// Base address of the allocation
base_addr: usize,
/// Stride between blocks in bytes
block_stride: usize,
/// Stride between layers in bytes
layer_stride: usize,
/// Stride between outer dimensions in bytes
outer_stride: usize,
/// Size of each memory region (page) in bytes
region_size: usize,
/// Owned memory region backing this layout
memory: Buffer,
/// Format of blocks in memory
block_format: BlockFormat,
/// KV block layout describing dimension ordering within blocks
kv_block_layout: KvBlockLayout,
}
/// Builder for creating [`FullyContiguousLayout`] instances.
///
/// `config` and `memory` are mandatory; `kv_block_layout` and `block_format`
/// fall back to their `Default` values when not set.
///
/// # Example
///
/// ```ignore
/// let layout = FullyContiguousLayout::builder()
/// .config(config)
/// .memory(buffer)
/// .kv_block_layout(KvBlockLayout::UniversalTP)
/// .build()?;
/// ```
#[derive(Debug, Default)]
pub struct FullyContiguousLayoutBuilder {
/// Layout dimensions; required by `build()`
config: Option<LayoutConfig>,
/// Backing buffer; required by `build()`
memory: Option<Buffer>,
/// Dimension ordering within blocks
kv_block_layout: KvBlockLayout,
/// Format of blocks in memory
block_format: BlockFormat,
}
impl FullyContiguousLayoutBuilder {
    /// Create a builder with nothing set: no config, no memory,
    /// `KvBlockLayout::Unknown`, and the default block format.
    pub fn new() -> Self {
        // The derived `Default` yields exactly these values.
        Self::default()
    }

    /// Provide the layout configuration (required).
    #[expect(dead_code)]
    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
        self.config.replace(config);
        self
    }

    /// Provide the memory buffer that backs the layout (required).
    #[expect(dead_code)]
    pub fn memory(&mut self, memory: Buffer) -> &mut Self {
        self.memory.replace(memory);
        self
    }

    /// Choose the KV block layout describing dimension ordering.
    ///
    /// Default: `KvBlockLayout::Unknown`
    #[expect(dead_code)]
    pub fn kv_block_layout(&mut self, layout: KvBlockLayout) -> &mut Self {
        self.kv_block_layout = layout;
        self
    }

    /// Choose the block format.
    ///
    /// Default: `BlockFormat::default()`
    #[expect(dead_code)]
    pub fn block_format(&mut self, format: BlockFormat) -> &mut Self {
        self.block_format = format;
        self
    }

    /// Build the [`FullyContiguousLayout`] from clones of the stored values.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `config` is not set
    /// - `memory` is not set
    /// - The memory region is too small for the layout
    /// - The config validation fails
    #[expect(dead_code)]
    pub fn build(&self) -> Result<FullyContiguousLayout> {
        let Some(config) = self.config.clone() else {
            return Err(anyhow!("config is required"));
        };
        let Some(memory) = self.memory.clone() else {
            return Err(anyhow!("memory is required"));
        };
        FullyContiguousLayout::new_internal(config, memory, self.kv_block_layout, self.block_format)
    }
}
impl FullyContiguousLayout {
    /// Create a builder for `FullyContiguousLayout`.
    #[expect(dead_code)]
    pub fn builder() -> FullyContiguousLayoutBuilder {
        FullyContiguousLayoutBuilder::new()
    }

    /// Create a new fully contiguous layout with default KV block layout.
    ///
    /// # Arguments
    /// * `config` - Layout configuration
    /// * `memory` - Owned memory region that backs this layout
    ///
    /// # Returns
    /// A new FullyContiguousLayout instance with `KvBlockLayout::Unknown`
    /// and the default block format.
    pub(crate) fn new(config: LayoutConfig, memory: Buffer) -> Result<Self> {
        Self::new_internal(
            config,
            memory,
            KvBlockLayout::Unknown,
            BlockFormat::default(),
        )
    }

    /// Multiply `stride * count`, returning an error instead of wrapping on overflow.
    fn checked_stride(stride: usize, count: usize, what: &str) -> Result<usize> {
        stride.checked_mul(count).ok_or_else(|| {
            anyhow!(
                "Layout dimensions overflow usize while computing {}",
                what
            )
        })
    }

    /// Internal constructor with all parameters.
    ///
    /// Validates `config`, derives the strides for the
    /// `[num_blocks, num_layers, outer_dim, page_size, inner_dim]` shape and
    /// checks that `memory` is large enough.
    ///
    /// # Errors
    /// - The config validation fails
    /// - Any stride or the total size does not fit in `usize`
    /// - `memory` is smaller than the required size
    fn new_internal(
        config: LayoutConfig,
        memory: Buffer,
        kv_block_layout: KvBlockLayout,
        block_format: BlockFormat,
    ) -> Result<Self> {
        config.validate()?;
        let base_addr = memory.addr();

        // Every product is checked: a wrapped value could slip past the size
        // comparison below and produce addresses outside the buffer.
        let region_elems = Self::checked_stride(config.page_size, config.inner_dim, "region size")?;
        let region_size =
            Self::checked_stride(region_elems, config.dtype_width_bytes, "region size")?;
        let outer_stride = region_size;
        let layer_stride = Self::checked_stride(outer_stride, config.outer_dim, "layer stride")?;
        let block_stride = Self::checked_stride(layer_stride, config.num_layers, "block stride")?;

        // Validate that the memory region is large enough
        let required_size = Self::checked_stride(block_stride, config.num_blocks, "total size")?;
        if memory.size() < required_size {
            return Err(anyhow!(
                "Memory region too small for layout. Required: {} bytes, got: {} bytes",
                required_size,
                memory.size()
            ));
        }

        Ok(Self {
            config,
            base_addr,
            block_stride,
            layer_stride,
            outer_stride,
            region_size,
            memory,
            block_format,
            kv_block_layout,
        })
    }

    /// Create a new fully contiguous layout with a specific block format and KV block layout.
    ///
    /// # Arguments
    /// * `config` - Layout configuration
    /// * `memory` - Owned memory region that backs this layout
    /// * `block_format` - Format of blocks in memory
    /// * `kv_block_layout` - KV block layout describing dimension ordering
    ///
    /// # Returns
    /// A new FullyContiguousLayout instance
    pub(crate) fn new_with_format(
        config: LayoutConfig,
        memory: Buffer,
        block_format: BlockFormat,
        kv_block_layout: KvBlockLayout,
    ) -> Result<Self> {
        // Note the argument order differs from `new_internal`.
        Self::new_internal(config, memory, kv_block_layout, block_format)
    }

    /// Get the block format.
    #[expect(dead_code)]
    pub fn block_format(&self) -> BlockFormat {
        self.block_format
    }

    /// Get the KV block layout.
    #[expect(dead_code)]
    pub fn kv_block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }

    /// Set the KV block layout.
    #[expect(dead_code)]
    pub fn set_kv_block_layout(&mut self, layout: KvBlockLayout) {
        self.kv_block_layout = layout;
    }

    /// Calculate the address of a specific memory region.
    ///
    /// # Errors
    /// Returns an error when `block_id`, `layer_id` or `outer_id` is not
    /// below the corresponding count in the config.
    fn calculate_address(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<usize> {
        if block_id >= self.config.num_blocks {
            return Err(anyhow!(
                "Block ID {} out of range (count: {})",
                block_id,
                self.config.num_blocks
            ));
        }
        if layer_id >= self.config.num_layers {
            return Err(anyhow!(
                "Layer ID {} out of range (count: {})",
                layer_id,
                self.config.num_layers
            ));
        }
        if outer_id >= self.config.outer_dim {
            return Err(anyhow!(
                "Outer ID {} out of range (count: {})",
                outer_id,
                self.config.outer_dim
            ));
        }
        // With all indices in range, the offset is below the required size
        // that was verified at construction.
        Ok(self.base_addr
            + block_id * self.block_stride
            + layer_id * self.layer_stride
            + outer_id * self.outer_stride)
    }

    /// Get mutable reference to the backing memory buffer for NIXL registration.
    #[expect(dead_code)]
    pub fn memory_arc_mut(&mut self) -> &mut Buffer {
        &mut self.memory
    }
}
impl Layout for FullyContiguousLayout {
    /// The configuration this layout was built from.
    fn config(&self) -> &LayoutConfig {
        &self.config
    }

    /// The single backing buffer, exposed as a one-element slice.
    fn memory_regions(&self) -> &[Buffer] {
        std::slice::from_ref(&self.memory)
    }

    /// Address and size of the region at `(block_id, layer_id, outer_id)`.
    ///
    /// Fails when any index is out of range.
    fn memory_region(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<MemoryRegion> {
        self.calculate_address(block_id, layer_id, outer_id)
            .map(|addr| MemoryRegion::new(addr, self.region_size))
    }

    /// Sizes of the allocations this layout needs: one contiguous allocation
    /// covering every block.
    fn required_allocations(&self) -> Vec<usize> {
        let total_bytes = self.block_stride * self.config.num_blocks;
        vec![total_bytes]
    }

    /// Always `true` for this layout.
    fn is_fully_contiguous(&self) -> bool {
        true
    }

    /// Number of blocks, from the config.
    fn num_blocks(&self) -> usize {
        self.config.num_blocks
    }

    /// Number of layers, from the config.
    fn num_layers(&self) -> usize {
        self.config.num_layers
    }

    /// Outer dimension, from the config.
    fn outer_dim(&self) -> usize {
        self.config.outer_dim
    }

    /// Page size, from the config.
    fn page_size(&self) -> usize {
        self.config.page_size
    }

    /// Inner dimension, from the config.
    fn inner_dim(&self) -> usize {
        self.config.inner_dim
    }

    /// Element width in bytes, from the config.
    fn dtype_width_bytes(&self) -> usize {
        self.config.dtype_width_bytes
    }

    /// Layout-type details for serialization: block format and KV block layout.
    fn serialization_details(&self) -> LayoutTypeDetails {
        let details = FullyContiguousDetails {
            block_format: self.block_format,
            kv_block_layout: self.kv_block_layout,
        };
        LayoutTypeDetails::FullyContiguous(details)
    }

    /// Dimension ordering within each block.
    fn block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }
}
impl super::ContiguousBlockLayout for FullyContiguousLayout {
    /// Number of blocks, from the config.
    fn num_blocks(&self) -> usize {
        self.config.num_blocks
    }

    /// Size of one whole block in bytes (equal to the stride between blocks).
    fn bytes_per_block(&self) -> usize {
        self.block_stride
    }

    /// Address and size of the whole block `block_id`, covering all of its
    /// layers and outer dimensions.
    ///
    /// # Errors
    /// Returns an error when `block_id` is not below the number of blocks.
    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion> {
        if block_id >= self.config.num_blocks {
            // The reported value is the block count, not the largest valid ID.
            return Err(anyhow!(
                "Block ID {} out of range (count: {})",
                block_id,
                self.config.num_blocks
            ));
        }
        let addr = self.base_addr + block_id * self.block_stride;
        Ok(MemoryRegion::new(addr, self.block_stride))
    }

    /// Dimension ordering within each block.
    fn block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }
}
// Unit tests for `FullyContiguousLayout`; compiled only for test builds with
// the `testing-kvbm` feature enabled.
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
use super::super::tests::*;
use super::*;
/// A layout built over a buffer of exactly `required_bytes()` reports the
/// configured block count and is fully contiguous.
#[test]
fn test_fully_contiguous_layout_creation() {
let config = LayoutConfig::builder()
.num_blocks(10)
.num_layers(4)
.outer_dim(2)
.page_size(16)
.inner_dim(128)
.dtype_width_bytes(2)
.build()
.unwrap();
let required_bytes = config.required_bytes();
assert_eq!(required_bytes, 10 * 4 * 2 * 16 * 128 * 2);
let memory = Buffer::from_arc(MockMemory::new(0x1000, required_bytes));
let layout = FullyContiguousLayout::new(config, memory).unwrap();
assert_eq!(layout.num_blocks(), 10);
assert!(layout.is_fully_contiguous());
}
/// Region addresses advance by one region per outer index, `outer_dim`
/// regions per layer, and `outer_dim * num_layers` regions per block.
#[test]
fn test_memory_region() {
let config = LayoutConfig::builder()
.num_blocks(2)
.num_layers(2)
.outer_dim(2)
.page_size(16)
.inner_dim(128)
.dtype_width_bytes(2)
.build()
.unwrap();
let required_size = config.required_bytes();
let memory = Buffer::from_arc(MockMemory::new(0x1000, required_size));
let layout = FullyContiguousLayout::new(config.clone(), memory).unwrap();
// Test accessing specific memory regions
let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes;
// Block 0, Layer 0, Outer 0
let region = layout.memory_region(0, 0, 0).unwrap();
assert_eq!(region.addr, 0x1000);
assert_eq!(region.size(), region_size);
// Block 0, Layer 0, Outer 1
let region = layout.memory_region(0, 0, 1).unwrap();
assert_eq!(region.addr, 0x1000 + region_size);
assert_eq!(region.size(), region_size);
// Block 0, Layer 1, Outer 0
let region = layout.memory_region(0, 1, 0).unwrap();
assert_eq!(region.addr, 0x1000 + 2 * region_size);
assert_eq!(region.size(), region_size);
// Block 1, Layer 0, Outer 0
let region = layout.memory_region(1, 0, 0).unwrap();
assert_eq!(
region.addr,
0x1000 + (config.outer_dim * config.num_layers * region_size)
);
assert_eq!(region.size(), region_size);
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! KV Block layout types for describing dimension permutations within blocks.
//!
//! This module provides types for describing how dimensions are ordered within
//! a fully contiguous KV cache block, enabling type-driven kernel selection
//! for transfers between different layout formats.
use serde::{Deserialize, Serialize};
/// Symbolic dimensions that can be permuted within a block.
///
/// The head dimension (hd) is always innermost and not included here.
/// The abbreviation in parentheses on each variant is what its `Display`
/// implementation prints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum BlockDim {
/// Number of layers (nl)
Layer,
/// Outer dimension - typically 2 for K/V, 1 for MLA (no)
Outer,
/// Page size / tokens per block (nt)
Page,
/// Number of attention heads (nh)
Head,
}
/// Block layout defined by dimension ordering.
///
/// Describes how the 4 permutable dimensions (layer, outer, page, head) are
/// ordered within a fully contiguous block. The head dimension (hd) is always
/// innermost and implicit.
///
/// The order specifies outer-to-inner dimensions, with head_dim always last.
///
/// # Examples
///
/// - `UniversalTP`: `[nh, nl, no, nt, hd]` - heads outermost for TP resharding
/// - `OperationalNHD`: `[nl, no, nt, nh, hd]` - inner is `[nt, nh, hd]`
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum KvBlockLayout {
/// Universal format: `[nh, nl, no, nt, hd]`
///
/// Heads are outermost to enable tensor parallelism (TP) resharding.
/// Cache saved from one TP configuration can be loaded into another
/// by simply slicing the head dimension differently.
UniversalTP,
/// Pipeline parallelism format: `[nl, nh, no, nt, hd]`
///
/// Layers are outermost for pipeline parallelism scenarios.
UniversalPP,
/// Operational HND format: `[nl, no, nh, nt, hd]`
///
/// Inner tensor shape is `[nh, nt, hd]` (heads, tokens, head_dim).
OperationalHND,
/// Operational NHD format: `[nl, no, nt, nh, hd]`
///
/// Inner tensor shape is `[nt, nh, hd]` (tokens, heads, head_dim).
/// This is the most common format used by vLLM and other frameworks.
OperationalNHD,
/// Custom ordering with explicit dimension list.
///
/// The array specifies dimensions from outermost to innermost,
/// with head_dim always implicitly last.
///
/// NOTE(review): the type itself does not require each dimension to
/// appear exactly once — confirm whether callers validate this.
Custom([BlockDim; 4]),
/// Unknown layout - fallback when format cannot be determined.
///
/// Operations involving Unknown layouts may fail or require explicit
/// configuration. This is the `Default` value.
Unknown,
}
impl Default for KvBlockLayout {
fn default() -> Self {
// Unknown until runtime detection determines the actual format
Self::Unknown
}
}
impl KvBlockLayout {
/// Get the dimension ordering as an array.
///
/// Returns the 4 dimensions from outermost to innermost.
/// Head dimension (hd) is implicit as the innermost dimension.
///
/// # Returns
/// `None` for `Unknown` layout, `Some([BlockDim; 4])` otherwise.
pub fn dim_order(&self) -> Option<[BlockDim; 4]> {
use BlockDim::*;
match self {
Self::UniversalTP => Some([Head, Layer, Outer, Page]),
Self::UniversalPP => Some([Layer, Head, Outer, Page]),
Self::OperationalHND => Some([Layer, Outer, Head, Page]),
Self::OperationalNHD => Some([Layer, Outer, Page, Head]),
Self::Custom(order) => Some(*order),
Self::Unknown => None,
}
}
/// Check if two layouts require transformation (not just copy).
///
/// Returns `true` if the layouts have different dimension orderings,
/// meaning a transformation kernel is needed rather than a simple copy.
///
/// For Unknown→Unknown comparisons, returns `false` (compatible) but emits
/// a warning so these cases can be tracked and fixed.
///
/// Returns `true` if one is Unknown and the other is Known (conservative).
pub fn requires_transform(&self, other: &Self) -> bool {
match (self.dim_order(), other.dim_order()) {
(Some(a), Some(b)) => a != b,
(None, None) => {
// Unknown→Unknown is compatible, but warn so we can fix these
tracing::warn!("Unknown→Unknown KvBlockLayout comparison - this should be fixed");
false
}
// Unknown→Known requires transform (conservative)
_ => true,
}
}
/// Check if this is an operational layout (NHD or HND).
///
/// Operational layouts are used for direct computation and have
/// layer/outer as the outermost dimensions.
pub fn is_operational(&self) -> bool {
matches!(self, Self::OperationalNHD | Self::OperationalHND)
}
/// Check if this is a universal layout (TP or PP).
///
/// Universal layouts are optimized for storage and transfer,
/// with different parallelism-friendly orderings.
pub fn is_universal(&self) -> bool {
matches!(self, Self::UniversalTP | Self::UniversalPP)
}
/// Get the layout name as a string identifier.
pub fn name(&self) -> &'static str {
match self {
Self::UniversalTP => "universal_tp",
Self::UniversalPP => "universal_pp",
Self::OperationalHND => "operational_hnd",
Self::OperationalNHD => "operational_nhd",
Self::Custom(_) => "custom",
Self::Unknown => "unknown",
}
}
/// Try to create a KvBlockLayout from an InnerShape.
///
/// This provides compatibility with the existing InnerShape enum.
pub(crate) fn from_inner_shape(inner_shape: super::InnerShape) -> Self {
match inner_shape {
super::InnerShape::NHD => Self::OperationalNHD,
super::InnerShape::HND => Self::OperationalHND,
super::InnerShape::Unknown => Self::Unknown,
}
}
/// Convert to InnerShape if this is an operational layout.
///
/// Returns `None` for universal or custom layouts.
pub(crate) fn to_inner_shape(self) -> Option<super::InnerShape> {
match self {
Self::OperationalNHD => Some(super::InnerShape::NHD),
Self::OperationalHND => Some(super::InnerShape::HND),
Self::Unknown => Some(super::InnerShape::Unknown),
_ => None,
}
}
}
impl std::fmt::Display for KvBlockLayout {
    /// Writes a human-readable name followed by the dimension order; a
    /// `Custom` layout prints the `Debug` form of its dimension array.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Custom(order) => return write!(f, "Custom {:?}", order),
            Self::UniversalTP => "Universal TP [nh, nl, no, nt, hd]",
            Self::UniversalPP => "Universal PP [nl, nh, no, nt, hd]",
            Self::OperationalHND => "Operational HND [nl, no, nh, nt, hd]",
            Self::OperationalNHD => "Operational NHD [nl, no, nt, nh, hd]",
            Self::Unknown => "Unknown",
        };
        f.write_str(label)
    }
}
impl std::fmt::Display for BlockDim {
    /// Writes the two-letter abbreviation of the dimension
    /// (`nl`, `no`, `nt` or `nh`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let abbreviation = match self {
            Self::Head => "nh",
            Self::Page => "nt",
            Self::Outer => "no",
            Self::Layer => "nl",
        };
        f.write_str(abbreviation)
    }
}
// ============================================================================
// KvBlocks - Collection wrapper for blocks with shared layout
// ============================================================================
use crate::BlockId;
use crate::layout::PhysicalLayout;
use std::sync::Arc;
/// A collection of blocks with a shared layout configuration and block layout type.
///
/// `KvBlocks` provides a convenient way to group blocks that should be treated
/// uniformly in transfer operations. All blocks in the collection share:
/// - The same [`PhysicalLayout`] (memory organization)
/// - The same [`KvBlockLayout`] interpretation (dimension ordering)
///
/// This enables efficient batch transfers with optional layout override.
///
/// # Example
///
/// ```ignore
/// // Create blocks with universal layout override
/// let blocks = KvBlocks::new(
/// physical_layout.clone(),
/// vec![0, 1, 2, 3], // block IDs
/// Some(KvBlockLayout::UniversalTP),
/// )?;
///
/// // Use in transfers - the override tells the transfer system
/// // to interpret these blocks as universal format
/// ```
#[derive(Debug, Clone)]
pub struct KvBlocks {
/// The physical layout containing these blocks
layout: Arc<PhysicalLayout>,
/// Block IDs within the layout (each validated against the layout's block count in `new`)
block_ids: Vec<BlockId>,
/// Optional layout override (None = use layout's native block_layout).
/// `new` stores `None` when the requested override equals the native layout.
kv_layout_override: Option<KvBlockLayout>,
}
impl KvBlocks {
    /// Create a new KvBlocks collection.
    ///
    /// # Arguments
    /// * `layout` - The physical layout containing the blocks
    /// * `block_ids` - Block IDs to include in this collection
    /// * `kv_layout_override` - Optional override for the block layout interpretation.
    ///   If `None`, uses the layout's native `block_layout()`.
    ///   If `Some`, overrides the interpretation for transfers.
    ///
    /// # Validation
    /// Checks run in this order:
    /// 1. Every block ID must be below the layout's block count.
    /// 2. For layouts that are not fully contiguous (layer-separate), only
    ///    operational layouts (NHD/HND) are valid overrides; for fully
    ///    contiguous layouts, any layout is valid.
    /// 3. An override equal to the native layout is normalized to `None`.
    ///
    /// NOTE(review): because step 2 runs before step 3, a non-operational
    /// override on a layer-separate layout is rejected even when it equals
    /// the native layout — confirm this is intended.
    pub fn new(
        layout: Arc<PhysicalLayout>,
        block_ids: Vec<BlockId>,
        kv_layout_override: Option<KvBlockLayout>,
    ) -> anyhow::Result<Self> {
        // Reject the first block ID that falls outside the layout.
        let num_blocks = layout.layout().num_blocks();
        if let Some(&bad_id) = block_ids.iter().find(|&&id| id >= num_blocks) {
            return Err(anyhow::anyhow!(
                "Block ID {} out of range (layout has {} blocks)",
                bad_id,
                num_blocks
            ));
        }

        // Layer-separate layouts can only be reinterpreted as operational formats.
        if let Some(requested) = kv_layout_override {
            let layer_separate = !layout.layout().is_fully_contiguous();
            if layer_separate && !requested.is_operational() {
                return Err(anyhow::anyhow!(
                    "Layer-separate layouts only support operational block layouts (NHD/HND), got {:?}",
                    requested
                ));
            }
        }

        // Keep the override only when it differs from the native layout.
        let kv_layout_override =
            kv_layout_override.filter(|requested| *requested != layout.layout().block_layout());

        Ok(Self {
            layout,
            block_ids,
            kv_layout_override,
        })
    }

    /// Create a KvBlocks collection without layout override.
    #[expect(dead_code)]
    pub fn from_layout(
        layout: Arc<PhysicalLayout>,
        block_ids: Vec<BlockId>,
    ) -> anyhow::Result<Self> {
        let no_override = None;
        Self::new(layout, block_ids, no_override)
    }

    /// Get the physical layout.
    #[expect(dead_code)]
    pub fn layout(&self) -> &Arc<PhysicalLayout> {
        &self.layout
    }

    /// Get the block IDs.
    #[expect(dead_code)]
    pub fn block_ids(&self) -> &[BlockId] {
        self.block_ids.as_slice()
    }

    /// Get the effective block layout: the override when one is stored,
    /// otherwise the layout's native block layout.
    pub fn effective_block_layout(&self) -> KvBlockLayout {
        match self.kv_layout_override {
            Some(overridden) => overridden,
            None => self.layout.layout().block_layout(),
        }
    }

    /// Get the layout override if set.
    #[expect(dead_code)]
    pub fn layout_override(&self) -> Option<KvBlockLayout> {
        self.kv_layout_override
    }

    /// Check if this collection has a layout override.
    #[expect(dead_code)]
    pub fn has_override(&self) -> bool {
        self.kv_layout_override.is_some()
    }

    /// Get the number of blocks in this collection.
    #[expect(dead_code)]
    pub fn len(&self) -> usize {
        self.block_ids.len()
    }

    /// Check if the collection is empty.
    #[expect(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.block_ids.is_empty()
    }

    /// Check if a transfer between two KvBlocks collections requires transformation.
    ///
    /// Compares the effective layouts of `self` (source) and `dst` with
    /// [`KvBlockLayout::requires_transform`]; `true` means a transformation
    /// kernel is needed rather than a simple copy.
    #[expect(dead_code)]
    pub fn requires_transform_to(&self, dst: &KvBlocks) -> bool {
        let src_layout = self.effective_block_layout();
        let dst_layout = dst.effective_block_layout();
        src_layout.requires_transform(&dst_layout)
    }
}
// Unit tests for `KvBlockLayout`; compiled only for test builds with the
// `testing-kvbm` feature enabled.
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
use super::*;
/// `dim_order` returns the documented ordering, and `None` for `Unknown`.
#[test]
fn test_dim_order() {
use BlockDim::*;
assert_eq!(
KvBlockLayout::UniversalTP.dim_order(),
Some([Head, Layer, Outer, Page])
);
assert_eq!(
KvBlockLayout::OperationalNHD.dim_order(),
Some([Layer, Outer, Page, Head])
);
assert_eq!(KvBlockLayout::Unknown.dim_order(), None);
}
/// Covers the known/known, known/unknown and unknown/unknown cases.
#[test]
fn test_requires_transform() {
// Same layout - no transform
assert!(!KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::OperationalNHD));
// Different layouts - transform required
assert!(KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::UniversalTP));
assert!(KvBlockLayout::OperationalHND.requires_transform(&KvBlockLayout::OperationalNHD));
// Unknown→Known requires transform (conservative)
assert!(KvBlockLayout::Unknown.requires_transform(&KvBlockLayout::OperationalNHD));
assert!(KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::Unknown));
// Unknown→Unknown is compatible (but emits warning)
assert!(!KvBlockLayout::Unknown.requires_transform(&KvBlockLayout::Unknown));
}
/// Only the NHD and HND variants are operational.
#[test]
fn test_is_operational() {
assert!(KvBlockLayout::OperationalNHD.is_operational());
assert!(KvBlockLayout::OperationalHND.is_operational());
assert!(!KvBlockLayout::UniversalTP.is_operational());
assert!(!KvBlockLayout::Unknown.is_operational());
}
/// Only the TP and PP variants are universal.
#[test]
fn test_is_universal() {
assert!(KvBlockLayout::UniversalTP.is_universal());
assert!(KvBlockLayout::UniversalPP.is_universal());
assert!(!KvBlockLayout::OperationalNHD.is_universal());
}
/// The default layout is `Unknown`.
#[test]
fn test_default() {
assert_eq!(KvBlockLayout::default(), KvBlockLayout::Unknown);
}
/// JSON round-trips preserve both unit variants and `Custom` orderings.
#[test]
fn test_serialization() {
let layout = KvBlockLayout::UniversalTP;
let json = serde_json::to_string(&layout).unwrap();
let deserialized: KvBlockLayout = serde_json::from_str(&json).unwrap();
assert_eq!(layout, deserialized);
// Test custom layout
let custom = KvBlockLayout::Custom([
BlockDim::Head,
BlockDim::Page,
BlockDim::Layer,
BlockDim::Outer,
]);
let json = serde_json::to_string(&custom).unwrap();
let deserialized: KvBlockLayout = serde_json::from_str(&json).unwrap();
assert_eq!(custom, deserialized);
}
/// Conversions to and from `InnerShape`; universal layouts have no inner shape.
#[test]
fn test_inner_shape_conversion() {
use super::super::InnerShape;
assert_eq!(
KvBlockLayout::from_inner_shape(InnerShape::NHD),
KvBlockLayout::OperationalNHD
);
assert_eq!(
KvBlockLayout::from_inner_shape(InnerShape::HND),
KvBlockLayout::OperationalHND
);
assert_eq!(
KvBlockLayout::OperationalNHD.to_inner_shape(),
Some(InnerShape::NHD)
);
assert_eq!(KvBlockLayout::UniversalTP.to_inner_shape(), None);
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Layer-separate layout implementation.
//!
//! This layout stores each layer in its own allocation, which is the typical
//! vLLM layout. Each layer can be either block-contiguous or outer-contiguous:
//! - Block-contiguous: [num_blocks, outer_dim, page_size, inner_dim]
//! - Outer-contiguous: [outer_dim, num_blocks, page_size, inner_dim]
use anyhow::{Result, anyhow};
use validator::Validate;
use super::serialize::{LayerSeparateDetails, LayoutTypeDetails};
use super::{
BlockDimension, Buffer, InnerShape, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor,
MemoryRegion,
};
/// Layer-separate layout where each layer has its own allocation.
///
/// Regions are addressed as
/// `layer_base_addrs[layer_id] + block_id * block_stride + outer_id * outer_stride`.
#[derive(Debug)]
pub struct LayerSeparateLayout {
/// Dimensions this layout was built from
config: LayoutConfig,
/// Base addresses for each layer
layer_base_addrs: Vec<usize>,
/// Which of the first two tensor dimensions is the block dimension
block_dim: BlockDimension,
/// Stride between blocks in bytes
block_stride: usize,
/// Stride between outer dimensions in bytes
outer_stride: usize,
/// Size of each memory region (page) in bytes
region_size: usize,
/// Owned memory regions backing this layout (one per layer)
memory_regions: Vec<Buffer>,
/// KV block layout for inner tensor format (must be operational: NHD or HND)
kv_block_layout: KvBlockLayout,
}
/// Builder for creating [`LayerSeparateLayout`] instances.
///
/// `config`, `memory` and `block_dim` are mandatory; the KV block layout
/// stays `KvBlockLayout::Unknown` unless `inner_shape` is called.
///
/// # Example
///
/// ```ignore
/// let layout = LayerSeparateLayout::builder()
/// .config(config)
/// .memory(memory_regions)
/// .block_dim(BlockDimension::BlockIsFirstDim)
/// .inner_shape(InnerShape::NHD)
/// .build()?;
/// ```
#[derive(Debug, Default)]
pub struct LayerSeparateLayoutBuilder {
/// Layout dimensions; required by `build()`
config: Option<LayoutConfig>,
/// Backing buffers, one per layer; required by `build()`
memory: Option<Vec<Buffer>>,
/// Position of the block dimension; required by `build()`
block_dim: Option<BlockDimension>,
/// Inner tensor format, set through `inner_shape`
kv_block_layout: KvBlockLayout,
}
impl LayerSeparateLayoutBuilder {
    /// Create a builder with nothing set: no config, no memory, no block
    /// dimension, and `KvBlockLayout::Unknown`.
    pub fn new() -> Self {
        // The derived `Default` yields exactly these values.
        Self::default()
    }

    /// Provide the layout configuration (required).
    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
        self.config.replace(config);
        self
    }

    /// Provide the memory buffers backing this layout, one per layer (required).
    pub fn memory(&mut self, memory: Vec<Buffer>) -> &mut Self {
        self.memory.replace(memory);
        self
    }

    /// Choose which tensor dimension is the block dimension (required).
    pub fn block_dim(&mut self, block_dim: BlockDimension) -> &mut Self {
        self.block_dim.replace(block_dim);
        self
    }

    /// Set the inner shape, which translates to the KV block layout.
    ///
    /// Only operational layouts (NHD, HND) are valid for layer-separate layouts.
    ///
    /// - `InnerShape::NHD` -> `KvBlockLayout::OperationalNHD`
    /// - `InnerShape::HND` -> `KvBlockLayout::OperationalHND`
    /// - `InnerShape::Unknown` -> `KvBlockLayout::Unknown`
    ///
    /// Default: `KvBlockLayout::Unknown`
    pub fn inner_shape(&mut self, shape: InnerShape) -> &mut Self {
        let translated = KvBlockLayout::from_inner_shape(shape);
        self.kv_block_layout = translated;
        self
    }

    /// Build the [`LayerSeparateLayout`] from clones of the stored values.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `config` is not set
    /// - `memory` is not set
    /// - `block_dim` is not set
    /// - The memory region count doesn't match `num_layers`
    /// - Any memory region is too small for the layout
    /// - The config validation fails
    pub fn build(&self) -> Result<LayerSeparateLayout> {
        let Some(config) = self.config.clone() else {
            return Err(anyhow!("config is required"));
        };
        let Some(memory) = self.memory.clone() else {
            return Err(anyhow!("memory is required"));
        };
        let Some(block_dim) = self.block_dim else {
            return Err(anyhow!("block_dim is required"));
        };
        LayerSeparateLayout::new_internal(config, memory, block_dim, self.kv_block_layout)
    }
}
impl LayerSeparateLayout {
/// Start building a `LayerSeparateLayout`; the returned builder has nothing set.
pub fn builder() -> LayerSeparateLayoutBuilder {
    let fresh = LayerSeparateLayoutBuilder::new();
    fresh
}
/// Construct a layer-separate layout whose KV block layout is left as
/// `KvBlockLayout::Unknown`.
///
/// # Arguments
/// - `config` - Layout configuration
/// - `memory` - Vector of owned memory regions (one per layer)
/// - `block_dim` - Whether block or outer dimension is first
///
/// # Returns
/// A new LayerSeparateLayout instance, or the error reported by the
/// internal constructor.
pub(crate) fn new(
    config: LayoutConfig,
    memory: Vec<Buffer>,
    block_dim: BlockDimension,
) -> Result<Self> {
    let default_kv_layout = KvBlockLayout::Unknown;
    Self::new_internal(config, memory, block_dim, default_kv_layout)
}
/// Internal constructor with all parameters.
///
/// Validates `config`, requires exactly one buffer per layer, derives the
/// block/outer strides from `block_dim` and checks that every buffer can
/// hold `num_blocks * outer_dim` regions.
///
/// # Errors
/// - The config validation fails
/// - `memory.len()` differs from `config.num_layers`
/// - The per-layer size does not fit in `usize`
/// - Any buffer is smaller than the per-layer size
fn new_internal(
    config: LayoutConfig,
    memory: Vec<Buffer>,
    block_dim: BlockDimension,
    kv_block_layout: KvBlockLayout,
) -> Result<Self> {
    config.validate()?;
    if memory.len() != config.num_layers {
        return Err(anyhow!(
            "Memory region count ({}) must match num_layers ({})",
            memory.len(),
            config.num_layers
        ));
    }

    // Sizes are computed with checked arithmetic: a wrapped product could
    // slip past the buffer-size comparison below and yield addresses
    // outside the buffers.
    let overflow = |what: &str| {
        anyhow!(
            "Layout dimensions overflow usize while computing {}",
            what
        )
    };
    let region_size = config
        .page_size
        .checked_mul(config.inner_dim)
        .and_then(|elems| elems.checked_mul(config.dtype_width_bytes))
        .ok_or_else(|| overflow("region size"))?;
    let required_size = config
        .num_blocks
        .checked_mul(config.outer_dim)
        .and_then(|regions| regions.checked_mul(region_size))
        .ok_or_else(|| overflow("layer size"))?;

    // Both strides are factors of `required_size` (validation guarantees
    // non-zero dimensions), so these products cannot overflow.
    let (block_stride, outer_stride) = match block_dim {
        // Layout: [outer_dim, num_blocks, page_size, inner_dim]
        BlockDimension::BlockIsSecondDim => (region_size, region_size * config.num_blocks),
        // Layout: [num_blocks, outer_dim, page_size, inner_dim]
        BlockDimension::BlockIsFirstDim => (region_size * config.outer_dim, region_size),
    };

    // Extract base addresses and validate sizes
    let mut layer_base_addrs = Vec::with_capacity(config.num_layers);
    for (i, mem) in memory.iter().enumerate() {
        if mem.size() < required_size {
            return Err(anyhow!(
                "Memory region {} too small for layout. Required: {} bytes, got: {} bytes",
                i,
                required_size,
                mem.size()
            ));
        }
        layer_base_addrs.push(mem.addr());
    }

    Ok(Self {
        config,
        layer_base_addrs,
        block_dim,
        block_stride,
        outer_stride,
        region_size,
        memory_regions: memory,
        kv_block_layout,
    })
}
/// Calculate the address of a specific memory region.
fn calculate_address(
&self,
block_id: usize,
layer_id: usize,
outer_id: usize,
) -> Result<usize> {
if block_id >= self.config.num_blocks {
return Err(anyhow!(
"Block ID {} out of range (max: {})",
block_id,
self.config.num_blocks
));
}
if layer_id >= self.config.num_layers {
return Err(anyhow!(
"Layer ID {} out of range (max: {})",
layer_id,
self.config.num_layers
));
}
if outer_id >= self.config.outer_dim {
return Err(anyhow!(
"Outer ID {} out of range (max: {})",
outer_id,
self.config.outer_dim
));
}
let base_addr = self.layer_base_addrs[layer_id];
let offset = block_id * self.block_stride + outer_id * self.outer_stride;
Ok(base_addr + offset)
}
#[expect(dead_code)]
pub fn block_dim(&self) -> BlockDimension {
self.block_dim
}
/// Get mutable reference to the memory regions for NIXL registration.
#[expect(dead_code)]
pub fn memory_regions_mut(&mut self) -> &mut [Buffer] {
&mut self.memory_regions
}
/// Get the KV block layout.
#[expect(dead_code)]
pub fn kv_block_layout(&self) -> KvBlockLayout {
self.kv_block_layout
}
/// Set the KV block layout from an inner shape.
///
/// Note: Only operational layouts (NHD, HND) are valid for layer-separate layouts.
#[expect(dead_code)]
pub fn set_kv_block_layout(&mut self, inner_shape: InnerShape) {
self.kv_block_layout = KvBlockLayout::from_inner_shape(inner_shape);
}
}
impl Layout for LayerSeparateLayout {
    /// Configuration this layout was constructed with.
    fn config(&self) -> &LayoutConfig {
        &self.config
    }

    /// Buffers backing this layout, in layer order.
    fn memory_regions(&self) -> &[Buffer] {
        self.memory_regions.as_slice()
    }

    /// Region for one `(block_id, layer_id, outer_id)` triple.
    ///
    /// The region always spans `region_size` bytes; an error is returned when
    /// any of the three indices is out of range.
    fn memory_region(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<MemoryRegion> {
        self.calculate_address(block_id, layer_id, outer_id)
            .map(|addr| MemoryRegion::new(addr, self.region_size))
    }

    /// One allocation size per layer, every entry being
    /// `num_blocks * outer_dim * region_size` bytes.
    fn required_allocations(&self) -> Vec<usize> {
        let bytes_per_layer = self.config.num_blocks * self.config.outer_dim * self.region_size;
        std::iter::repeat(bytes_per_layer)
            .take(self.config.num_layers)
            .collect()
    }

    /// Always `false`: each layer lives in its own buffer.
    fn is_fully_contiguous(&self) -> bool {
        false
    }

    /// Number of blocks, as configured.
    fn num_blocks(&self) -> usize {
        self.config.num_blocks
    }

    /// Number of layers, as configured.
    fn num_layers(&self) -> usize {
        self.config.num_layers
    }

    /// Outer dimension size, as configured.
    fn outer_dim(&self) -> usize {
        self.config.outer_dim
    }

    /// Page size, as configured.
    fn page_size(&self) -> usize {
        self.config.page_size
    }

    /// Inner dimension size, as configured.
    fn inner_dim(&self) -> usize {
        self.config.inner_dim
    }

    /// Data type width in bytes, as configured.
    fn dtype_width_bytes(&self) -> usize {
        self.config.dtype_width_bytes
    }

    /// Layer-separate serialization details: the block dimension ordering and
    /// the KV block layout.
    fn serialization_details(&self) -> LayoutTypeDetails {
        let details = LayerSeparateDetails {
            block_dim: self.block_dim,
            kv_block_layout: self.kv_block_layout,
        };
        LayoutTypeDetails::LayerSeparate(details)
    }

    /// KV block layout currently stored on this layout.
    fn block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::super::tests::*;
    use super::*;

    /// Block-first ordering: construction succeeds and one allocation per
    /// layer is reported.
    #[test]
    fn test_layer_separate_block_contiguous() {
        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
        let bytes_per_layer = 10 * 2 * 16 * 128 * 2;
        let buffers: Vec<Buffer> = (0..4)
            .map(|layer| {
                Buffer::from_arc(MockMemory::new(
                    0x1000 + layer * bytes_per_layer,
                    bytes_per_layer,
                ))
            })
            .collect();

        let cfg = LayoutConfig::builder()
            .num_blocks(10)
            .num_layers(4)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        let layout =
            LayerSeparateLayout::new(cfg, buffers, BlockDimension::BlockIsFirstDim).unwrap();

        assert_eq!(layout.num_blocks(), 10);
        assert!(!layout.is_fully_contiguous());
        assert_eq!(layout.required_allocations().len(), 4);
    }

    /// Outer-first ordering: construction succeeds with the same buffers.
    #[test]
    fn test_layer_separate_outer_contiguous() {
        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
        let bytes_per_layer = 10 * 2 * 16 * 128 * 2;
        let buffers: Vec<Buffer> = (0..4)
            .map(|layer| {
                Buffer::from_arc(MockMemory::new(
                    0x1000 + layer * bytes_per_layer,
                    bytes_per_layer,
                ))
            })
            .collect();

        let cfg = LayoutConfig::builder()
            .num_blocks(10)
            .num_layers(4)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        let layout =
            LayerSeparateLayout::new(cfg, buffers, BlockDimension::BlockIsSecondDim).unwrap();

        assert_eq!(layout.num_blocks(), 10);
        assert!(!layout.is_fully_contiguous());
    }

    /// Region lookup: layer selects the buffer, outer index offsets within it.
    #[test]
    fn test_memory_region() {
        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
        let bytes_per_layer = 2 * 2 * 16 * 128 * 2;
        // page_size * inner_dim * dtype_width_bytes
        let bytes_per_region = 16 * 128 * 2;

        let buffers: Vec<Buffer> = (0..2)
            .map(|layer| {
                Buffer::from_arc(MockMemory::new(
                    0x1000 + layer * bytes_per_layer,
                    bytes_per_layer,
                ))
            })
            .collect();

        let cfg = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        let layout =
            LayerSeparateLayout::new(cfg, buffers, BlockDimension::BlockIsFirstDim).unwrap();

        // (layer_id, outer_id, expected address), all for block 0.
        let expectations = [
            // Layer 0, outer 0: layer 0's base address.
            (0, 0, 0x1000),
            // Layer 1, outer 0: layer 1's base address.
            (1, 0, 0x1000 + bytes_per_layer),
            // Layer 0, outer 1: one region past layer 0's base address.
            (0, 1, 0x1000 + bytes_per_region),
        ];
        for (layer_id, outer_id, expected_addr) in expectations {
            let region = layout.memory_region(0, layer_id, outer_id).unwrap();
            assert_eq!(region.addr, expected_addr);
            assert_eq!(region.size, bytes_per_region);
        }
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Decoupled layout system for block management.
//!
//! This module provides a simplified layout abstraction that:
//! - Maps block IDs to physical memory regions (address + size)
//! - Decouples memory regions from storage type information
//! - Specifies allocation requirements without performing allocation
//! - Uses trait objects for memory ownership
pub(crate) mod builder;
mod config;
mod fully_contiguous;
mod kv_block_layout;
mod layer_separate;
mod physical;
mod serialize;
mod validation;
#[cfg(all(test, feature = "testing-kvbm"))]
pub(super) mod tests;
// #[cfg(test)]
// mod integration_tests;
pub use builder::PhysicalLayoutBuilder;
pub use config::{BlockDimension, LayoutConfig};
pub(crate) use fully_contiguous::FullyContiguousLayout;
pub use kv_block_layout::{BlockDim, KvBlockLayout};
pub(crate) use layer_separate::LayerSeparateLayout;
pub use physical::NixlMetadata;
pub use physical::PhysicalLayout;
pub(crate) use serialize::LayoutDescriptor;
pub use serialize::{BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutTypeDetails};
// mod registration;
// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager};
use anyhow::Result;
use serde::{Deserialize, Serialize};
pub(crate) use dynamo_memory::MemoryDescriptor;
pub use dynamo_memory::{Buffer, MemoryRegion};
/// Core layout trait for mapping block IDs to memory regions.
///
/// Layouts specify how KV cache blocks are organized in memory without
/// performing allocation themselves. They provide:
/// - Memory region lookup for specific blocks
/// - Allocation requirements for external allocators
/// - Metadata about block organization
///
/// Implementors must be `Send + Sync` and implement `Debug`.
pub trait Layout: Send + Sync + std::fmt::Debug {
/// Get the configuration for this layout.
fn config(&self) -> &LayoutConfig;
/// Get the root memory regions backing this layout.
///
/// These regions correspond to the concrete allocations that store the layout's data.
/// Implementations that derive memory procedurally can return an empty slice.
fn memory_regions(&self) -> &[Buffer];
/// Get the memory region for a specific `(block_id, layer_id, outer_id)` triple.
///
/// Returns a [`MemoryRegion`] for the contiguous region specified by the given block_id,
/// layer_id, outer_id.
///
/// # Arguments
/// * `block_id` - The ID of the block to query (0..num_blocks)
/// * `layer_id` - The ID of the layer to query (0..num_layers)
/// * `outer_id` - The ID of the outer dimension to query (0..outer_dim)
///
/// # Errors
/// Implementations are expected to return an error when an ID is outside the
/// range listed above; `LayerSeparateLayout` does so for each of the three IDs.
fn memory_region(
&self,
block_id: usize,
layer_id: usize,
outer_id: usize,
) -> Result<MemoryRegion>;
/// Get the allocation requirements for this layout.
///
/// Returns a vector of allocation sizes needed to back this layout.
/// For fully contiguous layouts, this will be a single size.
/// For layer-separate layouts, this will contain one size per layer.
///
/// # Returns
/// Vector of allocation sizes in bytes.
fn required_allocations(&self) -> Vec<usize>;
/// Check if this layout uses fully contiguous memory.
///
/// Fully contiguous layouts have all blocks in a single allocation,
/// which enables certain optimizations.
fn is_fully_contiguous(&self) -> bool;
/// Get the total number of blocks in this layout.
fn num_blocks(&self) -> usize;
/// Get the number of layers per block.
fn num_layers(&self) -> usize;
/// Get the outer dimension size.
///
/// In typical KV cache layouts, this is often 2 (for K and V),
/// but can be 1 for architectures like MLA.
fn outer_dim(&self) -> usize;
/// Get the page size (often corresponds to block size in tokens).
fn page_size(&self) -> usize;
/// Get the inner dimension size.
///
/// This is typically the hidden size divided by tensor parallel size.
fn inner_dim(&self) -> usize;
/// Get the data type width in bytes.
fn dtype_width_bytes(&self) -> usize;
/// Get serialization details for this layout type.
///
/// This provides the layout-type-specific information needed to serialize
/// and reconstruct the layout on a remote node.
fn serialization_details(&self) -> serialize::LayoutTypeDetails;
/// Get the KV block layout describing how dimensions are permuted within blocks.
///
/// Returns the internal tensor ordering for blocks in this layout.
/// For layer-separate layouts, this describes the inner tensor format.
/// For fully contiguous layouts, this describes the full block format.
fn block_layout(&self) -> KvBlockLayout;
}
/// Inner shape format for tensor layout.
///
/// Converted into a [`KvBlockLayout`] through `KvBlockLayout::from_inner_shape`.
// The variants are named after their dimension-order acronyms (NHD, HND),
// so the upper-case-acronym lint is silenced on purpose.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) enum InnerShape {
/// Unknown shape - fallback when we can't determine the format
Unknown,
/// NHD format: [block_size, num_heads, head_dim]
/// Common for attention layers where N=tokens, H=heads, D=dimension
NHD,
/// HND format: [num_heads, block_size, head_dim]
/// Alternative layout with heads first
HND,
}
/// Trait for layouts that provide contiguous per-block memory regions.
///
/// This trait enables direct access to entire blocks as contiguous memory,
/// without requiring layer/outer indexing. It is implemented by
/// [`FullyContiguousLayout`] but NOT by [`LayerSeparateLayout`] (which
/// stores each layer separately).
///
/// Use this trait when you need to:
/// - Access raw block memory for transformation kernels
/// - Reinterpret block memory under different [`KvBlockLayout`] formats
/// - Perform whole-block operations without layer decomposition
///
/// Note: `num_blocks` and `block_layout` have the same names as methods on
/// [`Layout`]. When both traits are in scope for a type implementing both,
/// calls to these methods must be disambiguated (for example
/// `ContiguousBlockLayout::num_blocks(&layout)`).
pub trait ContiguousBlockLayout: Send + Sync + std::fmt::Debug {
/// Get the total number of blocks in this layout.
fn num_blocks(&self) -> usize;
/// Get the size of each block in bytes.
fn bytes_per_block(&self) -> usize;
/// Get the contiguous memory region for a specific block.
///
/// # Arguments
/// * `block_id` - The ID of the block to query (0..num_blocks)
///
/// # Returns
/// A [`MemoryRegion`] covering the entire block's memory.
///
/// # Errors
/// Returns an error if `block_id` is out of range.
fn raw_block(&self, block_id: usize) -> Result<MemoryRegion>;
/// Get the KV block layout for this contiguous layout.
fn block_layout(&self) -> KvBlockLayout;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment