"lib/vscode:/vscode.git/clone" did not exist on "9254e3d4d20a2b158faef4a8dede73ca941eff90"
Unverified Commit 9ab148dc authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: kvbm-physical (#6490)


Signed-off-by: default avatarRyan Olson <rolson@nvidia.com>
parent 7546c193
...@@ -154,6 +154,8 @@ jobs: ...@@ -154,6 +154,8 @@ jobs:
cargo fmt -- --check && \ cargo fmt -- --check && \
cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \ cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \ cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
cargo clippy -p kvbm-physical --no-deps --all-targets -- -D warnings && \
cargo test --locked -p kvbm-physical --features testing-kvbm -- --nocapture --test-threads=4 && \
/workspace/container/use-sccache.sh show-stats "Rust Checks"' /workspace/container/use-sccache.sh show-stats "Rust Checks"'
test-parallel: test-parallel:
......
...@@ -3877,6 +3877,14 @@ dependencies = [ ...@@ -3877,6 +3877,14 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "kvbm-common"
version = "1.0.0"
dependencies = [
"dynamo-tokens",
"serde",
]
[[package]] [[package]]
name = "kvbm-kernels" name = "kvbm-kernels"
version = "1.0.0" version = "1.0.0"
...@@ -3894,7 +3902,6 @@ version = "1.0.0" ...@@ -3894,7 +3902,6 @@ version = "1.0.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-stream", "async-stream",
"bincode 2.0.1",
"bytes", "bytes",
"derive_builder", "derive_builder",
"dynamo-tokens", "dynamo-tokens",
...@@ -3914,6 +3921,33 @@ dependencies = [ ...@@ -3914,6 +3921,33 @@ dependencies = [
"xxhash-rust", "xxhash-rust",
] ]
[[package]]
name = "kvbm-physical"
version = "1.0.0"
dependencies = [
"aligned-vec",
"anyhow",
"bincode 2.0.1",
"blake3",
"cudarc",
"derive-getters",
"derive_builder",
"dynamo-memory",
"futures",
"kvbm-common",
"kvbm-kernels",
"parking_lot",
"rstest 0.26.1",
"serde",
"serde_json",
"thiserror 2.0.18",
"tokio",
"tracing",
"uuid",
"validator",
"velo-events",
]
[[package]] [[package]]
name = "lalrpop-util" name = "lalrpop-util"
version = "0.20.2" version = "0.20.2"
...@@ -3999,12 +4033,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" ...@@ -3999,12 +4033,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]] [[package]]
name = "libredox" name = "libredox"
version = "0.1.12" version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [ dependencies = [
"bitflags 2.11.0", "bitflags 2.11.0",
"libc", "libc",
"plain",
"redox_syscall 0.7.3", "redox_syscall 0.7.3",
] ]
...@@ -5474,6 +5509,12 @@ version = "0.3.32" ...@@ -5474,6 +5509,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]] [[package]]
name = "plotters" name = "plotters"
version = "0.3.7" version = "0.3.7"
......
...@@ -10,8 +10,10 @@ members = [ ...@@ -10,8 +10,10 @@ members = [
"lib/mocker", "lib/mocker",
"lib/kv-router", "lib/kv-router",
"lib/memory", "lib/memory",
"lib/kvbm-common",
"lib/kvbm-kernels", "lib/kvbm-kernels",
"lib/kvbm-logical", "lib/kvbm-logical",
"lib/kvbm-physical",
"lib/async-openai", "lib/async-openai",
"lib/parsers", "lib/parsers",
"lib/bench", "lib/bench",
...@@ -45,11 +47,13 @@ dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features = ...@@ -45,11 +47,13 @@ dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features =
dynamo-parsers = { path = "lib/parsers", version = "1.0.0" } dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }
# kvbm # kvbm
kvbm-common = { path = "lib/kvbm-common", version = "1.0.0" }
kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" } kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" }
kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" } kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" }
kvbm-physical = { path = "lib/kvbm-physical", version = "1.0.0" }
# velo # velo
velo-events = { path = "lib/velo-events", version = "0.9.0" } velo-events = { path = "lib/velo-events", version = "1.0.0" }
# External dependencies # External dependencies
anyhow = { version = "1" } anyhow = { version = "1" }
...@@ -66,7 +70,7 @@ chrono = { version = "0.4", default-features = false, features = [ ...@@ -66,7 +70,7 @@ chrono = { version = "0.4", default-features = false, features = [
"now", "now",
"serde", "serde",
] } ] }
cudarc = { version = "0.19.2", features = ["cuda-12020"] } cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"] }
dashmap = { version = "6.1" } dashmap = { version = "6.1" }
derive_builder = { version = "0.20" } derive_builder = { version = "0.20" }
derive-getters = { version = "0.5" } derive-getters = { version = "0.5" }
......
...@@ -3015,9 +3015,9 @@ dependencies = [ ...@@ -3015,9 +3015,9 @@ dependencies = [
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.90" version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6" checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
dependencies = [ dependencies = [
"once_cell", "once_cell",
"wasm-bindgen", "wasm-bindgen",
...@@ -3329,12 +3329,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" ...@@ -3329,12 +3329,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]] [[package]]
name = "libredox" name = "libredox"
version = "0.1.12" version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [ dependencies = [
"bitflags 2.11.0", "bitflags 2.11.0",
"libc", "libc",
"plain",
"redox_syscall 0.7.3", "redox_syscall 0.7.3",
] ]
...@@ -4701,6 +4702,12 @@ version = "0.3.32" ...@@ -4701,6 +4702,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]] [[package]]
name = "png" name = "png"
version = "0.18.1" version = "0.18.1"
...@@ -7422,9 +7429,9 @@ dependencies = [ ...@@ -7422,9 +7429,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen" name = "wasm-bindgen"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2" checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if 1.0.4",
"once_cell", "once_cell",
...@@ -7435,9 +7442,9 @@ dependencies = [ ...@@ -7435,9 +7442,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-futures" name = "wasm-bindgen-futures"
version = "0.4.63" version = "0.4.64"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a" checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if 1.0.4",
"futures-util", "futures-util",
...@@ -7449,9 +7456,9 @@ dependencies = [ ...@@ -7449,9 +7456,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro" name = "wasm-bindgen-macro"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950" checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
dependencies = [ dependencies = [
"quote", "quote",
"wasm-bindgen-macro-support", "wasm-bindgen-macro-support",
...@@ -7459,9 +7466,9 @@ dependencies = [ ...@@ -7459,9 +7466,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro-support" name = "wasm-bindgen-macro-support"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60" checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
dependencies = [ dependencies = [
"bumpalo", "bumpalo",
"proc-macro2", "proc-macro2",
...@@ -7472,9 +7479,9 @@ dependencies = [ ...@@ -7472,9 +7479,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-shared" name = "wasm-bindgen-shared"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5" checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
...@@ -7528,9 +7535,9 @@ dependencies = [ ...@@ -7528,9 +7535,9 @@ dependencies = [
[[package]] [[package]]
name = "web-sys" name = "web-sys"
version = "0.3.90" version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97" checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
dependencies = [ dependencies = [
"js-sys", "js-sys",
"wasm-bindgen", "wasm-bindgen",
......
...@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features = ...@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features =
] } ] }
dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true } dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true }
cudarc = { version = "0.19.2", features = ["cuda-12020"], optional = true } cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"], optional = true }
[dev-dependencies] [dev-dependencies]
...@@ -3075,9 +3075,9 @@ dependencies = [ ...@@ -3075,9 +3075,9 @@ dependencies = [
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.90" version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6" checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
dependencies = [ dependencies = [
"once_cell", "once_cell",
"wasm-bindgen", "wasm-bindgen",
...@@ -3368,12 +3368,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" ...@@ -3368,12 +3368,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
[[package]] [[package]]
name = "libredox" name = "libredox"
version = "0.1.12" version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
dependencies = [ dependencies = [
"bitflags 2.11.0", "bitflags 2.11.0",
"libc", "libc",
"plain",
"redox_syscall 0.7.3", "redox_syscall 0.7.3",
] ]
...@@ -4749,6 +4750,12 @@ version = "0.3.32" ...@@ -4749,6 +4750,12 @@ version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
[[package]]
name = "plain"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
[[package]] [[package]]
name = "png" name = "png"
version = "0.18.1" version = "0.18.1"
...@@ -7497,9 +7504,9 @@ dependencies = [ ...@@ -7497,9 +7504,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen" name = "wasm-bindgen"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2" checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if 1.0.4",
"once_cell", "once_cell",
...@@ -7510,9 +7517,9 @@ dependencies = [ ...@@ -7510,9 +7517,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-futures" name = "wasm-bindgen-futures"
version = "0.4.63" version = "0.4.64"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a" checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
dependencies = [ dependencies = [
"cfg-if 1.0.4", "cfg-if 1.0.4",
"futures-util", "futures-util",
...@@ -7524,9 +7531,9 @@ dependencies = [ ...@@ -7524,9 +7531,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro" name = "wasm-bindgen-macro"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950" checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
dependencies = [ dependencies = [
"quote", "quote",
"wasm-bindgen-macro-support", "wasm-bindgen-macro-support",
...@@ -7534,9 +7541,9 @@ dependencies = [ ...@@ -7534,9 +7541,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-macro-support" name = "wasm-bindgen-macro-support"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60" checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
dependencies = [ dependencies = [
"bumpalo", "bumpalo",
"proc-macro2", "proc-macro2",
...@@ -7547,9 +7554,9 @@ dependencies = [ ...@@ -7547,9 +7554,9 @@ dependencies = [
[[package]] [[package]]
name = "wasm-bindgen-shared" name = "wasm-bindgen-shared"
version = "0.2.113" version = "0.2.114"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5" checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
dependencies = [ dependencies = [
"unicode-ident", "unicode-ident",
] ]
...@@ -7603,9 +7610,9 @@ dependencies = [ ...@@ -7603,9 +7610,9 @@ dependencies = [
[[package]] [[package]]
name = "web-sys" name = "web-sys"
version = "0.3.90" version = "0.3.91"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97" checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
dependencies = [ dependencies = [
"js-sys", "js-sys",
"wasm-bindgen", "wasm-bindgen",
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "kvbm-common"
version.workspace = true
edition.workspace = true
description.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
keywords.workspace = true
[dependencies]
dynamo-tokens = { workspace = true }
serde = { workspace = true }
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use serde::{Deserialize, Serialize};
/// Block identifier type; an alias for `usize`.
pub type BlockId = usize;
/// Sequence hash type; an alias for `dynamo_tokens::PositionalLineageHash`.
pub type SequenceHash = dynamo_tokens::PositionalLineageHash;
/// Re-export of the `dynamo_tokens` crate under the shorter name `tokens`.
pub use dynamo_tokens as tokens;
/// Logical layout handle type encoding the layout ID.
///
/// KVBM manages G1, G2 and G3 layouts directly. G4 is managed by an external service.
///
/// This is a field-less enum: it is `Copy`, comparable for equality, hashable
/// (so it can serve as a map key), and serializable through `serde`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum LogicalLayoutHandle {
/// Representation of GPU / Device Memory.
///
/// G1 is fixed-size and managed by either the framework or the local instance of KVBM.
G1,
/// Representation of CPU / Host Memory.
///
/// G2 is fixed-size and managed by the local instance of KVBM.
G2,
/// Representation of Disk Storage (Local or AttachedStorage).
///
/// G3 is fixed-size and managed by the local instance of KVBM.
G3,
/// Representation of Blocks held in an external service
/// outside the control of the KVBM system.
G4,
}
...@@ -20,10 +20,9 @@ use cudarc::runtime::sys as cuda_runtime; ...@@ -20,10 +20,9 @@ use cudarc::runtime::sys as cuda_runtime;
use kvbm_kernels::{MemcpyBatchMode, is_memcpy_batch_available, is_using_stubs, memcpy_batch}; use kvbm_kernels::{MemcpyBatchMode, is_memcpy_batch_available, is_using_stubs, memcpy_batch};
// Direct FFI for cudaMallocHost / cudaFreeHost. // Direct FFI for cudaMallocHost / cudaFreeHost.
// We bypass cudarc's runtime::sys because cudarc eagerly resolves ALL runtime // We link against libcudart directly (through kvbm-kernels' build.rs),
// symbols on first use, and CUDA 13.x removed `cudaGetDeviceProperties_v2` // so these symbols are always available without going through cudarc's
// which causes a panic. Our test binary links against libcudart directly // dynamic loader.
// (through kvbm-kernels' build.rs), so these symbols are always available.
unsafe extern "C" { unsafe extern "C" {
fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> u32; fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> u32;
fn cudaFreeHost(ptr: *mut c_void) -> u32; fn cudaFreeHost(ptr: *mut c_void) -> u32;
......
...@@ -15,7 +15,6 @@ dynamo-tokens = { workspace = true } ...@@ -15,7 +15,6 @@ dynamo-tokens = { workspace = true }
anyhow = { workspace = true } anyhow = { workspace = true }
async-stream = "0.3" async-stream = "0.3"
bytes = "1.10" bytes = "1.10"
bincode = { version = "2.0.1", features = ["serde", "derive"] }
derive_builder = { workspace = true } derive_builder = { workspace = true }
futures = { workspace = true } futures = { workspace = true }
lru = "0.16" lru = "0.16"
......
...@@ -15,9 +15,6 @@ pub mod tinylfu; ...@@ -15,9 +15,6 @@ pub mod tinylfu;
#[cfg(any(test, feature = "testing"))] #[cfg(any(test, feature = "testing"))]
pub mod testing; pub mod testing;
use bincode::{Decode, Encode};
use serde::{Deserialize, Serialize};
// Re-export common types and traits // Re-export common types and traits
pub use blocks::{ pub use blocks::{
BlockError, BlockMetadata, CompleteBlock, ImmutableBlock, MutableBlock, WeakBlock, BlockError, BlockMetadata, CompleteBlock, ImmutableBlock, MutableBlock, WeakBlock,
...@@ -37,19 +34,3 @@ impl KvbmSequenceHashProvider for dynamo_tokens::TokenBlock { ...@@ -37,19 +34,3 @@ impl KvbmSequenceHashProvider for dynamo_tokens::TokenBlock {
self.positional_lineage_hash() self.positional_lineage_hash()
} }
} }
/// Logical layout handle type encoding the layout ID.
///
/// KVBM manages G1, G2 and G3 layouts directly. G4 is managed by an external service.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode, Serialize, Deserialize)]
pub enum LogicalLayoutHandle {
/// Representation of GPU / Device Memory
G1,
/// Representation of CPU / Host Memory
G2,
/// Representation of Disk Storage
G3,
/// Representation of Blocks held in an external service
/// outside the control of the KVBM system.
G4,
}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "kvbm-physical"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
[dependencies]
dynamo-memory = { workspace = true }
kvbm-common = { workspace = true }
kvbm-kernels = { workspace = true }
velo-events = { workspace = true }
aligned-vec = "0.6.4"
anyhow = { workspace = true }
bincode = { version = "2.0.0", features = ["serde", "derive"] }
blake3 = { version = "1" }
cudarc = { workspace = true }
derive_builder = { workspace = true }
futures = { workspace = true }
derive-getters = { version = "0.5" }
parking_lot = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
validator = { workspace = true }
[features]
default = []
collectives = []
testing-kvbm = []
testing-nixl-gds = []
[dev-dependencies]
dynamo-memory = { workspace = true, features = ["unsafe-slices"] }
rstest = "0.26"
# kvbm-physical
Physical layout and transfer management for KV cache block storage.
`kvbm-physical` provides the low-level building blocks for mapping KV cache blocks to memory, registering them for RDMA transfers via NIXL, and executing transfers between heterogeneous storage tiers (GPU, host, disk, remote).
## Modules
### `layout` — Block-to-memory mapping
Abstractions for how KV cache blocks are organized in memory.
- **`Layout` trait** — Core abstraction mapping `(block_id, layer_id, outer_id)` to a `MemoryRegion`. Implementations include fully contiguous (single allocation) and layer-separate (one allocation per layer) variants.
- **`KvBlockLayout`** — Describes dimension ordering within a block. Five named formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`, `Custom`) plus `Unknown`. Provides `requires_transform()`, `is_operational()`, and `is_universal()` for kernel selection.
- **`PhysicalLayout`** — Wraps a `Layout` with its physical storage location (`StorageKind`) and NIXL registration metadata (`NixlMetadata`). Constructed via a type-state builder: Config &rarr; Layout type &rarr; Memory allocation &rarr; `build()`.
- **`LayoutConfig`** — Block dimensions: `num_blocks`, `num_layers`, `outer_dim`, `page_size`, `inner_dim`, `dtype_width_bytes`, optional `num_heads`.
- **`KvBlocks`** — Groups block IDs with a shared `PhysicalLayout` and optional `KvBlockLayout` override for cross-format transfers.
### `manager` — Layout registration and transfer orchestration
- **`TransferManager`** — Primary API. Registers layouts, exports/imports RDMA metadata between workers, and executes transfers by handle.
- **`LayoutHandle`** — Compact `u128` encoding `(worker_id, layout_id)`. Identifies a registered layout within a specific worker; not symmetric across workers.
- **`LogicalLayoutDescriptor`** — Bridges a `LayoutHandle` to a `LogicalLayoutHandle` (G1/G2/G3/G4 tier). Enables callers to say "copy from G1 to G2" while `TransferManager` resolves worker-specific physical handles.
- **`SerializedLayout`** — Wire format for RDMA metadata exchange. Packs worker address, NIXL metadata, and layout descriptors into a bincode blob.
- **`WorkerAddress`** — `(worker_id, nixl_agent_name)` pair identifying a worker on the network.
### `transfer` — Transfer configuration and execution
- **`TransferConfig` / builder** — Configures event system, NIXL backends, CUDA device, capabilities, and memory pool before building a `TransferManager`.
- **`TransferOptions`** — Per-transfer configuration: `layer_range`, `nixl_write_notification`, `bounce_buffer`, caller-provided `cuda_stream`, and src/dst `kv_layout` overrides.
- **`TransferPreferences`** — Strategy hints via `NativeVsNixlPolicy` (PreferNative / PreferNixl / Automatic).
- **`TransferCompleteNotification`** — `Either<Ready, EventAwaiter>` implementing `IntoFuture`. Zero-cost for synchronous completions. `aggregate()` composes multiple notifications. `could_yield()` checks if awaiting will suspend.
- **`BounceBuffer`** — Staging area for two-hop transfers (e.g., Device &rarr; Host &rarr; Remote).
- **Checksum utilities** — BLAKE3 block/layer checksums for transfer verification.
- **Fill utilities** — Constant/sequential patterns for testing and initialization.
## Quick Start
```rust,ignore
use kvbm_physical::{TransferManager, TransferOptions};
use kvbm_physical::layout::{LayoutConfig, PhysicalLayout};
// 1. Build the TransferManager (creates NIXL agent, CUDA streams, event system)
let manager = TransferManager::builder()
.nixl_backend("ucx")
.cuda_device_id(0)
.build()?;
// 2. Configure a layout
let config = LayoutConfig::builder()
.num_blocks(64)
.num_layers(32)
.outer_dim(2)
.page_size(16)
.inner_dim(128)
.dtype_width_bytes(2)
.build()?;
// 3. Build a physical layout (type-state builder: config -> layout type -> memory -> build)
let gpu_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
.with_config(config.clone())
.fully_contiguous()
.allocate_device(0)
.build()?;
let host_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
.with_config(config)
.fully_contiguous()
.allocate_pinned(Some(0))
.build()?;
// 4. Register layouts to get handles
let gpu_handle = manager.register_layout(gpu_layout)?;
let host_handle = manager.register_layout(host_layout)?;
// 5. Execute a transfer and await completion
let notification = manager.execute_transfer(
gpu_handle,
&[0, 1, 2, 3], // source block IDs
host_handle,
&[0, 1, 2, 3], // destination block IDs
TransferOptions::new(),
)?;
notification.await?;
```
## Testing
All functional tests in `kvbm-physical` require a real NIXL installation and a CUDA GPU. They are gated behind a feature flag:
- **`testing-kvbm`** — enables tests requiring NIXL and CUDA (creates NixlAgent instances and allocates device memory / launches kernels)
### Running tests
```bash
# Without GPU/NIXL — only the sentinel test runs (confirms skipping)
cargo test -p kvbm-physical
# With GPU + NIXL available
cargo test -p kvbm-physical --features testing-kvbm
```
When the required test feature is not enabled, a single **sentinel test** runs and prints a reminder message. This ensures `cargo test` never silently passes with zero tests.
### What the sentinel test looks like
```
running 1 test
test sentinel::all_functional_tests_skipped___enable_testing_nixl_and_testing_cuda ... ok
```
The `test_version_check_on_deserialization` test in `layout::tests` is the only functional test that runs without feature flags, as it does not require NIXL or CUDA.
## Documentation
- [v1 Migration Guide](docs/v1_migration.md) — Migration from `dynamo-llm::block_manager` to `kvbm-physical`
# Migration Guide: block_manager to kvbm-physical
Guide for migrating from `dynamo-llm::block_manager` (v1) to `kvbm-physical`.
## Overview
`kvbm-physical` is a ground-up rewrite of the physical transfer layer from `lib/llm/src/block_manager/`. The core data flow is the same (register layouts, exchange metadata, execute transfers), but `kvbm-physical` adds block format awareness, richer transfer options, and a cleaner separation between logical tiers and physical handles.
Both implementations use the same `vectorized_copy` CUDA kernel. The original embeds it in a `.fatbin` (`lib/llm/src/block_manager/block/transfer/kernels/vectorized_copy.fatbin`) loaded via `cuModuleLoadData`. `kvbm-physical` wraps the same kernel via the `kvbm-kernels` crate with explicit Rust FFI for transparency and testability.
## Type mapping table
| Original (block_manager) | kvbm-physical | Notes |
|--------------------------|---------------|-------|
| `TransportManager` | `TransferManager` | Same role, richer API |
| `LayoutHandle` | `LayoutHandle` | Same concept; encoding changed — see LayoutHandle docs for details |
| `PhysicalLayout` + builder | `PhysicalLayout` + builder | Same pattern; adds `with_external_device_regions()` |
| `LayoutConfig` | `LayoutConfig` | Same fields + optional `num_heads` |
| `TransferOptions` | `TransferOptions` | Adds `cuda_stream`, `src_kv_layout`, `dst_kv_layout` |
| `TransferCapabilities` | `TransferCapabilities` | Same |
| `TransferPreferences` | `TransferPreferences` | Same |
| `SerializedLayout` | `SerializedLayout` | Same wire format concept |
| `WorkerAddress` | `WorkerAddress` | Same |
| `TransferCompleteNotification` (oneshot) | `TransferCompleteNotification` (`Either`/`EventAwaiter`) | Zero-cost sync path |
| `BounceBufferSpec` (trait object) | `BounceBuffer` (concrete struct) | Simpler, no heap allocation |
| N/A | `LogicalLayoutDescriptor` | **New** — tier bridging |
| N/A | `KvBlockLayout` | **New** — block format awareness |
| N/A | `KvBlocks` | **New** — grouped blocks with layout override |
| `CudaBlockingH2D` / `CudaBlockingD2H` | Removed | Async-only; `.await` for sync behavior |
| `OperationalCopyBackend` | Removed | Replaced by `kvbm_kernels` direct FFI |
## What kvbm-physical adds
### LogicalLayoutDescriptor
Bridges `LayoutHandle` (physical) to `LogicalLayoutHandle` (G1/G2/G3/G4 tier). This is the key new abstraction for multi-worker coordination: callers say "copy from G1 to G2" while `TransferManager` resolves worker-specific handles.
```rust,ignore
// Build descriptor for RDMA exchange
let descriptor = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
```
### KvBlockLayout
Four named block formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`) plus `Custom` and `Unknown`. Enables type-driven kernel selection for transfers between different dimension orderings.
```rust,ignore
let needs_permute = src_layout.requires_transform(&dst_layout);
```
### kvbm-kernels FFI
The `kvbm_kernels` crate provides `memcpy_batch` using CUDA 12.9+ batch API with automatic fallback to individual copies. This replaces the fatbin-loading approach with direct Rust FFI.
### Stream pooling
4 H2D + 4 D2H streams with round-robin selection, replacing the original 1+1 stream pair. Reduces contention for concurrent transfers.
### Caller-provided CUDA stream
`TransferOptions::cuda_stream` lets the caller pass in a stream. The executor skips event recording; the caller manages synchronization. Useful for layer-wise transfers where all layers must execute on the same stream.
```rust,ignore
let stream = manager.context().acquire_h2d_stream();
let options = TransferOptions::builder()
.cuda_stream(stream.clone())
.build()?;
```
### CudaMemPool
Device memory pool for kernel temporary allocations (permute buffers, etc.). Configured via `TransferConfig`:
```rust,ignore
TransferManager::builder()
.cuda_pool_reserve_size(64 * 1024 * 1024) // 64 MiB pre-allocated
.cuda_pool_release_threshold(Some(64 * 1024 * 1024)) // free above this
.build()?;
```
### TransferCompleteNotification::aggregate()
Compose multiple transfer notifications into one that completes when all are done. Optimizes away the aggregation when all inputs are already complete.
```rust,ignore
let combined = TransferCompleteNotification::aggregate(
vec![n1, n2, n3],
manager.context().event_system(),
&tokio::runtime::Handle::current(),
)?;
combined.await?;
```
### src/dst kv_layout overrides
`TransferOptions` now supports overriding the source and destination block layout interpretation, enabling cross-format transfers without modifying the registered layout.
```rust,ignore
let options = TransferOptions::builder()
.src_kv_layout(KvBlockLayout::OperationalNHD)
.dst_kv_layout(KvBlockLayout::UniversalTP)
.build()?;
```
## What was intentionally removed
### Blocking CUDA strategies
`CudaBlockingH2D` and `CudaBlockingD2H` are removed. All transfers are async. To get blocking-style behavior, `.await` the returned notification immediately:
```rust,ignore
// v1 (blocking)
let result = blocking_h2d_transfer(...);
// kvbm-physical (async, but can be used synchronously)
let notification = manager.execute_transfer(...)?;
notification.await?;
```
### OperationalCopyBackend enum
The `OperationalCopyBackend` enum (which selected between different kernel loading strategies) is removed. `kvbm-physical` uses `kvbm_kernels` direct FFI exclusively, making kernel dispatch transparent.
### Trait object bounce buffer
`BounceBufferSpec` (a trait object requiring heap allocation) is replaced by `BounceBuffer`, a concrete struct wrapping a `LayoutHandle` + block IDs:
```rust,ignore
// v1
struct MyBounce { layout: PhysicalLayout, blocks: Vec<BlockId> }
impl BounceBufferSpec for MyBounce { ... }
// kvbm-physical
let bounce = BounceBuffer::from_handle(host_handle, vec![0, 1, 2, 3]);
```
## Migration steps
### 1. Replace TransportManager with TransferManager
The builder pattern is the same. `TransferManager::builder()` returns the same kind of fluent builder.
```rust,ignore
// v1
let manager = TransportManager::builder()
.worker_id(0)
.nixl_backend("ucx")
.cuda_device_id(0)
.build()?;
// kvbm-physical
let manager = TransferManager::builder()
.nixl_backend("ucx")
.cuda_device_id(0)
.build()?;
// worker_id is now derived from the event system
```
### 2. Replace TransferOptions
Add new fields as needed. Existing `layer_range` and `nixl_write_notification` work the same way.
```rust,ignore
// v1
let options = TransferOptions::builder()
.layer_range(0..16)
.build()?;
// kvbm-physical (same, with optional new fields)
let options = TransferOptions::builder()
.layer_range(0..16)
.cuda_stream(stream) // new: caller-managed stream
.src_kv_layout(layout) // new: format override
.build()?;
```
### 3. Replace BounceBufferSpec with BounceBuffer
```rust,ignore
// v1 — trait object
let spec: Box<dyn BounceBufferSpec> = Box::new(MyBounce::new(layout, blocks));
options.bounce_buffer(spec);
// kvbm-physical — concrete type
let bounce = BounceBuffer::from_handle(host_handle, block_ids);
let options = TransferOptions::builder()
.bounce_buffer(bounce)
.build()?;
```
### 4. Replace TransferCompleteNotification await pattern
The notification now implements `IntoFuture` directly instead of wrapping a oneshot channel.
```rust,ignore
// v1
let notification = manager.execute_transfer(...)?;
notification.recv().await??;
// kvbm-physical
let notification = manager.execute_transfer(...)?;
notification.await?;
```
### 5. Add LogicalLayoutDescriptor for multi-worker tier resolution
If you coordinate transfers across multiple workers by tier name (G1, G2, etc.), use `LogicalLayoutDescriptor`:
```rust,ignore
// Build descriptors that include tier information
let g1_desc = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
let g2_desc = manager.build_logical_descriptor(host_handle, LogicalLayoutHandle::G2)?;
// Remote workers can now resolve "copy G1 to G2" to the correct physical handles
```
### 6. Consider KvBlockLayout annotations for cross-format transfers
If your transfers involve blocks stored in different dimension orderings (e.g., operational NHD from the engine vs. universal TP for storage), annotate with `KvBlockLayout`:
```rust,ignore
let options = TransferOptions::builder()
.src_kv_layout(KvBlockLayout::OperationalNHD)
.dst_kv_layout(KvBlockLayout::UniversalTP)
.build()?;
```
This tells the executor to select a permute kernel instead of a direct copy.
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Typed builder for constructing [`PhysicalLayout`](crate::layout::PhysicalLayout)
//! instances with strongly-typed configuration, layout selection, and memory provisioning.
//!
//! The builder enforces the three steps required to materialize a physical layout:
//! 1. Provide a [`LayoutConfig`]
//! 2. Select a concrete layout (fully contiguous or layer separate)
//! 3. Specify memory backing (either by allocating or by supplying existing regions)
//!
//! NIXL registration is always enabled. Callers must provide a [`nixl_sys::Agent`], and any memory
//! supplied to the builder must implement [`NixlCompatible`].
use crate::layout::physical::PhysicalLayout;
use super::{
BlockDimension, FullyContiguousLayout, LayerSeparateLayout, Layout, LayoutConfig,
MemoryDescriptor, physical::NixlMetadata,
};
use anyhow::{Result, anyhow, bail};
use dynamo_memory::{
Buffer, DiskStorage, OffsetBuffer, StorageKind, SystemStorage, create_buffer,
nixl::{MemType, NixlAgent, NixlDescriptor, register_with_nixl},
prelude::{NixlCompatible, RegisteredView},
};
#[allow(unused_imports)]
use std::marker::PhantomData;
use std::path::PathBuf;
use std::sync::Arc;
use dynamo_memory::{DeviceStorage, PinnedStorage};
/// Byte alignment used when carving multiple regions out of one builder-managed
/// allocation; passed to `total_allocation_size` and `create_offset_entries`.
const REGION_ALIGNMENT: usize = 512;
/// Layout selection exposed by the builder.
#[derive(Debug, Clone)]
pub enum LayoutKind {
/// One memory region sized for every block and every layer.
FullyContiguous,
/// One memory region per layer; `block_dim` is forwarded to `LayerSeparateLayout::new`.
LayerSeparate { block_dim: BlockDimension },
}
/// Allocation strategies for builder-managed memory.
#[derive(Debug, Clone)]
enum AllocationKind {
/// System memory, allocated through `SystemStorage::new`.
System,
/// Pinned (page-locked) host memory. If `device_id` is Some, NUMA-aware
/// allocation is used on the GPU's NUMA node (when NUMA is enabled).
Pinned {
device_id: Option<u32>,
},
/// Device memory on CUDA device `device_id`, allocated through `DeviceStorage::new`.
Device {
device_id: u32,
},
/// Disk-backed storage: created at `path` when given, otherwise through `DiskStorage::new`.
Disk {
path: Option<PathBuf>,
},
}
/// Memory provisioning plan (either provided regions or an allocation request).
#[derive(Debug, Clone)]
enum MemoryPlan {
/// Caller-supplied regions; their count must match the number of required regions at build time.
Provided(Vec<MemoryEntry>),
/// Regions to be allocated by the builder during `build()` using the given strategy.
Allocate(AllocationKind),
}
/// A memory region tracked by the builder together with its optional NIXL descriptor.
#[derive(Debug, Clone)]
struct MemoryEntry {
// Buffer backing one layout region.
region: Buffer,
// NIXL descriptor for `region`; `None` when the region has not been registered.
descriptor: Option<NixlDescriptor>,
}
impl MemoryEntry {
    /// Pair a buffer with its (optional) NIXL descriptor.
    fn new(region: Buffer, descriptor: Option<NixlDescriptor>) -> Self {
        Self { region, descriptor }
    }

    /// Fill in a missing descriptor from the region itself and, outside of test
    /// builds, reject regions that still have none.
    ///
    /// # Errors
    /// In non-test builds, fails when neither the entry nor its region carries a
    /// NIXL descriptor.
    fn ensure_registered(self) -> Result<Self> {
        let Self { region, descriptor } = self;
        // Only consult the region when no descriptor was recorded up front.
        let descriptor = descriptor.or_else(|| region.nixl_descriptor());

        #[cfg(not(test))]
        {
            // Production builds require every region to be NIXL registered.
            if descriptor.is_none() {
                bail!(
                    "memory region {} is not registered with NIXL",
                    region.addr()
                );
            }
        }

        // Test builds tolerate `None` so local-only layouts can be constructed.
        Ok(Self { region, descriptor })
    }
}
/// Marker types for the builder state machine.
///
/// Type-state marker: no `LayoutConfig` has been attached yet.
pub struct NoConfig;
/// Type-state marker: a `LayoutConfig` has been attached.
pub struct HasConfig;
/// Type-state marker: no layout variant has been selected yet.
pub struct NoLayout;
/// Type-state marker: a layout variant has been selected.
pub struct HasLayout;
/// Type-state marker: no memory plan has been chosen yet.
pub struct NoMemory;
/// Type-state marker: a memory plan has been chosen.
pub struct HasMemory;
/// Default builder state type alias.
pub type PhysicalLayoutBuilderDefault = PhysicalLayoutBuilder<NoConfig, NoLayout, NoMemory>;
/// Typed builder enforcing configuration, layout selection, and memory provisioning phases.
///
/// The `C`, `L`, and `M` parameters are zero-sized markers tracking which of the
/// three phases have been completed; they only appear in `PhantomData` fields.
pub struct PhysicalLayoutBuilder<C, L, M> {
// NIXL agent used for registration and named in the resulting metadata.
agent: NixlAgent,
// Set by `with_config`.
config: Option<LayoutConfig>,
// Set by `fully_contiguous` / `layer_separate`.
layout_kind: Option<LayoutKind>,
// Set by the `allocate_*` / `with_*_regions` methods.
memory_plan: Option<MemoryPlan>,
_config: PhantomData<C>,
_layout: PhantomData<L>,
_memory: PhantomData<M>,
}
impl PhysicalLayoutBuilder<NoConfig, NoLayout, NoMemory> {
/// Create a new builder in its initial state.
pub fn new(agent: NixlAgent) -> Self {
Self {
agent,
config: None,
layout_kind: None,
memory_plan: None,
_config: PhantomData,
_layout: PhantomData,
_memory: PhantomData,
}
}
}
impl<C, L, M> PhysicalLayoutBuilder<C, L, M> {
    /// Take the builder apart, discarding the type-state markers.
    fn into_parts(
        self,
    ) -> (
        NixlAgent,
        Option<LayoutConfig>,
        Option<LayoutKind>,
        Option<MemoryPlan>,
    ) {
        let Self {
            agent,
            config,
            layout_kind,
            memory_plan,
            ..
        } = self;
        (agent, config, layout_kind, memory_plan)
    }

    /// Reassemble a builder in an arbitrary type-state (`C2`, `L2`, `M2`) from its parts.
    fn from_parts<C2, L2, M2>(
        agent: NixlAgent,
        config: Option<LayoutConfig>,
        layout_kind: Option<LayoutKind>,
        memory_plan: Option<MemoryPlan>,
    ) -> PhysicalLayoutBuilder<C2, L2, M2> {
        PhysicalLayoutBuilder::<C2, L2, M2> {
            agent,
            config,
            layout_kind,
            memory_plan,
            _config: PhantomData::<C2>,
            _layout: PhantomData::<L2>,
            _memory: PhantomData::<M2>,
        }
    }
}
impl<L, M> PhysicalLayoutBuilder<NoConfig, L, M> {
/// Attach the [`LayoutConfig`] required to size the layout and allocations.
pub fn with_config(self, config: LayoutConfig) -> PhysicalLayoutBuilder<HasConfig, L, M> {
let (agent, _config, layout_kind, memory_plan) = self.into_parts();
PhysicalLayoutBuilder::<HasConfig, L, M>::from_parts(
agent,
Some(config),
layout_kind,
memory_plan,
)
}
}
impl<M> PhysicalLayoutBuilder<HasConfig, NoLayout, M> {
/// Select the fully contiguous layout variant.
pub fn fully_contiguous(self) -> PhysicalLayoutBuilder<HasConfig, HasLayout, M> {
let (agent, config, _layout, memory_plan) = self.into_parts();
PhysicalLayoutBuilder::<HasConfig, HasLayout, M>::from_parts(
agent,
config,
Some(LayoutKind::FullyContiguous),
memory_plan,
)
}
/// Select the layer-separate layout variant with the provided block dimension ordering.
pub fn layer_separate(
self,
block_dim: BlockDimension,
) -> PhysicalLayoutBuilder<HasConfig, HasLayout, M> {
let (agent, config, _layout, memory_plan) = self.into_parts();
PhysicalLayoutBuilder::<HasConfig, HasLayout, M>::from_parts(
agent,
config,
Some(LayoutKind::LayerSeparate { block_dim }),
memory_plan,
)
}
}
impl PhysicalLayoutBuilder<HasConfig, HasLayout, NoMemory> {
fn set_memory_plan(
self,
plan: MemoryPlan,
) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
let (agent, config, layout_kind, _memory) = self.into_parts();
PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
agent,
config,
layout_kind,
Some(plan),
)
}
/// Request builder-managed system memory; the allocation itself happens in `build()`.
pub fn allocate_system(self) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
    let plan = MemoryPlan::Allocate(AllocationKind::System);
    self.set_memory_plan(plan)
}
/// Request builder-managed pinned (page-locked) host memory.
///
/// # Arguments
/// * `device_id` - If `Some(id)`, enables NUMA-aware allocation on the GPU's NUMA node
///   (when `DYN_KVBM_ENABLE_NUMA=1` is set). If `None`, uses direct allocation.
pub fn allocate_pinned(
    self,
    device_id: Option<u32>,
) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
    let strategy = AllocationKind::Pinned { device_id };
    self.set_memory_plan(MemoryPlan::Allocate(strategy))
}
/// Request builder-managed device memory on the CUDA device identified by `device_id`.
///
/// The device id is mandatory; there is no implicit "current context" fallback.
pub fn allocate_device(
    self,
    device_id: u32,
) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
    let strategy = AllocationKind::Device { device_id };
    self.set_memory_plan(MemoryPlan::Allocate(strategy))
}
/// Request builder-managed disk-backed storage. When `path` is `None`, a temporary file is used.
pub fn allocate_disk(
    self,
    path: Option<PathBuf>,
) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
    let strategy = AllocationKind::Disk { path };
    self.set_memory_plan(MemoryPlan::Allocate(strategy))
}
/// Use existing NIXL-compatible memory regions supplied by the caller.
pub fn with_memory_regions<S>(
self,
regions: Vec<S>,
) -> Result<PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory>>
where
S: MemoryDescriptor + NixlCompatible + 'static,
{
let (agent, config, layout_kind, _memory) = self.into_parts();
let entries = register_existing_regions(&agent, regions)?;
Ok(
PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
agent,
config,
layout_kind,
Some(MemoryPlan::Provided(entries)),
),
)
}
/// Use pre-registered memory regions (already wrapped in `Arc<dyn MemoryDescriptor>`).
///
/// All regions must already expose a NIXL descriptor.
pub fn with_registered_regions(
self,
regions: Vec<Buffer>,
) -> Result<PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory>> {
let entries = regions
.into_iter()
.enumerate()
.map(|(index, region)| {
let descriptor = region.nixl_descriptor().ok_or_else(|| {
anyhow!(
"provided memory region at index {} is not NIXL registered",
index
)
})?;
Ok(MemoryEntry::new(region, Some(descriptor)))
})
.collect::<Result<Vec<_>>>()?;
let (agent, config, layout_kind, _memory) = self.into_parts();
Ok(
PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
agent,
config,
layout_kind,
Some(MemoryPlan::Provided(entries)),
),
)
}
/// Register external KV cache tensors with NIXL for RDMA access.
///
/// This is the **CRITICAL** step that enables remote GPU-to-GPU transfers.
/// Each tensor's memory is wrapped in `ExternalDeviceMemory` and registered
/// with NIXL.
///
/// # Arguments
/// * `tensors` - KV cache tensors from vLLM (one per layer). All tensors must:
/// - Be on the same CUDA device
/// - Be contiguous in memory
/// - Have the same shape
///
/// # Requirements
/// - The NIXL agent must be registered with an RDMA-capable backend
/// - The external framework (vLLM) must keep the tensors valid while registered
///
/// # Example
/// ```ignore
/// let physical_layout = PhysicalLayoutBuilder::new(nixl_agent)
/// .with_config(layout_config)
/// .layer_separate(block_dim)
/// .with_external_device_regions(kv_tensors)? // NIXL registration here
/// .build()?;
/// ```
pub fn with_external_device_regions(
self,
tensors: Vec<Arc<dyn dynamo_memory::TensorDescriptor>>,
) -> Result<PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory>> {
use dynamo_memory::TensorDescriptorExt;
if tensors.is_empty() {
bail!("with_external_device_regions requires at least one tensor");
}
let (agent, config, layout_kind, _memory) = self.into_parts();
let mut entries = Vec::with_capacity(tensors.len());
for (index, tensor) in tensors.into_iter().enumerate() {
// Verify the tensor is on a CUDA device
if tensor.cuda_device_id().is_none() {
bail!("tensor at index {} is not on a CUDA device", index);
}
// Register tensor with NIXL for RDMA
// Arc<dyn TensorDescriptor> implements both MemoryDescriptor and NixlCompatible,
// so we can register it directly. This is the critical step that enables
// remote GPU-to-GPU transfers via UCX backend.
let entry = register_storage(tensor, &agent).map_err(|e| {
anyhow!(
"failed to register tensor {} with NIXL (UCX backend required for VRAM): {}",
index,
e
)
})?;
entries.push(entry);
}
Ok(
PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
agent,
config,
layout_kind,
Some(MemoryPlan::Provided(entries)),
),
)
}
}
impl PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
/// Finalize the builder, constructing the [`PhysicalLayout`].
pub fn build(self) -> Result<PhysicalLayout> {
let (agent, config, layout_kind, memory_plan) = self.into_parts();
let config = config.ok_or_else(|| anyhow!("layout config missing despite type state"))?;
let layout_kind =
layout_kind.ok_or_else(|| anyhow!("layout kind missing despite type state"))?;
let memory_plan =
memory_plan.ok_or_else(|| anyhow!("memory plan missing despite type state"))?;
let required_sizes = compute_allocation_sizes(&config, &layout_kind)?;
let entries = resolve_memory_plan(&agent, memory_plan, &required_sizes)?;
validate_memory_sizes(&entries, &required_sizes)?;
let kind = derive_storage_kind(&entries)?;
let metadata = derive_nixl_metadata(&agent, &entries)?;
let layout: Arc<dyn Layout> = match layout_kind {
LayoutKind::FullyContiguous => {
let entry = entries.first().ok_or_else(|| {
anyhow!("fully contiguous layout requires a single memory region")
})?;
let layout = FullyContiguousLayout::new(config.clone(), entry.region.clone())?;
Arc::new(layout)
}
LayoutKind::LayerSeparate { block_dim } => {
let regions: Vec<Buffer> =
entries.iter().map(|entry| entry.region.clone()).collect();
let layout = LayerSeparateLayout::new(config.clone(), regions, block_dim)?;
Arc::new(layout)
}
};
Ok(PhysicalLayout::new_local(layout, kind, metadata))
}
}
/// Register each caller-supplied region with `agent`, stopping at the first failure.
fn register_existing_regions<S>(agent: &NixlAgent, regions: Vec<S>) -> Result<Vec<MemoryEntry>>
where
    S: MemoryDescriptor + NixlCompatible + 'static,
{
    let mut entries = Vec::with_capacity(regions.len());
    for region in regions {
        entries.push(register_storage(region, agent)?);
    }
    Ok(entries)
}
/// Turn a memory plan into concrete entries, one per element of `sizes`.
///
/// Allocation plans are delegated to `allocate_regions`. Provided plans must
/// contain exactly `sizes.len()` entries, each of which is passed through
/// `MemoryEntry::ensure_registered`.
fn resolve_memory_plan(
    agent: &NixlAgent,
    plan: MemoryPlan,
    sizes: &[usize],
) -> Result<Vec<MemoryEntry>> {
    let provided = match plan {
        MemoryPlan::Allocate(strategy) => return allocate_regions(agent, strategy, sizes),
        MemoryPlan::Provided(entries) => entries,
    };

    let (have, want) = (provided.len(), sizes.len());
    if have != want {
        bail!(
            "provided memory count ({}) does not match required allocations ({})",
            have,
            want
        );
    }

    let mut resolved = Vec::with_capacity(have);
    for entry in provided {
        resolved.push(entry.ensure_registered()?);
    }
    Ok(resolved)
}
/// Allocate one backing region large enough for all `sizes` (including alignment
/// padding) and split it into per-region entries.
///
/// Returns an empty vector without allocating when `sizes` is empty.
fn allocate_regions(
    agent: &NixlAgent,
    strategy: AllocationKind,
    sizes: &[usize],
) -> Result<Vec<MemoryEntry>> {
    if sizes.is_empty() {
        return Ok(Vec::new());
    }

    let total_bytes = total_allocation_size(sizes, REGION_ALIGNMENT)?;

    let backing = match strategy {
        AllocationKind::System => allocate_system_entry(total_bytes, agent),
        AllocationKind::Pinned { device_id } => {
            allocate_pinned_entry(total_bytes, agent, device_id)
        }
        AllocationKind::Device { device_id } => {
            allocate_device_entry(total_bytes, agent, device_id)
        }
        AllocationKind::Disk { path } => allocate_disk_entry(total_bytes, agent, path),
    }?;

    create_offset_entries(backing, sizes, REGION_ALIGNMENT)
}
/// Allocate `size` bytes of system memory and register it with `agent`.
fn allocate_system_entry(size: usize, agent: &NixlAgent) -> Result<MemoryEntry> {
    match SystemStorage::new(size) {
        Ok(storage) => register_storage(storage, agent),
        Err(e) => Err(anyhow!(
            "failed to allocate system memory ({size} bytes): {e}"
        )),
    }
}
/// Allocate `size` bytes of pinned host memory (optionally tied to `device_id`)
/// and register it with `agent`.
fn allocate_pinned_entry(
    size: usize,
    agent: &NixlAgent,
    device_id: Option<u32>,
) -> Result<MemoryEntry> {
    match PinnedStorage::new_for_device(size, device_id) {
        Ok(storage) => register_storage(storage, agent),
        Err(e) => Err(anyhow!(
            "failed to allocate pinned memory ({size} bytes): {e}"
        )),
    }
}
/// Allocate `size` bytes on CUDA device `device_id` and register it with `agent`.
fn allocate_device_entry(size: usize, agent: &NixlAgent, device_id: u32) -> Result<MemoryEntry> {
    match DeviceStorage::new(size, device_id) {
        Ok(storage) => register_storage(storage, agent),
        Err(e) => Err(anyhow!(
            "failed to allocate device memory ({size} bytes) on device {device_id}: {e}"
        )),
    }
}
/// Allocate `size` bytes of disk-backed storage and register it with `agent`.
///
/// Uses `DiskStorage::new_at` when a path is supplied and `DiskStorage::new` otherwise.
fn allocate_disk_entry(
    size: usize,
    agent: &NixlAgent,
    path: Option<PathBuf>,
) -> Result<MemoryEntry> {
    let storage = match path {
        Some(path) => DiskStorage::new_at(&path, size)
            .map_err(|e| anyhow!("failed to allocate disk storage at {}: {e}", path.display()))?,
        None => {
            DiskStorage::new(size).map_err(|e| anyhow!("failed to allocate disk storage: {e}"))?
        }
    };
    register_storage(storage, agent)
}
// Test builds may leave layouts unregistered to keep test time down: NIXL + UCX is
// expensive to set up, so registration only happens when a suitable backend exists.
#[cfg(test)]
fn register_storage<S>(storage: S, agent: &NixlAgent) -> Result<MemoryEntry>
where
    S: MemoryDescriptor + NixlCompatible + 'static,
{
    // Decide whether the agent has any backend able to serve this kind of storage.
    let should_register = match storage.storage_kind() {
        // Host memory: UCX or POSIX.
        StorageKind::System | StorageKind::Pinned => {
            agent.has_backend("UCX") || agent.has_backend("POSIX")
        }
        // Device memory: UCX for remote transfers or GDS for direct disk transfers.
        StorageKind::Device(_) => agent.has_backend("UCX") || agent.has_backend("GDS_MT"),
        // Disk storage: POSIX for regular I/O or GDS for GPU direct I/O.
        StorageKind::Disk(_) => agent.has_backend("POSIX") || agent.has_backend("GDS_MT"),
    };

    if should_register {
        let Ok(registered) = register_with_nixl(storage, agent, None) else {
            bail!("failed to register memory with NIXL agent {}", agent.name());
        };
        let descriptor = registered.descriptor();
        let region = Buffer::from_arc(Arc::new(registered));
        return Ok(MemoryEntry::new(region, Some(descriptor)));
    }

    // No usable backend: keep the storage unregistered for local, non-NIXL transfers.
    let region = Buffer::from_arc(Arc::new(storage));
    Ok(MemoryEntry::new(region, None))
}
// Non-test builds register unconditionally.
#[cfg(not(test))]
fn register_storage<S>(storage: S, agent: &NixlAgent) -> Result<MemoryEntry>
where
    S: MemoryDescriptor + NixlCompatible + 'static,
{
    // On failure `register_with_nixl` hands the storage back; it is dropped here.
    let Ok(registered) = register_with_nixl(storage, agent, None) else {
        bail!("failed to register memory with NIXL agent {}", agent.name());
    };
    let descriptor = registered.descriptor();
    let region: Buffer = create_buffer(registered);
    Ok(MemoryEntry::new(region, Some(descriptor)))
}
/// Split one base allocation into per-region entries.
///
/// Each element of `sizes` becomes an entry that starts at a running offset inside
/// `base_entry.region`. Between consecutive regions the offset is advanced so that
/// the next region's absolute address (base address + offset) is a multiple of
/// `alignment`. When the base entry carries a NIXL descriptor, each entry gets a
/// descriptor derived from it via `derive_descriptor`.
///
/// Returns an empty vector when `sizes` is empty.
///
/// # Errors
/// Fails if an offset view cannot be created, if any offset/address computation
/// overflows, or if the regions plus padding do not fit in the base region.
fn create_offset_entries(
base_entry: MemoryEntry,
sizes: &[usize],
alignment: usize,
) -> Result<Vec<MemoryEntry>> {
if sizes.is_empty() {
return Ok(Vec::new());
}
let base_region = base_entry.region;
let base_descriptor = base_entry.descriptor;
let base_addr = base_region.addr();
let base_len = base_region.size();
let mut entries = Vec::with_capacity(sizes.len());
let mut offset = 0usize;
for (index, &size) in sizes.iter().enumerate() {
// A single region that spans the whole base allocation reuses the base buffer
// directly instead of wrapping it in an offset view.
let region = if index == 0 && offset == 0 && size == base_len && sizes.len() == 1 {
base_region.clone()
} else {
let view = OffsetBuffer::new(base_region.clone(), offset, size)
.map_err(|e| anyhow!("failed to create offset region: {e}"))?;
create_buffer(view)
};
// Unregistered bases (descriptor `None`) yield unregistered entries.
let descriptor = base_descriptor
.as_ref()
.map(|descriptor| derive_descriptor(descriptor, offset, size))
.transpose()?;
entries.push(MemoryEntry::new(region, descriptor));
offset = offset
.checked_add(size)
.ok_or_else(|| anyhow!("offset computation overflow"))?;
// Pad only between regions (not after the last one). Alignment is applied to the
// absolute address, then converted back into an offset from the base.
if index + 1 < sizes.len() && alignment > 1 {
let current_addr = base_addr
.checked_add(offset)
.ok_or_else(|| anyhow!("address computation overflow"))?;
let aligned_addr = align_up(current_addr, alignment)?;
offset = aligned_addr
.checked_sub(base_addr)
.ok_or_else(|| anyhow!("alignment subtraction overflow"))?;
}
}
// Final sanity check: the total span consumed must fit inside the base region.
if offset > base_len {
bail!(
"allocated base region ({base_len} bytes) is insufficient for {offset} bytes with padding"
);
}
Ok(entries)
}
fn derive_descriptor(base: &NixlDescriptor, offset: usize, size: usize) -> Result<NixlDescriptor> {
let mut descriptor = base.clone();
descriptor.size = size;
if descriptor.mem_type != MemType::File {
descriptor.addr = descriptor
.addr
.checked_add(offset as u64)
.ok_or_else(|| anyhow!("descriptor address overflow"))?;
}
Ok(descriptor)
}
fn compute_allocation_sizes(config: &LayoutConfig, kind: &LayoutKind) -> Result<Vec<usize>> {
match kind {
LayoutKind::FullyContiguous => {
let factors = [
config.num_blocks,
config.num_layers,
config.outer_dim,
config.page_size,
config.inner_dim,
config.dtype_width_bytes,
];
let total = mul_chain(&factors)?;
Ok(vec![total])
}
LayoutKind::LayerSeparate { .. } => {
let factors = [
config.num_blocks,
config.outer_dim,
config.page_size,
config.inner_dim,
config.dtype_width_bytes,
];
let per_layer = mul_chain(&factors)?;
Ok(vec![per_layer; config.num_layers])
}
}
}
/// Multiply `factors` left to right with overflow checking.
///
/// An empty slice yields `1`.
///
/// # Errors
/// Fails as soon as a partial product overflows `usize`.
fn mul_chain(factors: &[usize]) -> Result<usize> {
    let mut product = 1usize;
    for &factor in factors {
        product = product
            .checked_mul(factor)
            .ok_or_else(|| anyhow!("allocation size overflow during layout computation"))?;
    }
    Ok(product)
}
/// Total bytes to reserve so that every region in `sizes` fits with worst-case
/// alignment padding in front of each region after the first.
///
/// Returns `0` for an empty slice. When `alignment > 1`, `alignment - 1` bytes of
/// padding are budgeted per additional region.
///
/// # Errors
/// Fails if the running total overflows `usize`.
fn total_allocation_size(sizes: &[usize], alignment: usize) -> Result<usize> {
    let Some((&first, rest)) = sizes.split_first() else {
        return Ok(0);
    };

    // Worst-case padding needed to align the start of one region.
    let padding = if alignment > 1 { alignment - 1 } else { 0 };

    rest.iter().try_fold(first, |total, &size| -> Result<usize> {
        let with_region = total
            .checked_add(size)
            .ok_or_else(|| anyhow!("allocation size overflow during aggregation"))?;
        with_region
            .checked_add(padding)
            .ok_or_else(|| anyhow!("allocation alignment padding overflow"))
    })
}
/// Round `value` up to the next multiple of `alignment`.
///
/// Alignments of `0` or `1` leave `value` unchanged.
///
/// # Errors
/// Fails if rounding up overflows `usize`.
fn align_up(value: usize, alignment: usize) -> Result<usize> {
    if alignment <= 1 {
        return Ok(value);
    }
    match value % alignment {
        0 => Ok(value),
        remainder => value
            .checked_add(alignment - remainder)
            .ok_or_else(|| anyhow!("alignment overflow")),
    }
}
/// Check, pairwise, that each entry is at least as large as its required size.
///
/// Entries and requirements are matched by position; if the slices differ in
/// length, only the common prefix is checked.
///
/// # Errors
/// Fails on the first entry that is smaller than required.
fn validate_memory_sizes(entries: &[MemoryEntry], required: &[usize]) -> Result<()> {
    let undersized = entries
        .iter()
        .zip(required.iter().copied())
        .find(|(entry, needed)| entry.region.size() < *needed);

    match undersized {
        Some((entry, needed)) => bail!(
            "memory region too small: required {} bytes, available {} bytes",
            needed,
            entry.region.size()
        ),
        None => Ok(()),
    }
}
fn derive_storage_kind(entries: &[MemoryEntry]) -> Result<StorageKind> {
let first = entries
.first()
.ok_or_else(|| anyhow!("no memory regions available to determine storage location"))?;
let first_kind = first.region.storage_kind();
for entry in entries.iter().skip(1) {
let kind = entry.region.storage_kind();
if kind != first_kind {
bail!(
"all memory regions must share the same storage location (found {:?} and {:?})",
first_kind,
kind
);
}
}
Ok(first_kind)
}
fn derive_nixl_metadata(agent: &NixlAgent, entries: &[MemoryEntry]) -> Result<NixlMetadata> {
// Try to find a descriptor from entries
let descriptor_opt = entries.iter().find_map(|entry| entry.descriptor.clone());
#[cfg(test)]
{
// In test builds, allow layouts without NIXL registration
// Use defaults for local-only transfers
if let Some(descriptor) = descriptor_opt {
Ok(NixlMetadata::new(
agent.name().to_string(),
descriptor.mem_type,
descriptor.device_id,
))
} else {
// Use placeholder metadata for unregistered layouts
let first_entry = entries
.first()
.ok_or_else(|| anyhow!("no memory entries"))?;
let storage_kind = first_entry.region.storage_kind();
let (mem_type, device_id) = match storage_kind {
StorageKind::System => (MemType::Dram, 0),
StorageKind::Pinned => (MemType::Dram, 0),
StorageKind::Device(id) => (MemType::Vram, id as u64),
StorageKind::Disk(id) => (MemType::File, id),
};
Ok(NixlMetadata::new(
agent.name().to_string(),
mem_type,
device_id,
))
}
}
#[cfg(not(test))]
{
let descriptor = descriptor_opt
.ok_or_else(|| anyhow!("memory entries missing NIXL registration metadata"))?;
Ok(NixlMetadata::new(
agent.name().to_string(),
descriptor.mem_type,
descriptor.device_id,
))
}
}
// Builder tests; compiled only for test builds with the `testing-kvbm` feature enabled.
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
use super::super::{BlockDimension, LayoutConfig};
use super::*;
use dynamo_memory::{Buffer, MemoryDescriptor, StorageKind};
use std::any::Any;
/// Heap-backed stand-in for a registered region: it reports a pre-built
/// descriptor from `nixl_descriptor()` without performing any registration.
#[derive(Debug)]
struct TestRegisteredRegion {
data: Vec<u8>,
kind: StorageKind,
descriptor: NixlDescriptor,
}
impl TestRegisteredRegion {
/// Allocate `size` zeroed bytes and describe them with the given kind,
/// memory type, and device id.
fn new(size: usize, kind: StorageKind, mem_type: MemType, device_id: u64) -> Self {
let data = vec![0u8; size];
// The descriptor address points at the Vec's heap buffer.
let addr = data.as_ptr() as u64;
let descriptor = NixlDescriptor {
addr,
size,
mem_type,
device_id,
};
Self {
data,
kind,
descriptor,
}
}
}
impl MemoryDescriptor for TestRegisteredRegion {
fn addr(&self) -> usize {
self.data.as_ptr() as usize
}
fn size(&self) -> usize {
self.data.len()
}
fn storage_kind(&self) -> StorageKind {
self.kind
}
fn as_any(&self) -> &dyn Any {
self
}
fn nixl_descriptor(&self) -> Option<NixlDescriptor> {
Some(self.descriptor.clone())
}
}
/// Small fixed config shared by the tests (2 blocks, 3 layers).
fn make_layout_config() -> LayoutConfig {
LayoutConfig::builder()
.num_blocks(2)
.num_layers(3)
.outer_dim(2)
.page_size(4)
.inner_dim(8)
.dtype_width_bytes(2)
.build()
.unwrap()
}
/// Bytes needed by a fully contiguous layout for `cfg`.
fn fully_contiguous_size(cfg: &LayoutConfig) -> usize {
cfg.num_blocks
* cfg.num_layers
* cfg.outer_dim
* cfg.page_size
* cfg.inner_dim
* cfg.dtype_width_bytes
}
/// Bytes needed per layer by a layer-separate layout for `cfg`.
fn per_layer_size(cfg: &LayoutConfig) -> usize {
cfg.num_blocks * cfg.outer_dim * cfg.page_size * cfg.inner_dim * cfg.dtype_width_bytes
}
// One pre-registered region of exactly the required size builds a fully contiguous layout.
#[test]
fn builds_fully_contiguous_from_registered_regions() {
let agent = NixlAgent::new("builder-test-fully").expect("failed to create agent");
let cfg = make_layout_config();
let required = fully_contiguous_size(&cfg);
let region = create_buffer(TestRegisteredRegion::new(
required,
StorageKind::System,
MemType::Dram,
0,
));
let physical = PhysicalLayoutBuilder::new(agent.clone())
.with_config(cfg.clone())
.fully_contiguous()
.with_registered_regions(vec![region])
.expect("registered regions accepted")
.build()
.expect("builder should succeed");
assert_eq!(physical.location(), StorageKind::System);
assert!(physical.layout().as_ref().is_fully_contiguous());
assert_eq!(physical.layout().config().num_blocks, cfg.num_blocks);
assert_eq!(physical.layout().config().num_layers, cfg.num_layers);
let metadata = physical.nixl_metadata();
assert_eq!(metadata.agent_name(), agent.name());
assert_eq!(metadata.mem_type(), MemType::Dram);
}
// One pre-registered region per layer builds a layer-separate layout.
#[test]
fn builds_layer_separate_from_registered_regions() {
let agent = NixlAgent::new("builder-test-layer").expect("failed to create agent");
let cfg = make_layout_config();
let per_layer = per_layer_size(&cfg);
let regions: Vec<Buffer> = (0..cfg.num_layers)
.map(|_| {
create_buffer(TestRegisteredRegion::new(
per_layer,
StorageKind::System,
MemType::Dram,
0,
))
})
.collect();
let physical = PhysicalLayoutBuilder::new(agent.clone())
.with_config(cfg.clone())
.layer_separate(BlockDimension::BlockIsFirstDim)
.with_registered_regions(regions)
.expect("registered layer regions accepted")
.build()
.expect("builder should succeed");
assert_eq!(physical.location(), StorageKind::System);
assert!(!physical.layout().as_ref().is_fully_contiguous());
assert_eq!(physical.layout().config().num_layers, cfg.num_layers);
let metadata = physical.nixl_metadata();
assert_eq!(metadata.agent_name(), agent.name());
assert_eq!(metadata.mem_type(), MemType::Dram);
}
}
// fn context_device_id(ctx: &TransferContext) -> u32 {
// ctx.stream().context().ordinal() as u32
// }
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
use validator::{Validate, ValidationError};
/// Configuration for block layouts.
///
/// The `#[validate]` attributes on fields are checked during layout construction
/// (e.g., `FullyContiguousLayout::new_internal()`, `LayerSeparateLayout::new_internal()`),
/// not at builder `.build()` time.
#[derive(Debug, Clone, Builder, Validate, Serialize, Deserialize, PartialEq, Eq)]
pub struct LayoutConfig {
/// Number of blocks (validated to be at least 1).
#[validate(range(min = 1))]
pub num_blocks: usize,
/// Number of layers (validated to be at least 1).
#[validate(range(min = 1))]
pub num_layers: usize,
/// Size of the outer dimension (validated to be 1 or 2).
#[validate(range(min = 1, max = 2))]
pub outer_dim: usize,
/// Page size (validated to be at least 1).
#[validate(range(min = 1))]
pub page_size: usize,
/// Inner dimension (validated to be at least 1).
#[validate(range(min = 1))]
pub inner_dim: usize,
/// Alignment; checked by `validate_power_of_2`. The builder defaults it to 1.
#[validate(custom(function = "validate_power_of_2"))]
#[builder(default = "1")]
pub alignment: usize,
/// Width of one element in bytes; checked by `validate_dtype_width_bytes`.
/// The builder defaults it to 2.
#[validate(custom(function = "validate_dtype_width_bytes"))]
#[builder(default = "2")]
pub dtype_width_bytes: usize,
/// Number of attention heads (optional).
///
/// When provided, enables KvBlockLayout support for universal formats.
/// The head dimension can be computed as: `inner_dim / (page_size * num_heads)`.
///
/// Required for:
/// - Universal layout transformations
/// - Per-head memory region access
#[builder(default = "None")]
#[serde(default)]
pub num_heads: Option<usize>,
}
impl LayoutConfig {
    /// Builder for LayoutConfig
    pub fn builder() -> LayoutConfigBuilder {
        LayoutConfigBuilder::default()
    }

    /// Total bytes covered by the config: the product of `num_blocks`, `num_layers`,
    /// `outer_dim`, `page_size`, `inner_dim`, and `dtype_width_bytes`.
    ///
    /// Uses saturating multiplication, so an overflowing product yields `usize::MAX`.
    pub fn required_bytes(&self) -> usize {
        self.num_blocks
            .saturating_mul(self.num_layers)
            .saturating_mul(self.outer_dim)
            .saturating_mul(self.page_size)
            .saturating_mul(self.inner_dim)
            .saturating_mul(self.dtype_width_bytes)
    }

    /// Get the number of bytes per block.
    ///
    /// This is the total size of a single block across all layers and outer dimensions.
    /// Uses saturating multiplication, so an overflowing product yields `usize::MAX`.
    pub fn bytes_per_block(&self) -> usize {
        self.num_layers
            .saturating_mul(self.outer_dim)
            .saturating_mul(self.page_size)
            .saturating_mul(self.inner_dim)
            .saturating_mul(self.dtype_width_bytes)
    }

    /// `page_size * num_heads` when `num_heads` is set and the product is non-zero
    /// and representable; `None` otherwise.
    ///
    /// Shared by the KV-block-layout helpers so they agree on what counts as a
    /// usable divisor and never multiply unchecked.
    fn kv_divisor(&self) -> Option<usize> {
        self.num_heads
            .and_then(|nh| self.page_size.checked_mul(nh))
            .filter(|&divisor| divisor > 0)
    }

    /// Get the head dimension if `num_heads` is specified.
    ///
    /// Computes `inner_dim / (page_size * num_heads)`.
    ///
    /// # Returns
    /// `Some(head_dim)` if `num_heads` is set, `None` otherwise. The value is `0`
    /// when `page_size * num_heads` is zero or does not fit in `usize`.
    pub fn head_dim(&self) -> Option<usize> {
        self.num_heads.map(|_| {
            self.kv_divisor()
                .map_or(0, |divisor| self.inner_dim / divisor)
        })
    }

    /// Check if this config supports KvBlockLayout operations.
    ///
    /// Returns `true` if `num_heads` is set and the dimensions are valid
    /// (`page_size * num_heads` is non-zero, does not overflow, and evenly
    /// divides `inner_dim`).
    pub fn supports_kv_block_layout(&self) -> bool {
        self.kv_divisor()
            .is_some_and(|divisor| self.inner_dim.is_multiple_of(divisor))
    }

    /// Validate that this config supports KvBlockLayout operations.
    ///
    /// Succeeds exactly when [`supports_kv_block_layout`](Self::supports_kv_block_layout)
    /// returns `true`.
    ///
    /// # Returns
    /// `Ok(())` if valid, `Err` with details otherwise:
    /// - `num_heads_required_for_kv_block_layout` when `num_heads` is unset
    /// - `num_heads_must_be_positive` when `num_heads` is zero
    /// - `inner_dim_must_be_divisible_by_page_size_times_num_heads` when
    ///   `page_size * num_heads` is zero, overflows, or does not divide `inner_dim`
    pub fn validate_for_kv_block_layout(&self) -> Result<(), ValidationError> {
        let Some(nh) = self.num_heads else {
            return Err(ValidationError::new(
                "num_heads_required_for_kv_block_layout",
            ));
        };
        if nh == 0 {
            return Err(ValidationError::new("num_heads_must_be_positive"));
        }
        // Checked multiply: a zero or overflowing divisor can never divide `inner_dim`.
        match self.page_size.checked_mul(nh) {
            Some(divisor) if divisor > 0 && self.inner_dim.is_multiple_of(divisor) => Ok(()),
            _ => Err(ValidationError::new(
                "inner_dim_must_be_divisible_by_page_size_times_num_heads",
            )),
        }
    }
}
/// Identifies which of the first two tensor dimensions, `shape[0]` or `shape[1]`,
/// is the block dimension; the other one is the outer dimension.
///
/// The outer dimension is typically:
/// - 1: MLA or K and V stored together,
/// - 2: K and V stored separately,
///
/// The block dimension tells us the number of blocks.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BlockDimension {
    /// The block dimension is the first dimension of the tensor, `[n_blocks, outer_dim, inner_dim]`
    BlockIsFirstDim,
    /// The block dimension is the second dimension of the tensor, `[outer_dim, n_blocks, inner_dim]`
    /// This replaces the v1 setting `outer_contiguous = true`.
    BlockIsSecondDim,
}
/// Validator for an alignment value: succeeds only when `alignment` is a power of two.
///
/// # Errors
/// Returns a `ValidationError` with code `alignment_must_be_power_of_2` for any
/// other value (including `0`, which is not a power of two).
pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> {
    if alignment.is_power_of_two() {
        Ok(())
    } else {
        Err(validator::ValidationError::new(
            "alignment_must_be_power_of_2",
        ))
    }
}
/// Validator for an element width in bytes: accepts exactly 2, 4 and 8
/// (the powers of two within `2..=8`).
///
/// # Errors
/// Returns a `ValidationError` with code
/// `dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes` for any other value.
pub fn validate_dtype_width_bytes(dtype_width_bytes: usize) -> Result<(), ValidationError> {
    match dtype_width_bytes {
        2 | 4 | 8 => Ok(()),
        _ => Err(validator::ValidationError::new(
            "dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes",
        )),
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Fully contiguous layout implementation.
//!
//! This layout stores all blocks in a single contiguous memory allocation
//! with the shape: [num_blocks, num_layers, outer_dim, page_size, inner_dim].
use anyhow::{Result, anyhow};
use validator::Validate;
use super::serialize::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails};
use super::{Buffer, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor, MemoryRegion};
/// Fully contiguous layout where all blocks are in a single allocation.
///
/// Strides are derived from the config at construction time; a region's address is
/// `base_addr + block_id * block_stride + layer_id * layer_stride + outer_id * outer_stride`.
#[derive(Debug)]
pub struct FullyContiguousLayout {
    /// Configuration the strides below were computed from
    config: LayoutConfig,
    /// Base address of the allocation
    base_addr: usize,
    /// Stride between blocks in bytes
    block_stride: usize,
    /// Stride between layers in bytes
    layer_stride: usize,
    /// Stride between outer dimensions in bytes
    outer_stride: usize,
    /// Size of each memory region (page) in bytes
    region_size: usize,
    /// Owned memory region backing this layout
    memory: Buffer,
    /// Format of blocks in memory
    block_format: BlockFormat,
    /// KV block layout describing dimension ordering within blocks
    kv_block_layout: KvBlockLayout,
}
/// Builder for creating [`FullyContiguousLayout`] instances.
///
/// `config` and `memory` are required; `kv_block_layout` and `block_format`
/// fall back to their `Default` values when not set.
///
/// # Example
///
/// ```ignore
/// let layout = FullyContiguousLayout::builder()
///     .config(config)
///     .memory(buffer)
///     .kv_block_layout(KvBlockLayout::UniversalTP)
///     .build()?;
/// ```
#[derive(Debug, Default)]
pub struct FullyContiguousLayoutBuilder {
    /// Layout configuration (required by `build`)
    config: Option<LayoutConfig>,
    /// Backing buffer (required by `build`)
    memory: Option<Buffer>,
    /// Dimension ordering within blocks
    kv_block_layout: KvBlockLayout,
    /// Format of blocks in memory
    block_format: BlockFormat,
}
impl FullyContiguousLayoutBuilder {
    /// Create an empty builder: no config, no memory, `KvBlockLayout::Unknown`
    /// and `BlockFormat::default()`.
    pub fn new() -> Self {
        // The derived `Default` yields exactly these values.
        Self::default()
    }

    /// Set the layout configuration.
    #[expect(dead_code)]
    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
        self.config.replace(config);
        self
    }

    /// Set the memory buffer backing this layout.
    #[expect(dead_code)]
    pub fn memory(&mut self, memory: Buffer) -> &mut Self {
        self.memory.replace(memory);
        self
    }

    /// Set the KV block layout describing dimension ordering.
    ///
    /// Default: `KvBlockLayout::Unknown`
    #[expect(dead_code)]
    pub fn kv_block_layout(&mut self, layout: KvBlockLayout) -> &mut Self {
        self.kv_block_layout = layout;
        self
    }

    /// Set the block format.
    ///
    /// Default: `BlockFormat::default()`
    #[expect(dead_code)]
    pub fn block_format(&mut self, format: BlockFormat) -> &mut Self {
        self.block_format = format;
        self
    }

    /// Build the [`FullyContiguousLayout`].
    ///
    /// The builder is not consumed; the stored config and buffer are cloned.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `config` is not set
    /// - `memory` is not set
    /// - The memory region is too small for the layout
    /// - The config validation fails
    #[expect(dead_code)]
    pub fn build(&self) -> Result<FullyContiguousLayout> {
        let Some(config) = self.config.clone() else {
            return Err(anyhow!("config is required"));
        };
        let Some(memory) = self.memory.clone() else {
            return Err(anyhow!("memory is required"));
        };
        FullyContiguousLayout::new_internal(config, memory, self.kv_block_layout, self.block_format)
    }
}
impl FullyContiguousLayout {
    /// Create a builder for `FullyContiguousLayout`.
    #[expect(dead_code)]
    pub fn builder() -> FullyContiguousLayoutBuilder {
        FullyContiguousLayoutBuilder::new()
    }

    /// Create a new fully contiguous layout with default KV block layout.
    ///
    /// # Arguments
    /// * `config` - Layout configuration
    /// * `memory` - Owned memory region that backs this layout
    ///
    /// # Returns
    /// A new FullyContiguousLayout instance with `KvBlockLayout::Unknown`
    /// and `BlockFormat::default()`.
    pub(crate) fn new(config: LayoutConfig, memory: Buffer) -> Result<Self> {
        Self::new_internal(
            config,
            memory,
            KvBlockLayout::Unknown,
            BlockFormat::default(),
        )
    }

    /// Compute `(region_size, layer_stride, block_stride, required_size)` in bytes.
    ///
    /// Returns `None` if any intermediate product does not fit in `usize`.
    fn checked_sizes(config: &LayoutConfig) -> Option<(usize, usize, usize, usize)> {
        let region_size = config
            .page_size
            .checked_mul(config.inner_dim)?
            .checked_mul(config.dtype_width_bytes)?;
        let layer_stride = region_size.checked_mul(config.outer_dim)?;
        let block_stride = layer_stride.checked_mul(config.num_layers)?;
        let required_size = block_stride.checked_mul(config.num_blocks)?;
        Some((region_size, layer_stride, block_stride, required_size))
    }

    /// Internal constructor with all parameters.
    ///
    /// # Errors
    /// Returns an error if the config fails validation, if the layout size does not
    /// fit in `usize`, or if `memory` is smaller than the size the layout requires.
    fn new_internal(
        config: LayoutConfig,
        memory: Buffer,
        kv_block_layout: KvBlockLayout,
        block_format: BlockFormat,
    ) -> Result<Self> {
        config.validate()?;
        let base_addr = memory.addr();

        // Checked arithmetic: a wrapped product would make `required_size` too small
        // and let an undersized buffer pass the check below.
        let Some((region_size, layer_stride, block_stride, required_size)) =
            Self::checked_sizes(&config)
        else {
            return Err(anyhow!(
                "Layout size computation overflows usize for config: {:?}",
                config
            ));
        };
        // Consecutive outer entries are adjacent pages.
        let outer_stride = region_size;

        // Validate that the memory region is large enough
        if memory.size() < required_size {
            return Err(anyhow!(
                "Memory region too small for layout. Required: {} bytes, got: {} bytes",
                required_size,
                memory.size()
            ));
        }

        Ok(Self {
            config,
            base_addr,
            block_stride,
            layer_stride,
            outer_stride,
            region_size,
            memory,
            block_format,
            kv_block_layout,
        })
    }

    /// Create a new fully contiguous layout with a specific block format and KV block layout.
    ///
    /// # Arguments
    /// * `config` - Layout configuration
    /// * `memory` - Owned memory region that backs this layout
    /// * `block_format` - Format of blocks in memory
    /// * `kv_block_layout` - KV block layout describing dimension ordering
    ///
    /// # Returns
    /// A new FullyContiguousLayout instance
    pub(crate) fn new_with_format(
        config: LayoutConfig,
        memory: Buffer,
        block_format: BlockFormat,
        kv_block_layout: KvBlockLayout,
    ) -> Result<Self> {
        // Note: `new_internal` takes the last two arguments in the opposite order.
        Self::new_internal(config, memory, kv_block_layout, block_format)
    }

    /// Get the block format.
    #[expect(dead_code)]
    pub fn block_format(&self) -> BlockFormat {
        self.block_format
    }

    /// Get the KV block layout.
    #[expect(dead_code)]
    pub fn kv_block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }

    /// Set the KV block layout.
    #[expect(dead_code)]
    pub fn set_kv_block_layout(&mut self, layout: KvBlockLayout) {
        self.kv_block_layout = layout;
    }

    /// Calculate the address of a specific memory region.
    ///
    /// # Errors
    /// Returns an error if `block_id`, `layer_id` or `outer_id` is not below the
    /// corresponding count in the config.
    fn calculate_address(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<usize> {
        if block_id >= self.config.num_blocks {
            return Err(anyhow!(
                "Block ID {} out of range (count: {})",
                block_id,
                self.config.num_blocks
            ));
        }
        if layer_id >= self.config.num_layers {
            return Err(anyhow!(
                "Layer ID {} out of range (count: {})",
                layer_id,
                self.config.num_layers
            ));
        }
        if outer_id >= self.config.outer_dim {
            return Err(anyhow!(
                "Outer ID {} out of range (count: {})",
                outer_id,
                self.config.outer_dim
            ));
        }
        // All indices are in range, so the offset stays below the size that was
        // checked against the buffer at construction time.
        Ok(self.base_addr
            + block_id * self.block_stride
            + layer_id * self.layer_stride
            + outer_id * self.outer_stride)
    }

    /// Get mutable reference to the backing buffer for NIXL registration.
    #[expect(dead_code)]
    pub fn memory_arc_mut(&mut self) -> &mut Buffer {
        &mut self.memory
    }
}
impl Layout for FullyContiguousLayout {
    /// Configuration this layout was built from.
    fn config(&self) -> &LayoutConfig {
        &self.config
    }

    /// The single backing buffer, exposed as a one-element slice.
    fn memory_regions(&self) -> &[Buffer] {
        std::slice::from_ref(&self.memory)
    }

    /// Region of `region_size` bytes for the given (block, layer, outer) index.
    ///
    /// Fails when any index is out of range.
    fn memory_region(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<MemoryRegion> {
        self.calculate_address(block_id, layer_id, outer_id)
            .map(|addr| MemoryRegion::new(addr, self.region_size))
    }

    /// One allocation covering every block.
    fn required_allocations(&self) -> Vec<usize> {
        let total_bytes = self.block_stride * self.config.num_blocks;
        vec![total_bytes]
    }

    /// Always `true` for this layout type.
    fn is_fully_contiguous(&self) -> bool {
        true
    }

    /// Number of blocks, from the config.
    fn num_blocks(&self) -> usize {
        self.config.num_blocks
    }

    /// Number of layers, from the config.
    fn num_layers(&self) -> usize {
        self.config.num_layers
    }

    /// Outer dimension, from the config.
    fn outer_dim(&self) -> usize {
        self.config.outer_dim
    }

    /// Page size, from the config.
    fn page_size(&self) -> usize {
        self.config.page_size
    }

    /// Inner dimension, from the config.
    fn inner_dim(&self) -> usize {
        self.config.inner_dim
    }

    /// Element width in bytes, from the config.
    fn dtype_width_bytes(&self) -> usize {
        self.config.dtype_width_bytes
    }

    /// Serialization details carrying the block format and KV block layout.
    fn serialization_details(&self) -> LayoutTypeDetails {
        let details = FullyContiguousDetails {
            block_format: self.block_format,
            kv_block_layout: self.kv_block_layout,
        };
        LayoutTypeDetails::FullyContiguous(details)
    }

    /// KV block layout stored on this layout.
    fn block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }
}
impl super::ContiguousBlockLayout for FullyContiguousLayout {
    /// Number of blocks, from the config.
    fn num_blocks(&self) -> usize {
        self.config.num_blocks
    }

    /// Size of one whole block in bytes (the block stride).
    fn bytes_per_block(&self) -> usize {
        self.block_stride
    }

    /// Region spanning the entire block `block_id`.
    ///
    /// Fails when `block_id` is not below the number of blocks.
    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion> {
        let count = self.config.num_blocks;
        if block_id < count {
            let start = self.base_addr + block_id * self.block_stride;
            return Ok(MemoryRegion::new(start, self.block_stride));
        }
        Err(anyhow!(
            "Block ID {} out of range (max: {})",
            block_id,
            count
        ))
    }

    /// KV block layout stored on this layout.
    fn block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }
}
// Unit tests for `FullyContiguousLayout`; compiled only with the `testing-kvbm` feature.
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::super::tests::*;
    use super::*;
    // A layout built over a mock buffer of exactly `required_bytes()` must construct
    // successfully and report the configured block count.
    #[test]
    fn test_fully_contiguous_layout_creation() {
        let config = LayoutConfig::builder()
            .num_blocks(10)
            .num_layers(4)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();
        let required_bytes = config.required_bytes();
        assert_eq!(required_bytes, 10 * 4 * 2 * 16 * 128 * 2);
        let memory = Buffer::from_arc(MockMemory::new(0x1000, required_bytes));
        let layout = FullyContiguousLayout::new(config, memory).unwrap();
        assert_eq!(layout.num_blocks(), 10);
        assert!(layout.is_fully_contiguous());
    }
    // Region addresses must follow the [block, layer, outer] nesting: outer entries are
    // adjacent pages, layers are `outer_dim` pages apart, blocks are
    // `outer_dim * num_layers` pages apart.
    #[test]
    fn test_memory_region() {
        let config = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();
        let required_size = config.required_bytes();
        let memory = Buffer::from_arc(MockMemory::new(0x1000, required_size));
        let layout = FullyContiguousLayout::new(config.clone(), memory).unwrap();
        // Test accessing specific memory regions
        let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes;
        // Block 0, Layer 0, Outer 0
        let region = layout.memory_region(0, 0, 0).unwrap();
        assert_eq!(region.addr, 0x1000);
        assert_eq!(region.size(), region_size);
        // Block 0, Layer 0, Outer 1
        let region = layout.memory_region(0, 0, 1).unwrap();
        assert_eq!(region.addr, 0x1000 + region_size);
        assert_eq!(region.size(), region_size);
        // Block 0, Layer 1, Outer 0
        let region = layout.memory_region(0, 1, 0).unwrap();
        assert_eq!(region.addr, 0x1000 + 2 * region_size);
        assert_eq!(region.size(), region_size);
        // Block 1, Layer 0, Outer 0
        let region = layout.memory_region(1, 0, 0).unwrap();
        assert_eq!(
            region.addr,
            0x1000 + (config.outer_dim * config.num_layers * region_size)
        );
        assert_eq!(region.size(), region_size);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! KV Block layout types for describing dimension permutations within blocks.
//!
//! This module provides types for describing how dimensions are ordered within
//! a fully contiguous KV cache block, enabling type-driven kernel selection
//! for transfers between different layout formats.
use serde::{Deserialize, Serialize};
/// Symbolic dimensions that can be permuted within a block.
///
/// The head dimension (hd) is always innermost and not included here.
/// The abbreviation in parentheses is the name printed by this type's `Display` impl.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum BlockDim {
    /// Number of layers (nl)
    Layer,
    /// Outer dimension (no) - typically 2 for K/V, 1 for MLA
    Outer,
    /// Page size / tokens per block (nt)
    Page,
    /// Number of attention heads (nh)
    Head,
}
/// Block layout defined by dimension ordering.
///
/// Describes how the 4 permutable dimensions (layer, outer, page, head) are
/// ordered within a fully contiguous block. The head dimension (hd) is always
/// innermost and implicit.
///
/// The order specifies outer-to-inner dimensions, with head_dim always last.
///
/// # Examples
///
/// - `UniversalTP`: `[nh, nl, no, nt, hd]` - heads outermost for TP resharding
/// - `OperationalNHD`: `[nl, no, nt, nh, hd]` - inner is `[nt, nh, hd]`
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum KvBlockLayout {
    /// Universal format: `[nh, nl, no, nt, hd]`
    ///
    /// Heads are outermost to enable tensor parallelism (TP) resharding.
    /// Cache saved from one TP configuration can be loaded into another
    /// by simply slicing the head dimension differently.
    UniversalTP,
    /// Pipeline parallelism format: `[nl, nh, no, nt, hd]`
    ///
    /// Layers are outermost for pipeline parallelism scenarios.
    UniversalPP,
    /// Operational HND format: `[nl, no, nh, nt, hd]`
    ///
    /// Inner tensor shape is `[nh, nt, hd]` (heads, tokens, head_dim).
    OperationalHND,
    /// Operational NHD format: `[nl, no, nt, nh, hd]`
    ///
    /// Inner tensor shape is `[nt, nh, hd]` (tokens, heads, head_dim).
    /// This is the most common format used by vLLM and other frameworks.
    OperationalNHD,
    /// Custom ordering with explicit dimension list.
    ///
    /// The array specifies dimensions from outermost to innermost,
    /// with head_dim always implicitly last.
    Custom([BlockDim; 4]),
    /// Unknown layout - fallback when format cannot be determined.
    ///
    /// This is the `Default` value and has no dimension ordering.
    /// Operations involving Unknown layouts may fail or require explicit
    /// configuration.
    Unknown,
}
impl Default for KvBlockLayout {
fn default() -> Self {
// Unknown until runtime detection determines the actual format
Self::Unknown
}
}
impl KvBlockLayout {
/// Get the dimension ordering as an array.
///
/// Returns the 4 dimensions from outermost to innermost.
/// Head dimension (hd) is implicit as the innermost dimension.
///
/// # Returns
/// `None` for `Unknown` layout, `Some([BlockDim; 4])` otherwise.
pub fn dim_order(&self) -> Option<[BlockDim; 4]> {
use BlockDim::*;
match self {
Self::UniversalTP => Some([Head, Layer, Outer, Page]),
Self::UniversalPP => Some([Layer, Head, Outer, Page]),
Self::OperationalHND => Some([Layer, Outer, Head, Page]),
Self::OperationalNHD => Some([Layer, Outer, Page, Head]),
Self::Custom(order) => Some(*order),
Self::Unknown => None,
}
}
/// Check if two layouts require transformation (not just copy).
///
/// Returns `true` if the layouts have different dimension orderings,
/// meaning a transformation kernel is needed rather than a simple copy.
///
/// For Unknown→Unknown comparisons, returns `false` (compatible) but emits
/// a warning so these cases can be tracked and fixed.
///
/// Returns `true` if one is Unknown and the other is Known (conservative).
pub fn requires_transform(&self, other: &Self) -> bool {
match (self.dim_order(), other.dim_order()) {
(Some(a), Some(b)) => a != b,
(None, None) => {
// Unknown→Unknown is compatible, but warn so we can fix these
tracing::warn!("Unknown→Unknown KvBlockLayout comparison - this should be fixed");
false
}
// Unknown→Known requires transform (conservative)
_ => true,
}
}
/// Check if this is an operational layout (NHD or HND).
///
/// Operational layouts are used for direct computation and have
/// layer/outer as the outermost dimensions.
pub fn is_operational(&self) -> bool {
matches!(self, Self::OperationalNHD | Self::OperationalHND)
}
/// Check if this is a universal layout (TP or PP).
///
/// Universal layouts are optimized for storage and transfer,
/// with different parallelism-friendly orderings.
pub fn is_universal(&self) -> bool {
matches!(self, Self::UniversalTP | Self::UniversalPP)
}
/// Get the layout name as a string identifier.
pub fn name(&self) -> &'static str {
match self {
Self::UniversalTP => "universal_tp",
Self::UniversalPP => "universal_pp",
Self::OperationalHND => "operational_hnd",
Self::OperationalNHD => "operational_nhd",
Self::Custom(_) => "custom",
Self::Unknown => "unknown",
}
}
/// Try to create a KvBlockLayout from an InnerShape.
///
/// This provides compatibility with the existing InnerShape enum.
pub(crate) fn from_inner_shape(inner_shape: super::InnerShape) -> Self {
match inner_shape {
super::InnerShape::NHD => Self::OperationalNHD,
super::InnerShape::HND => Self::OperationalHND,
super::InnerShape::Unknown => Self::Unknown,
}
}
/// Convert to InnerShape if this is an operational layout.
///
/// Returns `None` for universal or custom layouts.
pub(crate) fn to_inner_shape(self) -> Option<super::InnerShape> {
match self {
Self::OperationalNHD => Some(super::InnerShape::NHD),
Self::OperationalHND => Some(super::InnerShape::HND),
Self::Unknown => Some(super::InnerShape::Unknown),
_ => None,
}
}
}
impl std::fmt::Display for KvBlockLayout {
    /// Writes a human-readable label; named layouts include their dimension order,
    /// `Custom` prints its ordering with `Debug` formatting.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            // Only the custom variant needs formatting arguments.
            Self::Custom(order) => return write!(f, "Custom {:?}", order),
            Self::UniversalTP => "Universal TP [nh, nl, no, nt, hd]",
            Self::UniversalPP => "Universal PP [nl, nh, no, nt, hd]",
            Self::OperationalHND => "Operational HND [nl, no, nh, nt, hd]",
            Self::OperationalNHD => "Operational NHD [nl, no, nt, nh, hd]",
            Self::Unknown => "Unknown",
        };
        f.write_str(label)
    }
}
impl std::fmt::Display for BlockDim {
    /// Writes the two-letter abbreviation of the dimension (`nl`, `no`, `nt`, `nh`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let abbreviation = match self {
            Self::Head => "nh",
            Self::Page => "nt",
            Self::Outer => "no",
            Self::Layer => "nl",
        };
        f.write_str(abbreviation)
    }
}
// ============================================================================
// KvBlocks - Collection wrapper for blocks with shared layout
// ============================================================================
use crate::BlockId;
use crate::layout::PhysicalLayout;
use std::sync::Arc;
/// A collection of blocks with a shared layout configuration and block layout type.
///
/// `KvBlocks` provides a convenient way to group blocks that should be treated
/// uniformly in transfer operations. All blocks in the collection share:
/// - The same [`PhysicalLayout`] (memory organization)
/// - The same [`KvBlockLayout`] interpretation (dimension ordering)
///
/// This enables efficient batch transfers with optional layout override.
///
/// # Example
///
/// ```ignore
/// // Create blocks with universal layout override
/// let blocks = KvBlocks::new(
///     physical_layout.clone(),
///     vec![0, 1, 2, 3],  // block IDs
///     Some(KvBlockLayout::UniversalTP),
/// )?;
///
/// // Use in transfers - the override tells the transfer system
/// // to interpret these blocks as universal format
/// ```
#[derive(Debug, Clone)]
pub struct KvBlocks {
    /// The physical layout containing these blocks
    layout: Arc<PhysicalLayout>,
    /// Block IDs within the layout (each checked against the layout's block count
    /// by `KvBlocks::new`)
    block_ids: Vec<BlockId>,
    /// Optional layout override (None = use layout's native block_layout).
    /// `KvBlocks::new` stores `None` when the requested override equals the native layout.
    kv_layout_override: Option<KvBlockLayout>,
}
impl KvBlocks {
    /// Create a new KvBlocks collection.
    ///
    /// # Arguments
    /// * `layout` - The physical layout containing the blocks
    /// * `block_ids` - Block IDs to include in this collection
    /// * `kv_layout_override` - Optional override for the block layout interpretation.
    ///   If `None`, uses the layout's native `block_layout()`.
    ///   If `Some`, overrides the interpretation for transfers.
    ///
    /// # Validation
    /// - Every block ID must be below the layout's block count
    /// - For layouts that are not fully contiguous (layer-separate), only operational
    ///   layouts (NHD/HND) are valid overrides
    /// - For fully contiguous layouts, any layout is valid
    /// - If the override matches the native layout, it is normalized to None
    ///
    /// # Errors
    /// Returns an error naming the first out-of-range block ID, or the rejected override.
    pub fn new(
        layout: Arc<PhysicalLayout>,
        block_ids: Vec<BlockId>,
        kv_layout_override: Option<KvBlockLayout>,
    ) -> anyhow::Result<Self> {
        // Reject the first block ID that falls outside the layout.
        let num_blocks = layout.layout().num_blocks();
        if let Some(bad_id) = block_ids.iter().copied().find(|&id| id >= num_blocks) {
            return Err(anyhow::anyhow!(
                "Block ID {} out of range (layout has {} blocks)",
                bad_id,
                num_blocks
            ));
        }

        // Layer-separate layouts can only be reinterpreted as operational formats.
        if let Some(requested) = kv_layout_override {
            let layer_separate = !layout.layout().is_fully_contiguous();
            if layer_separate && !requested.is_operational() {
                return Err(anyhow::anyhow!(
                    "Layer-separate layouts only support operational block layouts (NHD/HND), got {:?}",
                    requested
                ));
            }
        }

        // An override identical to the native layout carries no information; drop it.
        let kv_layout_override =
            kv_layout_override.filter(|requested| *requested != layout.layout().block_layout());

        Ok(Self {
            layout,
            block_ids,
            kv_layout_override,
        })
    }

    /// Create a KvBlocks collection without layout override.
    #[expect(dead_code)]
    pub fn from_layout(
        layout: Arc<PhysicalLayout>,
        block_ids: Vec<BlockId>,
    ) -> anyhow::Result<Self> {
        Self::new(layout, block_ids, None)
    }

    /// Get the physical layout.
    #[expect(dead_code)]
    pub fn layout(&self) -> &Arc<PhysicalLayout> {
        &self.layout
    }

    /// Get the block IDs.
    #[expect(dead_code)]
    pub fn block_ids(&self) -> &[BlockId] {
        self.block_ids.as_slice()
    }

    /// Get the effective block layout (override or native).
    pub fn effective_block_layout(&self) -> KvBlockLayout {
        match self.kv_layout_override {
            Some(overridden) => overridden,
            None => self.layout.layout().block_layout(),
        }
    }

    /// Get the layout override if set.
    #[expect(dead_code)]
    pub fn layout_override(&self) -> Option<KvBlockLayout> {
        self.kv_layout_override
    }

    /// Check if this collection has a layout override.
    #[expect(dead_code)]
    pub fn has_override(&self) -> bool {
        self.kv_layout_override.is_some()
    }

    /// Get the number of blocks in this collection.
    #[expect(dead_code)]
    pub fn len(&self) -> usize {
        self.block_ids.len()
    }

    /// Check if the collection is empty.
    #[expect(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.block_ids.is_empty()
    }

    /// Check if a transfer between two KvBlocks collections requires transformation.
    ///
    /// Returns `true` if the effective layouts differ and a transformation kernel
    /// is needed rather than a simple copy.
    #[expect(dead_code)]
    pub fn requires_transform_to(&self, dst: &KvBlocks) -> bool {
        let source_layout = self.effective_block_layout();
        let target_layout = dst.effective_block_layout();
        source_layout.requires_transform(&target_layout)
    }
}
// Unit tests for `KvBlockLayout`; compiled only with the `testing-kvbm` feature.
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::*;
    // Named layouts expose their documented ordering; Unknown has none.
    #[test]
    fn test_dim_order() {
        use BlockDim::*;
        assert_eq!(
            KvBlockLayout::UniversalTP.dim_order(),
            Some([Head, Layer, Outer, Page])
        );
        assert_eq!(
            KvBlockLayout::OperationalNHD.dim_order(),
            Some([Layer, Outer, Page, Head])
        );
        assert_eq!(KvBlockLayout::Unknown.dim_order(), None);
    }
    #[test]
    fn test_requires_transform() {
        // Same layout - no transform
        assert!(!KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::OperationalNHD));
        // Different layouts - transform required
        assert!(KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::UniversalTP));
        assert!(KvBlockLayout::OperationalHND.requires_transform(&KvBlockLayout::OperationalNHD));
        // Unknown→Known requires transform (conservative)
        assert!(KvBlockLayout::Unknown.requires_transform(&KvBlockLayout::OperationalNHD));
        assert!(KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::Unknown));
        // Unknown→Unknown is compatible (but emits warning)
        assert!(!KvBlockLayout::Unknown.requires_transform(&KvBlockLayout::Unknown));
    }
    // Only the NHD/HND variants count as operational.
    #[test]
    fn test_is_operational() {
        assert!(KvBlockLayout::OperationalNHD.is_operational());
        assert!(KvBlockLayout::OperationalHND.is_operational());
        assert!(!KvBlockLayout::UniversalTP.is_operational());
        assert!(!KvBlockLayout::Unknown.is_operational());
    }
    // Only the TP/PP variants count as universal.
    #[test]
    fn test_is_universal() {
        assert!(KvBlockLayout::UniversalTP.is_universal());
        assert!(KvBlockLayout::UniversalPP.is_universal());
        assert!(!KvBlockLayout::OperationalNHD.is_universal());
    }
    #[test]
    fn test_default() {
        assert_eq!(KvBlockLayout::default(), KvBlockLayout::Unknown);
    }
    // JSON round-trip for a unit variant and for a `Custom` ordering.
    #[test]
    fn test_serialization() {
        let layout = KvBlockLayout::UniversalTP;
        let json = serde_json::to_string(&layout).unwrap();
        let deserialized: KvBlockLayout = serde_json::from_str(&json).unwrap();
        assert_eq!(layout, deserialized);
        // Test custom layout
        let custom = KvBlockLayout::Custom([
            BlockDim::Head,
            BlockDim::Page,
            BlockDim::Layer,
            BlockDim::Outer,
        ]);
        let json = serde_json::to_string(&custom).unwrap();
        let deserialized: KvBlockLayout = serde_json::from_str(&json).unwrap();
        assert_eq!(custom, deserialized);
    }
    // InnerShape <-> KvBlockLayout conversions; universal layouts have no InnerShape.
    #[test]
    fn test_inner_shape_conversion() {
        use super::super::InnerShape;
        assert_eq!(
            KvBlockLayout::from_inner_shape(InnerShape::NHD),
            KvBlockLayout::OperationalNHD
        );
        assert_eq!(
            KvBlockLayout::from_inner_shape(InnerShape::HND),
            KvBlockLayout::OperationalHND
        );
        assert_eq!(
            KvBlockLayout::OperationalNHD.to_inner_shape(),
            Some(InnerShape::NHD)
        );
        assert_eq!(KvBlockLayout::UniversalTP.to_inner_shape(), None);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Layer-separate layout implementation.
//!
//! This layout stores each layer in its own allocation, which is the typical
//! vLLM layout. Each layer can be either block-contiguous or outer-contiguous:
//! - Block-contiguous: [num_blocks, outer_dim, page_size, inner_dim]
//! - Outer-contiguous: [outer_dim, num_blocks, page_size, inner_dim]
use anyhow::{Result, anyhow};
use validator::Validate;
use super::serialize::{LayerSeparateDetails, LayoutTypeDetails};
use super::{
BlockDimension, Buffer, InnerShape, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor,
MemoryRegion,
};
/// Layer-separate layout where each layer has its own allocation.
///
/// A region's address is
/// `layer_base_addrs[layer_id] + block_id * block_stride + outer_id * outer_stride`.
#[derive(Debug)]
pub struct LayerSeparateLayout {
    /// Configuration the strides below were computed from
    config: LayoutConfig,
    /// Base addresses for each layer
    layer_base_addrs: Vec<usize>,
    /// Which of the first two tensor dimensions is the block dimension
    block_dim: BlockDimension,
    /// Stride between blocks in bytes
    block_stride: usize,
    /// Stride between outer dimensions in bytes
    outer_stride: usize,
    /// Size of each memory region (page) in bytes
    region_size: usize,
    /// Owned memory regions backing this layout (one per layer)
    memory_regions: Vec<Buffer>,
    /// KV block layout for inner tensor format (must be operational: NHD or HND)
    kv_block_layout: KvBlockLayout,
}
/// Builder for creating [`LayerSeparateLayout`] instances.
///
/// `config`, `memory` and `block_dim` are required; the KV block layout
/// falls back to its `Default` value when no inner shape is set.
///
/// # Example
///
/// ```ignore
/// let layout = LayerSeparateLayout::builder()
///     .config(config)
///     .memory(memory_regions)
///     .block_dim(BlockDimension::BlockIsFirstDim)
///     .inner_shape(InnerShape::NHD)
///     .build()?;
/// ```
#[derive(Debug, Default)]
pub struct LayerSeparateLayoutBuilder {
    /// Layout configuration (required by `build`)
    config: Option<LayoutConfig>,
    /// Backing buffers, one per layer (required by `build`)
    memory: Option<Vec<Buffer>>,
    /// Block dimension ordering (required by `build`)
    block_dim: Option<BlockDimension>,
    /// KV block layout derived from the inner shape
    kv_block_layout: KvBlockLayout,
}
impl LayerSeparateLayoutBuilder {
    /// Create an empty builder: no config, no memory, no block dimension,
    /// and `KvBlockLayout::Unknown`.
    pub fn new() -> Self {
        // The derived `Default` yields exactly these values.
        Self::default()
    }

    /// Set the layout configuration.
    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
        self.config.replace(config);
        self
    }

    /// Set the memory buffers backing this layout (one per layer).
    pub fn memory(&mut self, memory: Vec<Buffer>) -> &mut Self {
        self.memory.replace(memory);
        self
    }

    /// Set the block dimension ordering.
    pub fn block_dim(&mut self, block_dim: BlockDimension) -> &mut Self {
        self.block_dim.replace(block_dim);
        self
    }

    /// Set the inner shape, which translates to the KV block layout.
    ///
    /// Only operational layouts (NHD, HND) are valid for layer-separate layouts.
    ///
    /// - `InnerShape::NHD` -> `KvBlockLayout::OperationalNHD`
    /// - `InnerShape::HND` -> `KvBlockLayout::OperationalHND`
    /// - `InnerShape::Unknown` -> `KvBlockLayout::Unknown`
    ///
    /// Default: `KvBlockLayout::Unknown`
    pub fn inner_shape(&mut self, shape: InnerShape) -> &mut Self {
        let translated = KvBlockLayout::from_inner_shape(shape);
        self.kv_block_layout = translated;
        self
    }

    /// Build the [`LayerSeparateLayout`].
    ///
    /// The builder is not consumed; the stored config and buffers are cloned.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - `config` is not set
    /// - `memory` is not set
    /// - `block_dim` is not set
    /// - The memory region count doesn't match `num_layers`
    /// - Any memory region is too small for the layout
    /// - The config validation fails
    pub fn build(&self) -> Result<LayerSeparateLayout> {
        let Some(config) = self.config.clone() else {
            return Err(anyhow!("config is required"));
        };
        let Some(memory) = self.memory.clone() else {
            return Err(anyhow!("memory is required"));
        };
        let Some(block_dim) = self.block_dim else {
            return Err(anyhow!("block_dim is required"));
        };
        LayerSeparateLayout::new_internal(config, memory, block_dim, self.kv_block_layout)
    }
}
impl LayerSeparateLayout {
    /// Create a builder for `LayerSeparateLayout`.
    pub fn builder() -> LayerSeparateLayoutBuilder {
        LayerSeparateLayoutBuilder::new()
    }

    /// Create a new layer-separate layout with default KV block layout.
    ///
    /// # Arguments
    /// - `config` - Layout configuration
    /// - `memory` - Vector of owned memory regions (one per layer)
    /// - `block_dim` - Whether block or outer dimension is first
    ///
    /// # Returns
    /// A new LayerSeparateLayout instance with `KvBlockLayout::Unknown`
    pub(crate) fn new(
        config: LayoutConfig,
        memory: Vec<Buffer>,
        block_dim: BlockDimension,
    ) -> Result<Self> {
        Self::new_internal(config, memory, block_dim, KvBlockLayout::Unknown)
    }

    /// Compute `(region_size, block_stride, outer_stride, required_size)` in bytes,
    /// where `required_size` is the size each per-layer buffer must provide.
    ///
    /// Returns `None` if any product does not fit in `usize`.
    fn checked_sizes(
        config: &LayoutConfig,
        block_dim: BlockDimension,
    ) -> Option<(usize, usize, usize, usize)> {
        let region_size = config
            .page_size
            .checked_mul(config.inner_dim)?
            .checked_mul(config.dtype_width_bytes)?;
        let (block_stride, outer_stride) = match block_dim {
            // Layout: [outer_dim, num_blocks, page_size, inner_dim]
            BlockDimension::BlockIsSecondDim => {
                (region_size, region_size.checked_mul(config.num_blocks)?)
            }
            // Layout: [num_blocks, outer_dim, page_size, inner_dim]
            BlockDimension::BlockIsFirstDim => {
                (region_size.checked_mul(config.outer_dim)?, region_size)
            }
        };
        let required_size = config
            .num_blocks
            .checked_mul(config.outer_dim)?
            .checked_mul(region_size)?;
        Some((region_size, block_stride, outer_stride, required_size))
    }

    /// Internal constructor with all parameters.
    ///
    /// # Errors
    /// Returns an error if the config fails validation, if the number of buffers
    /// differs from `num_layers`, if the layout size does not fit in `usize`, or if
    /// any buffer is smaller than the per-layer size the layout requires.
    fn new_internal(
        config: LayoutConfig,
        memory: Vec<Buffer>,
        block_dim: BlockDimension,
        kv_block_layout: KvBlockLayout,
    ) -> Result<Self> {
        config.validate()?;
        if memory.len() != config.num_layers {
            return Err(anyhow!(
                "Memory region count ({}) must match num_layers ({})",
                memory.len(),
                config.num_layers
            ));
        }

        // Checked arithmetic: a wrapped product would make `required_size` too small
        // and let undersized buffers pass the check below.
        let Some((region_size, block_stride, outer_stride, required_size)) =
            Self::checked_sizes(&config, block_dim)
        else {
            return Err(anyhow!(
                "Layout size computation overflows usize for config: {:?}",
                config
            ));
        };

        // Extract base addresses and validate sizes
        let mut layer_base_addrs = Vec::with_capacity(config.num_layers);
        for (i, mem) in memory.iter().enumerate() {
            if mem.size() < required_size {
                return Err(anyhow!(
                    "Memory region {} too small for layout. Required: {} bytes, got: {} bytes",
                    i,
                    required_size,
                    mem.size()
                ));
            }
            layer_base_addrs.push(mem.addr());
        }

        Ok(Self {
            config,
            layer_base_addrs,
            block_dim,
            block_stride,
            outer_stride,
            region_size,
            memory_regions: memory,
            kv_block_layout,
        })
    }

    /// Calculate the address of a specific memory region.
    ///
    /// # Errors
    /// Returns an error if `block_id`, `layer_id` or `outer_id` is not below the
    /// corresponding count in the config.
    fn calculate_address(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<usize> {
        if block_id >= self.config.num_blocks {
            return Err(anyhow!(
                "Block ID {} out of range (max: {})",
                block_id,
                self.config.num_blocks
            ));
        }
        if layer_id >= self.config.num_layers {
            return Err(anyhow!(
                "Layer ID {} out of range (max: {})",
                layer_id,
                self.config.num_layers
            ));
        }
        if outer_id >= self.config.outer_dim {
            return Err(anyhow!(
                "Outer ID {} out of range (max: {})",
                outer_id,
                self.config.outer_dim
            ));
        }
        // `layer_id < num_layers` and one base address was stored per layer,
        // so this index is in bounds.
        let base_addr = self.layer_base_addrs[layer_id];
        let offset = block_id * self.block_stride + outer_id * self.outer_stride;
        Ok(base_addr + offset)
    }

    /// Get the block dimension ordering this layout was built with.
    #[expect(dead_code)]
    pub fn block_dim(&self) -> BlockDimension {
        self.block_dim
    }

    /// Get mutable reference to the memory regions for NIXL registration.
    #[expect(dead_code)]
    pub fn memory_regions_mut(&mut self) -> &mut [Buffer] {
        &mut self.memory_regions
    }

    /// Get the KV block layout.
    #[expect(dead_code)]
    pub fn kv_block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }

    /// Set the KV block layout from an inner shape.
    ///
    /// Note: Only operational layouts (NHD, HND) are valid for layer-separate layouts.
    #[expect(dead_code)]
    pub fn set_kv_block_layout(&mut self, inner_shape: InnerShape) {
        self.kv_block_layout = KvBlockLayout::from_inner_shape(inner_shape);
    }
}
impl Layout for LayerSeparateLayout {
    /// Borrow the configuration this layout was constructed with.
    fn config(&self) -> &LayoutConfig {
        &self.config
    }

    /// Borrow the per-layer buffers backing this layout.
    fn memory_regions(&self) -> &[Buffer] {
        self.memory_regions.as_slice()
    }

    /// Resolve `(block_id, layer_id, outer_id)` to a region of `region_size` bytes.
    ///
    /// Any error from the address calculation is passed through unchanged.
    fn memory_region(
        &self,
        block_id: usize,
        layer_id: usize,
        outer_id: usize,
    ) -> Result<MemoryRegion> {
        self.calculate_address(block_id, layer_id, outer_id)
            .map(|addr| MemoryRegion::new(addr, self.region_size))
    }

    /// Report one allocation size per layer, each covering
    /// `num_blocks * outer_dim` regions.
    fn required_allocations(&self) -> Vec<usize> {
        let layer_bytes = self.config.num_blocks * self.config.outer_dim * self.region_size;
        std::iter::repeat(layer_bytes)
            .take(self.config.num_layers)
            .collect()
    }

    /// Always `false`: this layout is built from one buffer per layer.
    fn is_fully_contiguous(&self) -> bool {
        false
    }

    /// Number of blocks, as recorded in the configuration.
    fn num_blocks(&self) -> usize {
        self.config().num_blocks
    }

    /// Number of layers, as recorded in the configuration.
    fn num_layers(&self) -> usize {
        self.config().num_layers
    }

    /// Outer dimension size, as recorded in the configuration.
    fn outer_dim(&self) -> usize {
        self.config().outer_dim
    }

    /// Page size, as recorded in the configuration.
    fn page_size(&self) -> usize {
        self.config().page_size
    }

    /// Inner dimension size, as recorded in the configuration.
    fn inner_dim(&self) -> usize {
        self.config().inner_dim
    }

    /// Element width in bytes, as recorded in the configuration.
    fn dtype_width_bytes(&self) -> usize {
        self.config().dtype_width_bytes
    }

    /// Package the dimension ordering and KV block layout for serialization.
    fn serialization_details(&self) -> LayoutTypeDetails {
        let details = LayerSeparateDetails {
            block_dim: self.block_dim,
            kv_block_layout: self.kv_block_layout,
        };
        LayoutTypeDetails::LayerSeparate(details)
    }

    /// KV block layout currently stored on this layout.
    fn block_layout(&self) -> KvBlockLayout {
        self.kv_block_layout
    }
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use super::super::tests::*;
    use super::*;

    /// Block-first ordering: construction succeeds and reports one
    /// allocation per layer.
    #[test]
    fn test_layer_separate_block_contiguous() {
        let layer_bytes = 10 * 2 * 16 * 128 * 2;

        // One mock buffer per layer, laid out back to back from 0x1000.
        let mut buffers: Vec<Buffer> = Vec::new();
        for layer in 0..4 {
            let base = 0x1000 + layer * layer_bytes;
            buffers.push(Buffer::from_arc(MockMemory::new(base, layer_bytes)));
        }

        let cfg = LayoutConfig::builder()
            .num_blocks(10)
            .num_layers(4)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        let layout =
            LayerSeparateLayout::new(cfg, buffers, BlockDimension::BlockIsFirstDim).unwrap();

        assert_eq!(layout.num_blocks(), 10);
        assert!(!layout.is_fully_contiguous());
        assert_eq!(layout.required_allocations().len(), 4);
    }

    /// Outer-first ordering: construction succeeds with the same buffers.
    #[test]
    fn test_layer_separate_outer_contiguous() {
        let layer_bytes = 10 * 2 * 16 * 128 * 2;

        let mut buffers: Vec<Buffer> = Vec::new();
        for layer in 0..4 {
            let base = 0x1000 + layer * layer_bytes;
            buffers.push(Buffer::from_arc(MockMemory::new(base, layer_bytes)));
        }

        let cfg = LayoutConfig::builder()
            .num_blocks(10)
            .num_layers(4)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        let layout =
            LayerSeparateLayout::new(cfg, buffers, BlockDimension::BlockIsSecondDim).unwrap();

        assert_eq!(layout.num_blocks(), 10);
        assert!(!layout.is_fully_contiguous());
    }

    /// Address lookups land on the expected layer base and outer offset.
    #[test]
    fn test_memory_region() {
        let layer_bytes = 2 * 2 * 16 * 128 * 2;
        let bytes_per_region = 16 * 128 * 2;

        let mut buffers: Vec<Buffer> = Vec::new();
        for layer in 0..2 {
            let base = 0x1000 + layer * layer_bytes;
            buffers.push(Buffer::from_arc(MockMemory::new(base, layer_bytes)));
        }

        let cfg = LayoutConfig::builder()
            .num_blocks(2)
            .num_layers(2)
            .outer_dim(2)
            .page_size(16)
            .inner_dim(128)
            .dtype_width_bytes(2)
            .build()
            .unwrap();

        let layout =
            LayerSeparateLayout::new(cfg, buffers, BlockDimension::BlockIsFirstDim).unwrap();

        // Block 0, layer 0, outer 0 sits at layer 0's base address.
        let first = layout.memory_region(0, 0, 0).unwrap();
        assert_eq!(first.addr, 0x1000);
        assert_eq!(first.size, bytes_per_region);

        // Block 0, layer 1, outer 0 sits at layer 1's base address.
        let next_layer = layout.memory_region(0, 1, 0).unwrap();
        assert_eq!(next_layer.addr, 0x1000 + layer_bytes);
        assert_eq!(next_layer.size, bytes_per_region);

        // Block 0, layer 0, outer 1 is one region further into layer 0.
        let next_outer = layout.memory_region(0, 0, 1).unwrap();
        assert_eq!(next_outer.addr, 0x1000 + bytes_per_region);
        assert_eq!(next_outer.size, bytes_per_region);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Decoupled layout system for block management.
//!
//! This module provides a simplified layout abstraction that:
//! - Maps block IDs to physical memory regions (address + size)
//! - Decouples memory regions from storage type information
//! - Specifies allocation requirements without performing allocation
//! - Uses trait objects for memory ownership
pub(crate) mod builder;
mod config;
mod fully_contiguous;
mod kv_block_layout;
mod layer_separate;
mod physical;
mod serialize;
mod validation;
#[cfg(all(test, feature = "testing-kvbm"))]
pub(super) mod tests;
// #[cfg(test)]
// mod integration_tests;
pub use builder::PhysicalLayoutBuilder;
pub use config::{BlockDimension, LayoutConfig};
pub(crate) use fully_contiguous::FullyContiguousLayout;
pub use kv_block_layout::{BlockDim, KvBlockLayout};
pub(crate) use layer_separate::LayerSeparateLayout;
pub use physical::NixlMetadata;
pub use physical::PhysicalLayout;
pub(crate) use serialize::LayoutDescriptor;
pub use serialize::{BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutTypeDetails};
// mod registration;
// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager};
use anyhow::Result;
use serde::{Deserialize, Serialize};
pub(crate) use dynamo_memory::MemoryDescriptor;
pub use dynamo_memory::{Buffer, MemoryRegion};
/// Core layout trait for mapping block IDs to memory regions.
///
/// Layouts specify how KV cache blocks are organized in memory without
/// performing allocation themselves. They provide:
/// - Memory region lookup for specific blocks
/// - Allocation requirements for external allocators
/// - Metadata about block organization
///
/// The `Send + Sync + Debug` supertraits mean every implementation can be
/// shared across threads and printed in diagnostics.
pub trait Layout: Send + Sync + std::fmt::Debug {
/// Get the configuration for this layout.
fn config(&self) -> &LayoutConfig;
/// Get the root memory regions backing this layout.
///
/// These regions correspond to the concrete allocations that store the layout's data.
/// Implementations that derive memory procedurally can return an empty slice.
fn memory_regions(&self) -> &[Buffer];
/// Get the memory region for a specific `block_id`, `layer_id`, `outer_id`.
///
/// Returns a [`MemoryRegion`] for the contiguous region specified by the given
/// `block_id`, `layer_id`, `outer_id`.
///
/// # Arguments
/// * `block_id` - The ID of the block to query (0..num_blocks)
/// * `layer_id` - The ID of the layer to query (0..num_layers)
/// * `outer_id` - The ID of the outer dimension to query (0..outer_dim)
///
/// # Errors
/// The layer-separate implementation in this crate returns an error when any
/// ID is outside its range; other implementations presumably do the same
/// (confirm per implementor).
fn memory_region(
&self,
block_id: usize,
layer_id: usize,
outer_id: usize,
) -> Result<MemoryRegion>;
/// Get the allocation requirements for this layout.
///
/// Returns a vector of allocation sizes needed to back this layout.
/// For fully contiguous layouts, this will be a single size.
/// For layer-separate layouts, this will contain one size per layer.
///
/// # Returns
/// Vector of allocation sizes in bytes.
fn required_allocations(&self) -> Vec<usize>;
/// Check if this layout uses fully contiguous memory.
///
/// Fully contiguous layouts have all blocks in a single allocation,
/// which enables certain optimizations.
fn is_fully_contiguous(&self) -> bool;
/// Get the total number of blocks in this layout.
fn num_blocks(&self) -> usize;
/// Get the number of layers per block.
fn num_layers(&self) -> usize;
/// Get the outer dimension size.
///
/// In typical KV cache layouts, this is often 2 (for K and V),
/// but can be 1 for architectures like MLA.
fn outer_dim(&self) -> usize;
/// Get the page size (often corresponds to block size in tokens).
fn page_size(&self) -> usize;
/// Get the inner dimension size.
///
/// This is typically the hidden size divided by tensor parallel size.
fn inner_dim(&self) -> usize;
/// Get the data type width in bytes.
fn dtype_width_bytes(&self) -> usize;
/// Get serialization details for this layout type.
///
/// This provides the layout-type-specific information needed to serialize
/// and reconstruct the layout on a remote node.
fn serialization_details(&self) -> serialize::LayoutTypeDetails;
/// Get the KV block layout describing how dimensions are permuted within blocks.
///
/// Returns the internal tensor ordering for blocks in this layout.
/// For layer-separate layouts, this describes the inner tensor format.
/// For fully contiguous layouts, this describes the full block format.
fn block_layout(&self) -> KvBlockLayout;
}
/// Inner shape format for tensor layout.
///
/// Values of this enum are converted into a [`KvBlockLayout`] through
/// `KvBlockLayout::from_inner_shape`.
// `NHD` and `HND` are the conventional names for these tensor formats, so the
// upper-case acronym lint is silenced rather than renaming the variants.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) enum InnerShape {
/// Unknown shape - fallback when we can't determine the format
Unknown,
/// NHD format: [block_size, num_heads, head_dim]
/// Common for attention layers where N=tokens, H=heads, D=dimension
NHD,
/// HND format: [num_heads, block_size, head_dim]
/// Alternative layout with heads first
HND,
}
/// Trait for layouts that provide contiguous per-block memory regions.
///
/// This trait enables direct access to entire blocks as contiguous memory,
/// without requiring layer/outer indexing. It is implemented by
/// [`FullyContiguousLayout`] but NOT by [`LayerSeparateLayout`] (which
/// stores each layer separately).
///
/// Use this trait when you need to:
/// - Access raw block memory for transformation kernels
/// - Reinterpret block memory under different [`KvBlockLayout`] formats
/// - Perform whole-block operations without layer decomposition
///
/// Note: [`Layout`] declares methods with the same names (`num_blocks`,
/// `block_layout`). For a type implementing both traits, call sites that have
/// both traits in scope need fully-qualified syntax to pick one.
pub trait ContiguousBlockLayout: Send + Sync + std::fmt::Debug {
/// Get the total number of blocks in this layout.
fn num_blocks(&self) -> usize;
/// Get the size of each block in bytes.
///
/// Presumably this equals the size of the region returned by
/// [`ContiguousBlockLayout::raw_block`] — confirm against implementors.
fn bytes_per_block(&self) -> usize;
/// Get the contiguous memory region for a specific block.
///
/// # Arguments
/// * `block_id` - The ID of the block to query (0..num_blocks)
///
/// # Returns
/// A [`MemoryRegion`] covering the entire block's memory.
///
/// # Errors
/// Returns an error if `block_id` is out of range.
fn raw_block(&self, block_id: usize) -> Result<MemoryRegion>;
/// Get the KV block layout for this contiguous layout.
fn block_layout(&self) -> KvBlockLayout;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment