feat: kvbm-physical (#6490)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: kvbm-physical (#6490)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
9ab148dc · Ryan Olson · GitHub · 7546c193 · 9ab148dc · 9ab148dc
Unverified Commit 9ab148dc authored Mar 01, 2026 by Ryan Olson Committed by GitHub Mar 01, 2026
20 changed files
--- a/.github/workflows/container-validation-dynamo.yml
+++ b/.github/workflows/container-validation-dynamo.yml
@@ -154,6 +154,8 @@ jobs:
                      cargo fmt -- --check && \
                      cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
                      cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
+                      cargo clippy -p kvbm-physical --no-deps --all-targets -- -D warnings && \
+                      cargo test --locked -p kvbm-physical --features testing-kvbm -- --nocapture --test-threads=4 && \
                      /workspace/container/use-sccache.sh show-stats "Rust Checks"'
  test-parallel:

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3877,6 +3877,14 @@ dependencies = [
 "tracing",
 ]
+[[package]]
+name = "kvbm-common"
+version = "1.0.0"
+dependencies = [
+ "dynamo-tokens",
+ "serde",
+]
 [[package]]
 name = "kvbm-kernels"
 version = "1.0.0"
@@ -3894,7 +3902,6 @@ version = "1.0.0"
 dependencies = [
 "anyhow",
 "async-stream",
- "bincode 2.0.1",
 "bytes",
 "derive_builder",
 "dynamo-tokens",
@@ -3914,6 +3921,33 @@ dependencies = [
 "xxhash-rust",
 ]
+[[package]]
+name = "kvbm-physical"
+version = "1.0.0"
+dependencies = [
+ "aligned-vec",
+ "anyhow",
+ "bincode 2.0.1",
+ "blake3",
+ "cudarc",
+ "derive-getters",
+ "derive_builder",
+ "dynamo-memory",
+ "futures",
+ "kvbm-common",
+ "kvbm-kernels",
+ "parking_lot",
+ "rstest 0.26.1",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "uuid",
+ "validator",
+ "velo-events",
+]
 [[package]]
 name = "lalrpop-util"
 version = "0.20.2"
@@ -3999,12 +4033,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
+ "plain",
 "redox_syscall 0.7.3",
 ]
@@ -5474,6 +5509,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
 [[package]]
 name = "plotters"
 version = "0.3.7"

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,8 +10,10 @@ members = [
    "lib/mocker",
    "lib/kv-router",
    "lib/memory",
+    "lib/kvbm-common",
    "lib/kvbm-kernels",
    "lib/kvbm-logical",
+    "lib/kvbm-physical",
    "lib/async-openai",
    "lib/parsers",
    "lib/bench",
@@ -45,11 +47,13 @@ dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features =
 dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }
 # kvbm
+kvbm-common = { path = "lib/kvbm-common", version = "1.0.0" }
 kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" }
 kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" }
+kvbm-physical = { path = "lib/kvbm-physical", version = "1.0.0" }
 # velo
-velo-events = { path = "lib/velo-events", version = "0.9.0" }
+velo-events = { path = "lib/velo-events", version = "1.0.0" }
 # External dependencies
 anyhow = { version = "1" }
@@ -66,7 +70,7 @@ chrono = { version = "0.4", default-features = false, features = [
    "now",
    "serde",
 ] }
-cudarc = { version = "0.19.2", features = ["cuda-12020"] }
+cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"] }
 dashmap = { version = "6.1" }
 derive_builder = { version = "0.20" }
 derive-getters = { version = "0.5" }

--- a/lib/bindings/kvbm/Cargo.lock
+++ b/lib/bindings/kvbm/Cargo.lock
@@ -3015,9 +3015,9 @@ dependencies = [
 [[package]]
 name = "js-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
+checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
 dependencies = [
 "once_cell",
 "wasm-bindgen",
@@ -3329,12 +3329,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
+ "plain",
 "redox_syscall 0.7.3",
 ]
@@ -4701,6 +4702,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
 [[package]]
 name = "png"
 version = "0.18.1"
@@ -7422,9 +7429,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
 dependencies = [
 "cfg-if 1.0.4",
 "once_cell",
@@ -7435,9 +7442,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.63"
+version = "0.4.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
+checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
 dependencies = [
 "cfg-if 1.0.4",
 "futures-util",
@@ -7449,9 +7456,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
+checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -7459,9 +7466,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
+checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
 dependencies = [
 "bumpalo",
 "proc-macro2",
@@ -7472,9 +7479,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
+checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
 dependencies = [
 "unicode-ident",
 ]
@@ -7528,9 +7535,9 @@ dependencies = [
 [[package]]
 name = "web-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
+checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
 dependencies = [
 "js-sys",
 "wasm-bindgen",

--- a/lib/bindings/kvbm/Cargo.toml
+++ b/lib/bindings/kvbm/Cargo.toml
@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features =
 ] }
 dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true }
-cudarc = { version = "0.19.2", features = ["cuda-12020"], optional = true }
+cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"], optional = true }
 [dev-dependencies]
--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -3075,9 +3075,9 @@ dependencies = [
 [[package]]
 name = "js-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
+checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
 dependencies = [
 "once_cell",
 "wasm-bindgen",
@@ -3368,12 +3368,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
+ "plain",
 "redox_syscall 0.7.3",
 ]
@@ -4749,6 +4750,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
 [[package]]
 name = "png"
 version = "0.18.1"
@@ -7497,9 +7504,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
 dependencies = [
 "cfg-if 1.0.4",
 "once_cell",
@@ -7510,9 +7517,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.63"
+version = "0.4.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
+checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
 dependencies = [
 "cfg-if 1.0.4",
 "futures-util",
@@ -7524,9 +7531,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
+checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -7534,9 +7541,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
+checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
 dependencies = [
 "bumpalo",
 "proc-macro2",
@@ -7547,9 +7554,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
+checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
 dependencies = [
 "unicode-ident",
 ]
@@ -7603,9 +7610,9 @@ dependencies = [
 [[package]]
 name = "web-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
+checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
 dependencies = [
 "js-sys",
 "wasm-bindgen",

--- a/lib/kvbm-common/Cargo.toml
+++ b/lib/kvbm-common/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+[package]
+name = "kvbm-common"
+version.workspace = true
+edition.workspace = true
+description.workspace = true
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+keywords.workspace = true
+[dependencies]
+dynamo-tokens = { workspace = true }
+serde = { workspace = true }
--- a/lib/kvbm-common/src/lib.rs
+++ b/lib/kvbm-common/src/lib.rs
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+use serde::{Deserialize, Serialize};
+pub type BlockId = usize;
+pub type SequenceHash = dynamo_tokens::PositionalLineageHash;
+pub use dynamo_tokens as tokens;
+/// Logical layout handle identifying which storage tier (G1-G4) a layout belongs to.
+///
+/// KVBM manages G1, G2 and G3 layouts directly. G4 is managed by an external service.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum LogicalLayoutHandle {
+    /// GPU / device memory.
+    /// G1 is fixed-size and managed by either the framework or the local instance of KVBM.
+    G1,
+    /// CPU / host memory.
+    /// G2 is fixed-size and managed by the local instance of KVBM.
+    G2,
+    /// Disk storage (local or attached storage).
+    /// G3 is fixed-size and managed by the local instance of KVBM.
+    G3,
+    /// Blocks held in an external service
+    /// outside the control of the KVBM system.
+    G4,
+}
--- a/lib/kvbm-kernels/tests/memcpy_batch.rs
+++ b/lib/kvbm-kernels/tests/memcpy_batch.rs
@@ -20,10 +20,9 @@ use cudarc::runtime::sys as cuda_runtime;
 use kvbm_kernels::{MemcpyBatchMode, is_memcpy_batch_available, is_using_stubs, memcpy_batch};
 // Direct FFI for cudaMallocHost / cudaFreeHost.
-// We bypass cudarc's runtime::sys because cudarc eagerly resolves ALL runtime
+// We link against libcudart directly (through kvbm-kernels' build.rs),
-// symbols on first use, and CUDA 13.x removed `cudaGetDeviceProperties_v2`
+// so these symbols are always available without going through cudarc's
-// which causes a panic.  Our test binary links against libcudart directly
+// dynamic loader.
-// (through kvbm-kernels' build.rs), so these symbols are always available.
 unsafe extern "C" {
    fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> u32;
    fn cudaFreeHost(ptr: *mut c_void) -> u32;

--- a/lib/kvbm-logical/Cargo.toml
+++ b/lib/kvbm-logical/Cargo.toml
@@ -15,7 +15,6 @@ dynamo-tokens = { workspace = true }
 anyhow = { workspace = true }
 async-stream = "0.3"
 bytes = "1.10"
-bincode = { version = "2.0.1", features = ["serde", "derive"] }
 derive_builder = { workspace = true }
 futures = { workspace = true }
 lru = "0.16"

--- a/lib/kvbm-logical/src/lib.rs
+++ b/lib/kvbm-logical/src/lib.rs
@@ -15,9 +15,6 @@ pub mod tinylfu;
 #[cfg(any(test, feature = "testing"))]
 pub mod testing;
-use bincode::{Decode, Encode};
-use serde::{Deserialize, Serialize};
 // Re-export common types and traits
 pub use blocks::{
    BlockError, BlockMetadata, CompleteBlock, ImmutableBlock, MutableBlock, WeakBlock,
@@ -37,19 +34,3 @@ impl KvbmSequenceHashProvider for dynamo_tokens::TokenBlock {
        self.positional_lineage_hash()
    }
 }
-/// Logical layout handle type encoding the layout ID.
-///
-/// KVBM manages G1, G2 and G3 layouts directly. G4 is managed by an external service.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode, Serialize, Deserialize)]
-pub enum LogicalLayoutHandle {
-    /// Representation of GPU / Device Memory
-    G1,
-    /// Representation of CPU / Host Memory
-    G2,
-    /// Representation of Disk Storage
-    G3,
-    /// Representation of Blocks held in an external service
-    /// outside the control of the KVBM system.
-    G4,
-}
--- a/lib/kvbm-physical/Cargo.toml
+++ b/lib/kvbm-physical/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+[package]
+name = "kvbm-physical"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+[dependencies]
+dynamo-memory = { workspace = true }
+kvbm-common = { workspace = true }
+kvbm-kernels = { workspace = true }
+velo-events = { workspace = true }
+aligned-vec = "0.6.4"
+anyhow = { workspace = true }
+bincode = { version = "2.0.0", features = ["serde", "derive"] }
+blake3 = { version = "1" }
+cudarc = { workspace = true }
+derive_builder = { workspace = true }
+futures = { workspace = true }
+derive-getters = { version = "0.5" }
+parking_lot = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+thiserror = { workspace = true }
+tokio = { workspace = true }
+tracing = { workspace = true }
+uuid = { workspace = true }
+validator = { workspace = true }
+[features]
+default = []
+collectives = []
+testing-kvbm = []
+testing-nixl-gds = []
+[dev-dependencies]
+dynamo-memory = { workspace = true, features = ["unsafe-slices"] }
+rstest = "0.26"
--- a/lib/kvbm-physical/README.md
+++ b/lib/kvbm-physical/README.md
+# kvbm-physical
+Physical layout and transfer management for KV cache block storage.
+`kvbm-physical` provides the low-level building blocks for mapping KV cache blocks to memory, registering them for RDMA transfers via NIXL, and executing transfers between heterogeneous storage tiers (GPU, host, disk, remote).
+## Modules
+### `layout` — Block-to-memory mapping
+Abstractions for how KV cache blocks are organized in memory.
+- **`Layout` trait** — Core abstraction mapping `(block_id, layer_id, outer_id)` to a `MemoryRegion`. Implementations include fully contiguous (single allocation) and layer-separate (one allocation per layer) variants.
+- **`KvBlockLayout`** — Describes dimension ordering within a block. Five named formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`, `Custom`) plus `Unknown`. Provides `requires_transform()`, `is_operational()`, and `is_universal()` for kernel selection.
+- **`PhysicalLayout`** — Wraps a `Layout` with its physical storage location (`StorageKind`) and NIXL registration metadata (`NixlMetadata`). Constructed via a type-state builder: Config &rarr; Layout type &rarr; Memory allocation &rarr; `build()`.
+- **`LayoutConfig`** — Block dimensions: `num_blocks`, `num_layers`, `outer_dim`, `page_size`, `inner_dim`, `dtype_width_bytes`, optional `num_heads`.
+- **`KvBlocks`** — Groups block IDs with a shared `PhysicalLayout` and optional `KvBlockLayout` override for cross-format transfers.
+### `manager` — Layout registration and transfer orchestration
+- **`TransferManager`** — Primary API. Registers layouts, exports/imports RDMA metadata between workers, and executes transfers by handle.
+- **`LayoutHandle`** — Compact `u128` encoding `(worker_id, layout_id)`. Identifies a registered layout within a specific worker; not symmetric across workers.
+- **`LogicalLayoutDescriptor`** — Bridges a `LayoutHandle` to a `LogicalLayoutHandle` (G1/G2/G3/G4 tier). Enables callers to say "copy from G1 to G2" while `TransferManager` resolves worker-specific physical handles.
+- **`SerializedLayout`** — Wire format for RDMA metadata exchange. Packs worker address, NIXL metadata, and layout descriptors into a bincode blob.
+- **`WorkerAddress`** — `(worker_id, nixl_agent_name)` pair identifying a worker on the network.
+### `transfer` — Transfer configuration and execution
+- **`TransferConfig` / builder** — Configures event system, NIXL backends, CUDA device, capabilities, and memory pool before building a `TransferManager`.
+- **`TransferOptions`** — Per-transfer configuration: `layer_range`, `nixl_write_notification`, `bounce_buffer`, caller-provided `cuda_stream`, and src/dst `kv_layout` overrides.
+- **`TransferPreferences`** — Strategy hints via `NativeVsNixlPolicy` (PreferNative / PreferNixl / Automatic).
+- **`TransferCompleteNotification`** — `Either<Ready, EventAwaiter>` implementing `IntoFuture`. Zero-cost for synchronous completions. `aggregate()` composes multiple notifications. `could_yield()` checks if awaiting will suspend.
+- **`BounceBuffer`** — Staging area for two-hop transfers (e.g., Device &rarr; Host &rarr; Remote).
+- **Checksum utilities** — BLAKE3 block/layer checksums for transfer verification.
+- **Fill utilities** — Constant/sequential patterns for testing and initialization.
+## Quick Start
+```rust,ignore
+use kvbm_physical::{TransferManager, TransferOptions};
+use kvbm_physical::layout::{LayoutConfig, PhysicalLayout};
+// 1. Build the TransferManager (creates NIXL agent, CUDA streams, event system)
+let manager = TransferManager::builder()
+    .nixl_backend("ucx")
+    .cuda_device_id(0)
+    .build()?;
+// 2. Configure a layout
+let config = LayoutConfig::builder()
+    .num_blocks(64)
+    .num_layers(32)
+    .outer_dim(2)
+    .page_size(16)
+    .inner_dim(128)
+    .dtype_width_bytes(2)
+    .build()?;
+// 3. Build a physical layout (type-state builder: config -> layout type -> memory -> build)
+let gpu_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
+    .with_config(config.clone())
+    .fully_contiguous()
+    .allocate_device(0)
+    .build()?;
+let host_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
+    .with_config(config)
+    .fully_contiguous()
+    .allocate_pinned(Some(0))
+    .build()?;
+// 4. Register layouts to get handles
+let gpu_handle = manager.register_layout(gpu_layout)?;
+let host_handle = manager.register_layout(host_layout)?;
+// 5. Execute a transfer and await completion
+let notification = manager.execute_transfer(
+    gpu_handle,
+    &[0, 1, 2, 3],        // source block IDs
+    host_handle,
+    &[0, 1, 2, 3],        // destination block IDs
+    TransferOptions::new(),
+)?;
+notification.await?;
+```
+## Testing
+All functional tests in `kvbm-physical` require a real NIXL installation and a CUDA GPU. They are gated behind a single feature flag:
+- **`testing-kvbm`** — enables tests requiring NIXL and CUDA (creates NixlAgent instances and allocates device memory / launches kernels)
+### Running tests
+```bash
+# Without GPU/NIXL — only the sentinel test runs (confirms skipping)
+cargo test -p kvbm-physical
+# With GPU + NIXL available
+cargo test -p kvbm-physical --features testing-kvbm
+```
+When the `testing-kvbm` feature is not enabled, a single **sentinel test** runs and prints a reminder message. This ensures `cargo test` never silently passes with zero tests.
+### What the sentinel test looks like
+```
+running 1 test
+test sentinel::all_functional_tests_skipped___enable_testing_nixl_and_testing_cuda ... ok
+```
+The `test_version_check_on_deserialization` test in `layout::tests` is the only functional test that runs without feature flags, as it does not require NIXL or CUDA.
+## Documentation
+- [v1 Migration Guide](docs/v1_migration.md) — Migration from `dynamo-llm::block_manager` to `kvbm-physical`
--- a/lib/kvbm-physical/docs/v1_migration.md
+++ b/lib/kvbm-physical/docs/v1_migration.md
+# Migration Guide: block_manager to kvbm-physical
+Guide for migrating from `dynamo-llm::block_manager` (v1) to `kvbm-physical`.
+## Overview
+`kvbm-physical` is a ground-up rewrite of the physical transfer layer from `lib/llm/src/block_manager/`. The core data flow is the same (register layouts, exchange metadata, execute transfers), but `kvbm-physical` adds block format awareness, richer transfer options, and a cleaner separation between logical tiers and physical handles.
+Both implementations use the same `vectorized_copy` CUDA kernel. The original embeds it in a `.fatbin` (`lib/llm/src/block_manager/block/transfer/kernels/vectorized_copy.fatbin`) loaded via `cuModuleLoadData`. `kvbm-physical` wraps the same kernel via the `kvbm-kernels` crate with explicit Rust FFI for transparency and testability.
+## Type mapping table
+| Original (block_manager) | kvbm-physical | Notes |
+|--------------------------|---------------|-------|
+| `TransportManager` | `TransferManager` | Same role, richer API |
+| `LayoutHandle` | `LayoutHandle` | Same concept; encoding changed — see LayoutHandle docs for details |
+| `PhysicalLayout` + builder | `PhysicalLayout` + builder | Same pattern; adds `with_external_device_regions()` |
+| `LayoutConfig` | `LayoutConfig` | Same fields + optional `num_heads` |
+| `TransferOptions` | `TransferOptions` | Adds `cuda_stream`, `src_kv_layout`, `dst_kv_layout` |
+| `TransferCapabilities` | `TransferCapabilities` | Same |
+| `TransferPreferences` | `TransferPreferences` | Same |
+| `SerializedLayout` | `SerializedLayout` | Same wire format concept |
+| `WorkerAddress` | `WorkerAddress` | Same |
+| `TransferCompleteNotification` (oneshot) | `TransferCompleteNotification` (`Either`/`EventAwaiter`) | Zero-cost sync path |
+| `BounceBufferSpec` (trait object) | `BounceBuffer` (concrete struct) | Simpler, no heap allocation |
+| N/A | `LogicalLayoutDescriptor` | **New** — tier bridging |
+| N/A | `KvBlockLayout` | **New** — block format awareness |
+| N/A | `KvBlocks` | **New** — grouped blocks with layout override |
+| `CudaBlockingH2D` / `CudaBlockingD2H` | Removed | Async-only; `.await` for sync behavior |
+| `OperationalCopyBackend` | Removed | Replaced by `kvbm_kernels` direct FFI |
+## What kvbm-physical adds
+### LogicalLayoutDescriptor
+Bridges `LayoutHandle` (physical) to `LogicalLayoutHandle` (G1/G2/G3/G4 tier). This is the key new abstraction for multi-worker coordination: callers say "copy from G1 to G2" while `TransferManager` resolves worker-specific handles.
+```rust,ignore
+// Build descriptor for RDMA exchange
+let descriptor = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
+```
+### KvBlockLayout
+Four named block formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`) plus `Custom` and `Unknown`. Enables type-driven kernel selection for transfers between different dimension orderings.
+```rust,ignore
+let needs_permute = src_layout.requires_transform(&dst_layout);
+```
+### kvbm-kernels FFI
+The `kvbm_kernels` crate provides `memcpy_batch` using CUDA 12.9+ batch API with automatic fallback to individual copies. This replaces the fatbin-loading approach with direct Rust FFI.
+### Stream pooling
+4 H2D + 4 D2H streams with round-robin selection, replacing the original 1+1 stream pair. Reduces contention for concurrent transfers.
+### Caller-provided CUDA stream
+`TransferOptions::cuda_stream` lets the caller pass in a stream. The executor skips event recording; the caller manages synchronization. Useful for layer-wise transfers where all layers must execute on the same stream.
+```rust,ignore
+let stream = manager.context().acquire_h2d_stream();
+let options = TransferOptions::builder()
+    .cuda_stream(stream.clone())
+    .build()?;
+```
+### CudaMemPool
+Device memory pool for kernel temporary allocations (permute buffers, etc.). Configured via `TransferConfig`:
+```rust,ignore
+TransferManager::builder()
+    .cuda_pool_reserve_size(64 * 1024 * 1024)         // 64 MiB pre-allocated
+    .cuda_pool_release_threshold(Some(64 * 1024 * 1024)) // free above this
+    .build()?;
+```
+### TransferCompleteNotification::aggregate()
+Compose multiple transfer notifications into one that completes when all are done. Optimizes away the aggregation when all inputs are already complete.
+```rust,ignore
+let combined = TransferCompleteNotification::aggregate(
+    vec![n1, n2, n3],
+    manager.context().event_system(),
+    &tokio::runtime::Handle::current(),
+)?;
+combined.await?;
+```
+### src/dst kv_layout overrides
+`TransferOptions` now supports overriding the source and destination block layout interpretation, enabling cross-format transfers without modifying the registered layout.
+```rust,ignore
+let options = TransferOptions::builder()
+    .src_kv_layout(KvBlockLayout::OperationalNHD)
+    .dst_kv_layout(KvBlockLayout::UniversalTP)
+    .build()?;
+```
+## What was intentionally removed
+### Blocking CUDA strategies
+`CudaBlockingH2D` and `CudaBlockingD2H` are removed. All transfers are async. For synchronous behavior, just `.await` immediately:
+```rust,ignore
+// v1 (blocking)
+let result = blocking_h2d_transfer(...);
+// kvbm-physical (async, but can be used synchronously)
+let notification = manager.execute_transfer(...)?;
+notification.await?;
+```
+### OperationalCopyBackend enum
+The `OperationalCopyBackend` enum (which selected between different kernel loading strategies) is removed. `kvbm-physical` uses `kvbm_kernels` direct FFI exclusively, making kernel dispatch transparent.
+### Trait object bounce buffer
+`BounceBufferSpec` (a trait object requiring heap allocation) is replaced by `BounceBuffer`, a concrete struct wrapping a `LayoutHandle` + block IDs:
+```rust,ignore
+// v1
+struct MyBounce { layout: PhysicalLayout, blocks: Vec<BlockId> }
+impl BounceBufferSpec for MyBounce { ... }
+// kvbm-physical
+let bounce = BounceBuffer::from_handle(host_handle, vec![0, 1, 2, 3]);
+```
+## Migration steps
+### 1. Replace TransportManager with TransferManager
+The builder pattern is the same. `TransferManager::builder()` returns the same kind of fluent builder.
+```rust,ignore
+// v1
+let manager = TransportManager::builder()
+    .worker_id(0)
+    .nixl_backend("ucx")
+    .cuda_device_id(0)
+    .build()?;
+// kvbm-physical
+let manager = TransferManager::builder()
+    .nixl_backend("ucx")
+    .cuda_device_id(0)
+    .build()?;
+// worker_id is now derived from the event system
+```
+### 2. Replace TransferOptions
+Add new fields as needed. Existing `layer_range` and `nixl_write_notification` work the same way.
+```rust,ignore
+// v1
+let options = TransferOptions::builder()
+    .layer_range(0..16)
+    .build()?;
+// kvbm-physical (same, with optional new fields)
+let options = TransferOptions::builder()
+    .layer_range(0..16)
+    .cuda_stream(stream)        // new: caller-managed stream
+    .src_kv_layout(layout)      // new: format override
+    .build()?;
+```
+### 3. Replace BounceBufferSpec with BounceBuffer
+```rust,ignore
+// v1 — trait object
+let spec: Box<dyn BounceBufferSpec> = Box::new(MyBounce::new(layout, blocks));
+options.bounce_buffer(spec);
+// kvbm-physical — concrete type
+let bounce = BounceBuffer::from_handle(host_handle, block_ids);
+let options = TransferOptions::builder()
+    .bounce_buffer(bounce)
+    .build()?;
+```
+### 4. Replace TransferCompleteNotification await pattern
+The notification now implements `IntoFuture` directly instead of wrapping a oneshot channel.
+```rust,ignore
+// v1
+let notification = manager.execute_transfer(...)?;
+notification.recv().await??;
+// kvbm-physical
+let notification = manager.execute_transfer(...)?;
+notification.await?;
+```
+### 5. Add LogicalLayoutDescriptor for multi-worker tier resolution
+If you coordinate transfers across multiple workers by tier name (G1, G2, etc.), use `LogicalLayoutDescriptor`:
+```rust,ignore
+// Build descriptors that include tier information
+let g1_desc = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
+let g2_desc = manager.build_logical_descriptor(host_handle, LogicalLayoutHandle::G2)?;
+// Remote workers can now resolve "copy G1 to G2" to the correct physical handles
+```
+### 6. Consider KvBlockLayout annotations for cross-format transfers
+If your transfers involve blocks stored in different dimension orderings (e.g., operational NHD from the engine vs. universal TP for storage), annotate with `KvBlockLayout`:
+```rust,ignore
+let options = TransferOptions::builder()
+    .src_kv_layout(KvBlockLayout::OperationalNHD)
+    .dst_kv_layout(KvBlockLayout::UniversalTP)
+    .build()?;
+```
+This tells the executor to select a permute kernel instead of a direct copy.
--- a/lib/kvbm-physical/src/layout/builder.rs
+++ b/lib/kvbm-physical/src/layout/builder.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Typed builder for constructing [`PhysicalLayout`](crate::layout::PhysicalLayout)
+//! instances with strongly-typed configuration, layout selection, and memory provisioning.
+//!
+//! The builder enforces the three steps required to materialize a physical layout:
+//! 1. Provide a [`LayoutConfig`]
+//! 2. Select a concrete layout (fully contiguous or layer separate)
+//! 3. Specify memory backing (either by allocating or by supplying existing regions)
+//!
+//! NIXL registration is required in production builds (test builds may skip it when no suitable
+//! backend is loaded). Callers must provide a [`NixlAgent`], and any memory supplied to the
+//! builder must implement [`NixlCompatible`].
+use crate::layout::physical::PhysicalLayout;
+use super::{
+    BlockDimension, FullyContiguousLayout, LayerSeparateLayout, Layout, LayoutConfig,
+    MemoryDescriptor, physical::NixlMetadata,
+};
+use anyhow::{Result, anyhow, bail};
+use dynamo_memory::{
+    Buffer, DiskStorage, OffsetBuffer, StorageKind, SystemStorage, create_buffer,
+    nixl::{MemType, NixlAgent, NixlDescriptor, register_with_nixl},
+    prelude::{NixlCompatible, RegisteredView},
+};
+#[allow(unused_imports)]
+use std::marker::PhantomData;
+use std::path::PathBuf;
+use std::sync::Arc;
+use dynamo_memory::{DeviceStorage, PinnedStorage};
+const REGION_ALIGNMENT: usize = 512;
+/// Layout selection exposed by the builder.
+///
+/// The variant determines how many backing regions the builder sizes: a single
+/// region for `FullyContiguous`, or one region per layer for `LayerSeparate`.
+#[derive(Debug, Clone)]
+pub enum LayoutKind {
+    /// All blocks and layers share one memory region.
+    FullyContiguous,
+    /// Each layer gets its own memory region; `block_dim` is forwarded to
+    /// `LayerSeparateLayout::new` to select the block dimension ordering.
+    LayerSeparate { block_dim: BlockDimension },
+}
+/// Allocation strategies for builder-managed memory.
+#[derive(Debug, Clone)]
+enum AllocationKind {
+    /// System memory, allocated through `SystemStorage::new`.
+    System,
+    /// Pinned (page-locked) host memory. If `device_id` is Some, NUMA-aware
+    /// allocation is used on the GPU's NUMA node (when NUMA is enabled).
+    Pinned {
+        device_id: Option<u32>,
+    },
+    /// Device memory, allocated through `DeviceStorage::new` on `device_id`.
+    Device {
+        device_id: u32,
+    },
+    /// Disk-backed storage. `Some(path)` uses `DiskStorage::new_at`; `None`
+    /// falls back to `DiskStorage::new`.
+    Disk {
+        path: Option<PathBuf>,
+    },
+}
+/// Memory provisioning plan (either provided regions or an allocation request).
+#[derive(Debug, Clone)]
+enum MemoryPlan {
+    /// Caller-supplied regions, one entry per region, used as-is at build time.
+    Provided(Vec<MemoryEntry>),
+    /// Deferred request: the builder allocates the memory itself during `build()`.
+    Allocate(AllocationKind),
+}
+/// Memory tenancy captured during the build process.
+#[derive(Debug, Clone)]
+struct MemoryEntry {
+    /// Buffer backing this entry.
+    region: Buffer,
+    /// NIXL descriptor for `region`, when one is known. `ensure_registered`
+    /// rejects `None` outside of test builds.
+    descriptor: Option<NixlDescriptor>,
+}
+impl MemoryEntry {
+    /// Pair a backing buffer with its (optional) NIXL descriptor.
+    fn new(region: Buffer, descriptor: Option<NixlDescriptor>) -> Self {
+        MemoryEntry { region, descriptor }
+    }
+    /// Fill in a missing descriptor from the region itself and, outside of test
+    /// builds, reject entries that still have none.
+    fn ensure_registered(mut self) -> Result<Self> {
+        let resolved = match self.descriptor.take() {
+            Some(existing) => Some(existing),
+            None => self.region.nixl_descriptor(),
+        };
+        self.descriptor = resolved;
+        #[cfg(not(test))]
+        {
+            // Production layouts must be NIXL registered.
+            if self.descriptor.is_none() {
+                bail!(
+                    "memory region {} is not registered with NIXL",
+                    self.region.addr()
+                );
+            }
+        }
+        // Test builds tolerate a missing descriptor (local-only layouts).
+        Ok(self)
+    }
+}
+/// Marker types for the builder state machine.
+///
+/// `NoConfig`: no [`LayoutConfig`] has been attached yet.
+pub struct NoConfig;
+/// Marker: a [`LayoutConfig`] has been attached.
+pub struct HasConfig;
+/// Marker: no layout variant has been selected yet.
+pub struct NoLayout;
+/// Marker: a layout variant has been selected.
+pub struct HasLayout;
+/// Marker: no memory plan has been chosen yet.
+pub struct NoMemory;
+/// Marker: a memory plan (provided regions or an allocation request) is set.
+pub struct HasMemory;
+/// Default builder state type alias.
+pub type PhysicalLayoutBuilderDefault = PhysicalLayoutBuilder<NoConfig, NoLayout, NoMemory>;
+/// Typed builder enforcing configuration, layout selection, and memory provisioning phases.
+///
+/// The type parameters `C`, `L` and `M` are compile-time markers only; the
+/// actual state lives in the `Option` fields.
+pub struct PhysicalLayoutBuilder<C, L, M> {
+    /// NIXL agent used for registration and for the layout's NIXL metadata.
+    agent: NixlAgent,
+    /// Layout configuration, set by `with_config`.
+    config: Option<LayoutConfig>,
+    /// Selected layout variant, set by `fully_contiguous` / `layer_separate`.
+    layout_kind: Option<LayoutKind>,
+    /// How memory will be obtained, set by the `allocate_*` / `with_*_regions` methods.
+    memory_plan: Option<MemoryPlan>,
+    _config: PhantomData<C>,
+    _layout: PhantomData<L>,
+    _memory: PhantomData<M>,
+}
+impl PhysicalLayoutBuilder<NoConfig, NoLayout, NoMemory> {
+    /// Start a fresh builder bound to `agent`; config, layout kind and memory
+    /// plan are all unset.
+    pub fn new(agent: NixlAgent) -> Self {
+        Self::from_parts(agent, None, None, None)
+    }
+}
+impl<C, L, M> PhysicalLayoutBuilder<C, L, M> {
+    /// Split the builder into its stored values, dropping the type-state markers.
+    fn into_parts(
+        self,
+    ) -> (
+        NixlAgent,
+        Option<LayoutConfig>,
+        Option<LayoutKind>,
+        Option<MemoryPlan>,
+    ) {
+        let Self {
+            agent,
+            config,
+            layout_kind,
+            memory_plan,
+            ..
+        } = self;
+        (agent, config, layout_kind, memory_plan)
+    }
+    /// Reassemble a builder in an arbitrary type state (`C2`, `L2`, `M2`) from
+    /// previously extracted values.
+    fn from_parts<C2, L2, M2>(
+        agent: NixlAgent,
+        config: Option<LayoutConfig>,
+        layout_kind: Option<LayoutKind>,
+        memory_plan: Option<MemoryPlan>,
+    ) -> PhysicalLayoutBuilder<C2, L2, M2> {
+        PhysicalLayoutBuilder {
+            agent,
+            config,
+            layout_kind,
+            memory_plan,
+            _config: PhantomData::<C2>,
+            _layout: PhantomData::<L2>,
+            _memory: PhantomData::<M2>,
+        }
+    }
+}
+impl<L, M> PhysicalLayoutBuilder<NoConfig, L, M> {
+    /// Supply the [`LayoutConfig`] used to size the layout and its allocations,
+    /// moving the builder into the `HasConfig` state.
+    pub fn with_config(self, config: LayoutConfig) -> PhysicalLayoutBuilder<HasConfig, L, M> {
+        let (agent, _, layout_kind, memory_plan) = self.into_parts();
+        Self::from_parts(agent, Some(config), layout_kind, memory_plan)
+    }
+}
+impl<M> PhysicalLayoutBuilder<HasConfig, NoLayout, M> {
+    /// Choose the fully contiguous layout variant.
+    pub fn fully_contiguous(self) -> PhysicalLayoutBuilder<HasConfig, HasLayout, M> {
+        self.select_layout(LayoutKind::FullyContiguous)
+    }
+    /// Choose the layer-separate layout variant using the given block dimension ordering.
+    pub fn layer_separate(
+        self,
+        block_dim: BlockDimension,
+    ) -> PhysicalLayoutBuilder<HasConfig, HasLayout, M> {
+        self.select_layout(LayoutKind::LayerSeparate { block_dim })
+    }
+    /// Record `kind` and advance the type state to `HasLayout`.
+    fn select_layout(self, kind: LayoutKind) -> PhysicalLayoutBuilder<HasConfig, HasLayout, M> {
+        let (agent, config, _, memory_plan) = self.into_parts();
+        Self::from_parts(agent, config, Some(kind), memory_plan)
+    }
+}
+impl PhysicalLayoutBuilder<HasConfig, HasLayout, NoMemory> {
+    /// Store `plan` and advance the type state to `HasMemory`.
+    fn set_memory_plan(
+        self,
+        plan: MemoryPlan,
+    ) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
+        let (agent, config, layout_kind, _memory) = self.into_parts();
+        PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
+            agent,
+            config,
+            layout_kind,
+            Some(plan),
+        )
+    }
+    /// Allocate system memory. The allocation itself happens in `build()`.
+    pub fn allocate_system(self) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
+        self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::System))
+    }
+    /// Allocate pinned (page-locked) host memory.
+    ///
+    /// # Arguments
+    /// * `device_id` - If `Some(id)`, enables NUMA-aware allocation on the GPU's NUMA node
+    ///   (when `DYN_KVBM_ENABLE_NUMA=1` is set). If `None`, uses direct allocation.
+    pub fn allocate_pinned(
+        self,
+        device_id: Option<u32>,
+    ) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
+        self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Pinned { device_id }))
+    }
+    /// Allocate device memory on the CUDA device identified by `device_id`.
+    pub fn allocate_device(
+        self,
+        device_id: u32,
+    ) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
+        self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Device { device_id }))
+    }
+    /// Allocate disk-backed storage. When `path` is `None`, a temporary file is used.
+    pub fn allocate_disk(
+        self,
+        path: Option<PathBuf>,
+    ) -> PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
+        self.set_memory_plan(MemoryPlan::Allocate(AllocationKind::Disk { path }))
+    }
+    /// Use existing NIXL-compatible memory regions supplied by the caller.
+    ///
+    /// Each region is passed through `register_storage` immediately.
+    ///
+    /// # Errors
+    /// Returns an error if registering any region fails.
+    pub fn with_memory_regions<S>(
+        self,
+        regions: Vec<S>,
+    ) -> Result<PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory>>
+    where
+        S: MemoryDescriptor + NixlCompatible + 'static,
+    {
+        let (agent, config, layout_kind, _memory) = self.into_parts();
+        let entries = register_existing_regions(&agent, regions)?;
+        Ok(
+            PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
+                agent,
+                config,
+                layout_kind,
+                Some(MemoryPlan::Provided(entries)),
+            ),
+        )
+    }
+    /// Use pre-registered memory regions (supplied as [`Buffer`] handles).
+    ///
+    /// All regions must already expose a NIXL descriptor.
+    ///
+    /// # Errors
+    /// Returns an error naming the index of the first region without a NIXL descriptor.
+    pub fn with_registered_regions(
+        self,
+        regions: Vec<Buffer>,
+    ) -> Result<PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory>> {
+        let entries = regions
+            .into_iter()
+            .enumerate()
+            .map(|(index, region)| {
+                let descriptor = region.nixl_descriptor().ok_or_else(|| {
+                    anyhow!(
+                        "provided memory region at index {} is not NIXL registered",
+                        index
+                    )
+                })?;
+                Ok(MemoryEntry::new(region, Some(descriptor)))
+            })
+            .collect::<Result<Vec<_>>>()?;
+        let (agent, config, layout_kind, _memory) = self.into_parts();
+        Ok(
+            PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
+                agent,
+                config,
+                layout_kind,
+                Some(MemoryPlan::Provided(entries)),
+            ),
+        )
+    }
+    /// Register external KV cache tensors with NIXL for RDMA access.
+    ///
+    /// This is the **CRITICAL** step that enables remote GPU-to-GPU transfers.
+    /// Each tensor's memory is wrapped in `ExternalDeviceMemory` and registered
+    /// with NIXL.
+    ///
+    /// # Arguments
+    /// * `tensors` - KV cache tensors from vLLM (one per layer). All tensors must:
+    ///   - Be on the same CUDA device
+    ///   - Be contiguous in memory
+    ///   - Have the same shape
+    ///
+    /// Of these, this method itself only checks that each tensor reports a CUDA device.
+    ///
+    /// # Requirements
+    /// - The NIXL agent must be registered with an RDMA-capable backend
+    /// - The external framework (vLLM) must keep the tensors valid while registered
+    ///
+    /// # Errors
+    /// Returns an error if `tensors` is empty, if a tensor is not on a CUDA device,
+    /// or if NIXL registration of a tensor fails.
+    ///
+    /// # Example
+    /// ```ignore
+    /// let physical_layout = PhysicalLayoutBuilder::new(nixl_agent)
+    ///     .with_config(layout_config)
+    ///     .layer_separate(block_dim)
+    ///     .with_external_device_regions(kv_tensors)?  // NIXL registration here
+    ///     .build()?;
+    /// ```
+    pub fn with_external_device_regions(
+        self,
+        tensors: Vec<Arc<dyn dynamo_memory::TensorDescriptor>>,
+    ) -> Result<PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory>> {
+        use dynamo_memory::TensorDescriptorExt;
+        if tensors.is_empty() {
+            bail!("with_external_device_regions requires at least one tensor");
+        }
+        let (agent, config, layout_kind, _memory) = self.into_parts();
+        let mut entries = Vec::with_capacity(tensors.len());
+        for (index, tensor) in tensors.into_iter().enumerate() {
+            // Verify the tensor is on a CUDA device
+            if tensor.cuda_device_id().is_none() {
+                bail!("tensor at index {} is not on a CUDA device", index);
+            }
+            // Register tensor with NIXL for RDMA
+            // Arc<dyn TensorDescriptor> implements both MemoryDescriptor and NixlCompatible,
+            // so we can register it directly. This is the critical step that enables
+            // remote GPU-to-GPU transfers via UCX backend.
+            let entry = register_storage(tensor, &agent).map_err(|e| {
+                anyhow!(
+                    "failed to register tensor {} with NIXL (UCX backend required for VRAM): {}",
+                    index,
+                    e
+                )
+            })?;
+            entries.push(entry);
+        }
+        Ok(
+            PhysicalLayoutBuilder::<HasConfig, HasLayout, HasMemory>::from_parts(
+                agent,
+                config,
+                layout_kind,
+                Some(MemoryPlan::Provided(entries)),
+            ),
+        )
+    }
+}
+impl PhysicalLayoutBuilder<HasConfig, HasLayout, HasMemory> {
+    /// Consume the builder and materialize the [`PhysicalLayout`].
+    ///
+    /// Steps, in order: compute the required region sizes, resolve the memory
+    /// plan into entries, check the entries are large enough, derive the storage
+    /// kind and NIXL metadata, then construct the concrete layout.
+    ///
+    /// # Errors
+    /// Propagates any failure from the steps above.
+    pub fn build(self) -> Result<PhysicalLayout> {
+        let (agent, config, layout_kind, memory_plan) = self.into_parts();
+        // The type state should make these unreachable; they guard against internal misuse.
+        let Some(config) = config else {
+            bail!("layout config missing despite type state");
+        };
+        let Some(layout_kind) = layout_kind else {
+            bail!("layout kind missing despite type state");
+        };
+        let Some(memory_plan) = memory_plan else {
+            bail!("memory plan missing despite type state");
+        };
+
+        let required_sizes = compute_allocation_sizes(&config, &layout_kind)?;
+        let entries = resolve_memory_plan(&agent, memory_plan, &required_sizes)?;
+        validate_memory_sizes(&entries, &required_sizes)?;
+        let kind = derive_storage_kind(&entries)?;
+        let metadata = derive_nixl_metadata(&agent, &entries)?;
+
+        let layout: Arc<dyn Layout> = match layout_kind {
+            LayoutKind::FullyContiguous => {
+                let Some(entry) = entries.first() else {
+                    bail!("fully contiguous layout requires a single memory region");
+                };
+                Arc::new(FullyContiguousLayout::new(
+                    config.clone(),
+                    entry.region.clone(),
+                )?)
+            }
+            LayoutKind::LayerSeparate { block_dim } => {
+                let mut regions: Vec<Buffer> = Vec::with_capacity(entries.len());
+                regions.extend(entries.iter().map(|entry| entry.region.clone()));
+                Arc::new(LayerSeparateLayout::new(
+                    config.clone(),
+                    regions,
+                    block_dim,
+                )?)
+            }
+        };
+        Ok(PhysicalLayout::new_local(layout, kind, metadata))
+    }
+}
+/// Register every caller-supplied region with `agent`, stopping at the first failure.
+fn register_existing_regions<S>(agent: &NixlAgent, regions: Vec<S>) -> Result<Vec<MemoryEntry>>
+where
+    S: MemoryDescriptor + NixlCompatible + 'static,
+{
+    let mut entries = Vec::with_capacity(regions.len());
+    for region in regions {
+        entries.push(register_storage(region, agent)?);
+    }
+    Ok(entries)
+}
+/// Turn a [`MemoryPlan`] into concrete entries, one per element of `sizes`.
+///
+/// Provided entries must match `sizes` in count and are passed through
+/// `MemoryEntry::ensure_registered`; allocation requests are delegated to
+/// `allocate_regions`.
+fn resolve_memory_plan(
+    agent: &NixlAgent,
+    plan: MemoryPlan,
+    sizes: &[usize],
+) -> Result<Vec<MemoryEntry>> {
+    let entries = match plan {
+        MemoryPlan::Allocate(strategy) => return allocate_regions(agent, strategy, sizes),
+        MemoryPlan::Provided(entries) => entries,
+    };
+    let (provided, required) = (entries.len(), sizes.len());
+    if provided != required {
+        bail!(
+            "provided memory count ({}) does not match required allocations ({})",
+            provided,
+            required
+        );
+    }
+    let mut checked = Vec::with_capacity(provided);
+    for entry in entries {
+        checked.push(entry.ensure_registered()?);
+    }
+    Ok(checked)
+}
+/// Allocate one base region big enough for all `sizes` (including alignment
+/// padding) and carve it into per-size entries.
+fn allocate_regions(
+    agent: &NixlAgent,
+    strategy: AllocationKind,
+    sizes: &[usize],
+) -> Result<Vec<MemoryEntry>> {
+    if sizes.is_empty() {
+        return Ok(Vec::new());
+    }
+    let reserve_size = total_allocation_size(sizes, REGION_ALIGNMENT)?;
+    let allocation = match strategy {
+        AllocationKind::System => allocate_system_entry(reserve_size, agent),
+        AllocationKind::Pinned { device_id } => {
+            allocate_pinned_entry(reserve_size, agent, device_id)
+        }
+        AllocationKind::Device { device_id } => {
+            allocate_device_entry(reserve_size, agent, device_id)
+        }
+        AllocationKind::Disk { path } => allocate_disk_entry(reserve_size, agent, path),
+    };
+    create_offset_entries(allocation?, sizes, REGION_ALIGNMENT)
+}
+/// Allocate `size` bytes of system memory and register it with `agent`.
+fn allocate_system_entry(size: usize, agent: &NixlAgent) -> Result<MemoryEntry> {
+    match SystemStorage::new(size) {
+        Ok(storage) => register_storage(storage, agent),
+        Err(e) => bail!("failed to allocate system memory ({size} bytes): {e}"),
+    }
+}
+/// Allocate `size` bytes of pinned host memory (optionally tied to `device_id`)
+/// and register it with `agent`.
+fn allocate_pinned_entry(
+    size: usize,
+    agent: &NixlAgent,
+    device_id: Option<u32>,
+) -> Result<MemoryEntry> {
+    match PinnedStorage::new_for_device(size, device_id) {
+        Ok(storage) => register_storage(storage, agent),
+        Err(e) => bail!("failed to allocate pinned memory ({size} bytes): {e}"),
+    }
+}
+/// Allocate `size` bytes on device `device_id` and register it with `agent`.
+fn allocate_device_entry(size: usize, agent: &NixlAgent, device_id: u32) -> Result<MemoryEntry> {
+    match DeviceStorage::new(size, device_id) {
+        Ok(storage) => register_storage(storage, agent),
+        Err(e) => {
+            bail!("failed to allocate device memory ({size} bytes) on device {device_id}: {e}")
+        }
+    }
+}
+/// Allocate `size` bytes of disk-backed storage and register it with `agent`.
+///
+/// `Some(path)` goes through `DiskStorage::new_at`; `None` goes through `DiskStorage::new`.
+fn allocate_disk_entry(
+    size: usize,
+    agent: &NixlAgent,
+    path: Option<PathBuf>,
+) -> Result<MemoryEntry> {
+    let storage = match path {
+        Some(path) => match DiskStorage::new_at(&path, size) {
+            Ok(storage) => storage,
+            Err(e) => bail!("failed to allocate disk storage at {}: {e}", path.display()),
+        },
+        None => match DiskStorage::new(size) {
+            Ok(storage) => storage,
+            Err(e) => bail!("failed to allocate disk storage: {e}"),
+        },
+    };
+    register_storage(storage, agent)
+}
+// When testing, we allow unregistered layouts to keep test time down. NIXL + UCX is very expensive
+// to set up, so registration only happens when the agent has a backend suited to the storage kind.
+#[cfg(test)]
+fn register_storage<S>(storage: S, agent: &NixlAgent) -> Result<MemoryEntry>
+where
+    S: MemoryDescriptor + NixlCompatible + 'static,
+{
+    let storage_kind = storage.storage_kind();
+    // Determine if registration is needed based on storage type and available backends
+    let should_register = match storage_kind {
+        StorageKind::System | StorageKind::Pinned => {
+            // System/Pinned memory is registered when either UCX or POSIX is available
+            agent.has_backend("UCX") || agent.has_backend("POSIX")
+        }
+        StorageKind::Device(_) => {
+            // Device memory needs UCX for remote transfers OR GDS for direct disk transfers
+            agent.has_backend("UCX") || agent.has_backend("GDS_MT")
+        }
+        StorageKind::Disk(_) => {
+            // Disk storage needs POSIX for regular I/O OR GDS for GPU direct I/O
+            agent.has_backend("POSIX") || agent.has_backend("GDS_MT")
+        }
+    };
+    if !should_register {
+        // Skip registration - the entry carries no NIXL descriptor (local non-NIXL transfers only)
+        let region = Buffer::from_arc(Arc::new(storage));
+        return Ok(MemoryEntry::new(region, None));
+    }
+    // Register with NIXL; the storage handed back on failure is discarded
+    match register_with_nixl(storage, agent, None) {
+        Ok(registered) => {
+            let descriptor = registered.descriptor();
+            let region = Buffer::from_arc(Arc::new(registered));
+            Ok(MemoryEntry::new(region, Some(descriptor)))
+        }
+        Err(_storage) => bail!("failed to register memory with NIXL agent {}", agent.name()),
+    }
+}
+// Production builds always register
+/// Register `storage` with `agent` and wrap the registered view in a [`MemoryEntry`].
+///
+/// Fails if NIXL registration is rejected; the storage is dropped in that case.
+#[cfg(not(test))]
+fn register_storage<S>(storage: S, agent: &NixlAgent) -> Result<MemoryEntry>
+where
+    S: MemoryDescriptor + NixlCompatible + 'static,
+{
+    // Registration is unconditional here, unlike the test-only variant.
+    let Ok(registered) = register_with_nixl(storage, agent, None) else {
+        bail!("failed to register memory with NIXL agent {}", agent.name());
+    };
+    let descriptor = registered.descriptor();
+    let region: Buffer = create_buffer(registered);
+    Ok(MemoryEntry::new(region, Some(descriptor)))
+}
+/// Carve a single base allocation into one entry per element of `sizes`.
+///
+/// Each entry is a view into `base_entry.region` at a running offset. Between
+/// consecutive entries the offset is advanced so that the next entry's absolute
+/// address (`base_addr + offset`) is a multiple of `alignment`. When the base
+/// entry has a NIXL descriptor, a per-entry descriptor is derived from it.
+///
+/// # Errors
+/// Fails if creating a view fails, if offset/address arithmetic overflows, or
+/// if the final offset exceeds the size of the base region.
+fn create_offset_entries(
+    base_entry: MemoryEntry,
+    sizes: &[usize],
+    alignment: usize,
+) -> Result<Vec<MemoryEntry>> {
+    if sizes.is_empty() {
+        return Ok(Vec::new());
+    }
+    let base_region = base_entry.region;
+    let base_descriptor = base_entry.descriptor;
+    let base_addr = base_region.addr();
+    let base_len = base_region.size();
+    let mut entries = Vec::with_capacity(sizes.len());
+    let mut offset = 0usize;
+    for (index, &size) in sizes.iter().enumerate() {
+        // A single request that spans the whole base region reuses it directly
+        // instead of wrapping it in an offset view.
+        let region = if index == 0 && offset == 0 && size == base_len && sizes.len() == 1 {
+            base_region.clone()
+        } else {
+            let view = OffsetBuffer::new(base_region.clone(), offset, size)
+                .map_err(|e| anyhow!("failed to create offset region: {e}"))?;
+            create_buffer(view)
+        };
+        let descriptor = base_descriptor
+            .as_ref()
+            .map(|descriptor| derive_descriptor(descriptor, offset, size))
+            .transpose()?;
+        entries.push(MemoryEntry::new(region, descriptor));
+        offset = offset
+            .checked_add(size)
+            .ok_or_else(|| anyhow!("offset computation overflow"))?;
+        // Pad only between entries (never after the last one); alignment is
+        // applied to the absolute address, then converted back to an offset.
+        if index + 1 < sizes.len() && alignment > 1 {
+            let current_addr = base_addr
+                .checked_add(offset)
+                .ok_or_else(|| anyhow!("address computation overflow"))?;
+            let aligned_addr = align_up(current_addr, alignment)?;
+            offset = aligned_addr
+                .checked_sub(base_addr)
+                .ok_or_else(|| anyhow!("alignment subtraction overflow"))?;
+        }
+    }
+    // `offset` now marks the end of the last entry.
+    if offset > base_len {
+        bail!(
+            "allocated base region ({base_len} bytes) is insufficient for {offset} bytes with padding"
+        );
+    }
+    Ok(entries)
+}
+/// Derive the descriptor for a sub-region of `base` that starts `offset` bytes in
+/// and is `size` bytes long.
+///
+/// For `MemType::File` descriptors the address is left untouched; for every
+/// other memory type it is advanced by `offset`.
+fn derive_descriptor(base: &NixlDescriptor, offset: usize, size: usize) -> Result<NixlDescriptor> {
+    let addr = if base.mem_type == MemType::File {
+        base.addr
+    } else {
+        base.addr
+            .checked_add(offset as u64)
+            .ok_or_else(|| anyhow!("descriptor address overflow"))?
+    };
+    let mut derived = base.clone();
+    derived.size = size;
+    derived.addr = addr;
+    Ok(derived)
+}
+/// Compute the byte size of every region the chosen layout needs.
+///
+/// Fully contiguous layouts need one region covering all layers; layer-separate
+/// layouts need `num_layers` regions of equal size.
+fn compute_allocation_sizes(config: &LayoutConfig, kind: &LayoutKind) -> Result<Vec<usize>> {
+    let LayoutConfig {
+        num_blocks,
+        num_layers,
+        outer_dim,
+        page_size,
+        inner_dim,
+        dtype_width_bytes,
+        ..
+    } = *config;
+    match kind {
+        LayoutKind::FullyContiguous => mul_chain(&[
+            num_blocks,
+            num_layers,
+            outer_dim,
+            page_size,
+            inner_dim,
+            dtype_width_bytes,
+        ])
+        .map(|total| vec![total]),
+        LayoutKind::LayerSeparate { .. } => mul_chain(&[
+            num_blocks,
+            outer_dim,
+            page_size,
+            inner_dim,
+            dtype_width_bytes,
+        ])
+        .map(|per_layer| vec![per_layer; num_layers]),
+    }
+}
+/// Multiply all `factors` together, failing on `usize` overflow.
+/// An empty slice yields `1`.
+fn mul_chain(factors: &[usize]) -> Result<usize> {
+    let mut product = 1usize;
+    for &factor in factors {
+        product = match product.checked_mul(factor) {
+            Some(next) => next,
+            None => bail!("allocation size overflow during layout computation"),
+        };
+    }
+    Ok(product)
+}
+/// Upper bound on the bytes needed to place all `sizes` in one allocation.
+///
+/// Every region after the first reserves an extra `alignment - 1` bytes so it
+/// can be aligned wherever the previous region ends. Returns `0` for no regions.
+fn total_allocation_size(sizes: &[usize], alignment: usize) -> Result<usize> {
+    let Some((&first, rest)) = sizes.split_first() else {
+        return Ok(0);
+    };
+    // Zero when `alignment` is 0 or 1, in which case no padding is reserved.
+    let padding = alignment.saturating_sub(1);
+    let mut total = first;
+    for &size in rest {
+        total = total
+            .checked_add(size)
+            .ok_or_else(|| anyhow!("allocation size overflow during aggregation"))?;
+        total = total
+            .checked_add(padding)
+            .ok_or_else(|| anyhow!("allocation alignment padding overflow"))?;
+    }
+    Ok(total)
+}
+/// Round `value` up to the next multiple of `alignment`.
+///
+/// An `alignment` of 0 or 1 returns `value` unchanged; overflow is reported as an error.
+fn align_up(value: usize, alignment: usize) -> Result<usize> {
+    if alignment <= 1 {
+        return Ok(value);
+    }
+    match value % alignment {
+        0 => Ok(value),
+        remainder => value
+            .checked_add(alignment - remainder)
+            .ok_or_else(|| anyhow!("alignment overflow")),
+    }
+}
+/// Check, pairwise, that each entry is at least as large as its required size.
+/// Pairs are formed positionally; surplus elements on either side are ignored.
+fn validate_memory_sizes(entries: &[MemoryEntry], required: &[usize]) -> Result<()> {
+    for (entry, required_size) in entries.iter().zip(required.iter().copied()) {
+        let available = entry.region.size();
+        if available < required_size {
+            bail!(
+                "memory region too small: required {} bytes, available {} bytes",
+                required_size,
+                available
+            );
+        }
+    }
+    Ok(())
+}
+/// Return the storage kind shared by all entries.
+///
+/// Fails when there are no entries or when any entry's kind differs from the first one's.
+fn derive_storage_kind(entries: &[MemoryEntry]) -> Result<StorageKind> {
+    let Some((first, rest)) = entries.split_first() else {
+        bail!("no memory regions available to determine storage location");
+    };
+    let first_kind = first.region.storage_kind();
+    let mismatch = rest
+        .iter()
+        .map(|entry| entry.region.storage_kind())
+        .find(|kind| *kind != first_kind);
+    match mismatch {
+        Some(kind) => bail!(
+            "all memory regions must share the same storage location (found {:?} and {:?})",
+            first_kind,
+            kind
+        ),
+        None => Ok(first_kind),
+    }
+}
+/// Build the layout's [`NixlMetadata`] from the first entry that carries a descriptor.
+///
+/// Test builds fall back to placeholder values derived from the first entry's
+/// storage kind when no entry is registered; other builds treat that as an error.
+fn derive_nixl_metadata(agent: &NixlAgent, entries: &[MemoryEntry]) -> Result<NixlMetadata> {
+    let descriptor_opt = entries.iter().find_map(|entry| entry.descriptor.clone());
+    #[cfg(test)]
+    let (mem_type, device_id) = match descriptor_opt {
+        Some(descriptor) => (descriptor.mem_type, descriptor.device_id),
+        None => {
+            // Unregistered layout: synthesize metadata for local-only transfers.
+            let Some(first_entry) = entries.first() else {
+                bail!("no memory entries");
+            };
+            match first_entry.region.storage_kind() {
+                StorageKind::System | StorageKind::Pinned => (MemType::Dram, 0),
+                StorageKind::Device(id) => (MemType::Vram, id as u64),
+                StorageKind::Disk(id) => (MemType::File, id),
+            }
+        }
+    };
+    #[cfg(not(test))]
+    let (mem_type, device_id) = match descriptor_opt {
+        Some(descriptor) => (descriptor.mem_type, descriptor.device_id),
+        None => bail!("memory entries missing NIXL registration metadata"),
+    };
+    Ok(NixlMetadata::new(
+        agent.name().to_string(),
+        mem_type,
+        device_id,
+    ))
+}
+// Builder tests using pre-registered in-memory regions; gated on the `testing-kvbm` feature.
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::super::{BlockDimension, LayoutConfig};
+    use super::*;
+    use dynamo_memory::{Buffer, MemoryDescriptor, StorageKind};
+    use std::any::Any;
+    /// Heap-backed stand-in for a registered region: owns a `Vec<u8>` and
+    /// reports a hand-built NIXL descriptor pointing at it.
+    #[derive(Debug)]
+    struct TestRegisteredRegion {
+        data: Vec<u8>,
+        kind: StorageKind,
+        descriptor: NixlDescriptor,
+    }
+    impl TestRegisteredRegion {
+        /// Allocate `size` zeroed bytes and describe them with the given kind and memory type.
+        fn new(size: usize, kind: StorageKind, mem_type: MemType, device_id: u64) -> Self {
+            let data = vec![0u8; size];
+            let addr = data.as_ptr() as u64;
+            let descriptor = NixlDescriptor {
+                addr,
+                size,
+                mem_type,
+                device_id,
+            };
+            Self {
+                data,
+                kind,
+                descriptor,
+            }
+        }
+    }
+    impl MemoryDescriptor for TestRegisteredRegion {
+        fn addr(&self) -> usize {
+            self.data.as_ptr() as usize
+        }
+        fn size(&self) -> usize {
+            self.data.len()
+        }
+        fn storage_kind(&self) -> StorageKind {
+            self.kind
+        }
+        fn as_any(&self) -> &dyn Any {
+            self
+        }
+        fn nixl_descriptor(&self) -> Option<NixlDescriptor> {
+            Some(self.descriptor.clone())
+        }
+    }
+    /// Small config shared by the tests: 2 blocks, 3 layers, 2-byte elements.
+    fn make_layout_config() -> LayoutConfig {
+        LayoutConfig::builder()
+            .num_blocks(2)
+            .num_layers(3)
+            .outer_dim(2)
+            .page_size(4)
+            .inner_dim(8)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap()
+    }
+    /// Bytes needed for a single region holding every layer.
+    fn fully_contiguous_size(cfg: &LayoutConfig) -> usize {
+        cfg.num_blocks
+            * cfg.num_layers
+            * cfg.outer_dim
+            * cfg.page_size
+            * cfg.inner_dim
+            * cfg.dtype_width_bytes
+    }
+    /// Bytes needed for one layer's region.
+    fn per_layer_size(cfg: &LayoutConfig) -> usize {
+        cfg.num_blocks * cfg.outer_dim * cfg.page_size * cfg.inner_dim * cfg.dtype_width_bytes
+    }
+    #[test]
+    fn builds_fully_contiguous_from_registered_regions() {
+        let agent = NixlAgent::new("builder-test-fully").expect("failed to create agent");
+        let cfg = make_layout_config();
+        let required = fully_contiguous_size(&cfg);
+        let region = create_buffer(TestRegisteredRegion::new(
+            required,
+            StorageKind::System,
+            MemType::Dram,
+            0,
+        ));
+        let physical = PhysicalLayoutBuilder::new(agent.clone())
+            .with_config(cfg.clone())
+            .fully_contiguous()
+            .with_registered_regions(vec![region])
+            .expect("registered regions accepted")
+            .build()
+            .expect("builder should succeed");
+        assert_eq!(physical.location(), StorageKind::System);
+        assert!(physical.layout().as_ref().is_fully_contiguous());
+        assert_eq!(physical.layout().config().num_blocks, cfg.num_blocks);
+        assert_eq!(physical.layout().config().num_layers, cfg.num_layers);
+        let metadata = physical.nixl_metadata();
+        assert_eq!(metadata.agent_name(), agent.name());
+        assert_eq!(metadata.mem_type(), MemType::Dram);
+    }
+    #[test]
+    fn builds_layer_separate_from_registered_regions() {
+        let agent = NixlAgent::new("builder-test-layer").expect("failed to create agent");
+        let cfg = make_layout_config();
+        let per_layer = per_layer_size(&cfg);
+        // One independently allocated region per layer.
+        let regions: Vec<Buffer> = (0..cfg.num_layers)
+            .map(|_| {
+                create_buffer(TestRegisteredRegion::new(
+                    per_layer,
+                    StorageKind::System,
+                    MemType::Dram,
+                    0,
+                ))
+            })
+            .collect();
+        let physical = PhysicalLayoutBuilder::new(agent.clone())
+            .with_config(cfg.clone())
+            .layer_separate(BlockDimension::BlockIsFirstDim)
+            .with_registered_regions(regions)
+            .expect("registered layer regions accepted")
+            .build()
+            .expect("builder should succeed");
+        assert_eq!(physical.location(), StorageKind::System);
+        assert!(!physical.layout().as_ref().is_fully_contiguous());
+        assert_eq!(physical.layout().config().num_layers, cfg.num_layers);
+        let metadata = physical.nixl_metadata();
+        assert_eq!(metadata.agent_name(), agent.name());
+        assert_eq!(metadata.mem_type(), MemType::Dram);
+    }
+}
+// fn context_device_id(ctx: &TransferContext) -> u32 {
+//     ctx.stream().context().ordinal() as u32
+// }
--- a/lib/kvbm-physical/src/layout/config.rs
+++ b/lib/kvbm-physical/src/layout/config.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+use derive_builder::Builder;
+use serde::{Deserialize, Serialize};
+use validator::{Validate, ValidationError};
+/// Configuration for block layouts.
+///
+/// The `#[validate]` attributes on fields are checked during layout construction
+/// (e.g., `FullyContiguousLayout::new_internal()`, `LayerSeparateLayout::new_internal()`),
+/// not at builder `.build()` time.
+///
+/// Fields without a `#[builder(default = ...)]` attribute must be set explicitly
+/// on the builder.
+#[derive(Debug, Clone, Builder, Validate, Serialize, Deserialize, PartialEq, Eq)]
+pub struct LayoutConfig {
+    /// Number of blocks (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub num_blocks: usize,
+    /// Number of layers (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub num_layers: usize,
+    /// Number of outer dimensions (validated to be 1 or 2).
+    #[validate(range(min = 1, max = 2))]
+    pub outer_dim: usize,
+    /// Page size (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub page_size: usize,
+    /// Inner dimension (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub inner_dim: usize,
+    /// Alignment; validated to be a power of two. The builder defaults it to 1.
+    #[validate(custom(function = "validate_power_of_2"))]
+    #[builder(default = "1")]
+    pub alignment: usize,
+    /// Width of the data type in bytes; validated by `validate_dtype_width_bytes`.
+    /// The builder defaults it to 2.
+    #[validate(custom(function = "validate_dtype_width_bytes"))]
+    #[builder(default = "2")]
+    pub dtype_width_bytes: usize,
+    /// Number of attention heads (optional).
+    ///
+    /// When provided, enables KvBlockLayout support for universal formats.
+    /// The head dimension can be computed as: `inner_dim / (page_size * num_heads)`.
+    ///
+    /// Required for:
+    /// - Universal layout transformations
+    /// - Per-head memory region access
+    ///
+    /// Defaults to `None` both in the builder and when absent from serialized input.
+    #[builder(default = "None")]
+    #[serde(default)]
+    pub num_heads: Option<usize>,
+}
+impl LayoutConfig {
+    /// Create a default [`LayoutConfigBuilder`].
+    pub fn builder() -> LayoutConfigBuilder {
+        LayoutConfigBuilder::default()
+    }
+    /// Get the total number of bytes required for all blocks.
+    ///
+    /// This is the product of `num_blocks`, `num_layers`, `outer_dim`, `page_size`,
+    /// `inner_dim` and `dtype_width_bytes`. The multiplication saturates at
+    /// `usize::MAX` rather than overflowing.
+    pub fn required_bytes(&self) -> usize {
+        self.num_blocks
+            .saturating_mul(self.num_layers)
+            .saturating_mul(self.outer_dim)
+            .saturating_mul(self.page_size)
+            .saturating_mul(self.inner_dim)
+            .saturating_mul(self.dtype_width_bytes)
+    }
+    /// Get the number of bytes per block.
+    ///
+    /// This is the total size of a single block across all layers and outer dimensions.
+    /// The multiplication saturates at `usize::MAX` rather than overflowing.
+    pub fn bytes_per_block(&self) -> usize {
+        self.num_layers
+            .saturating_mul(self.outer_dim)
+            .saturating_mul(self.page_size)
+            .saturating_mul(self.inner_dim)
+            .saturating_mul(self.dtype_width_bytes)
+    }
+    /// Get the head dimension if `num_heads` is specified.
+    ///
+    /// Computes `inner_dim / (page_size * num_heads)` using integer division.
+    ///
+    /// # Returns
+    /// `Some(head_dim)` if `num_heads` is set, `None` otherwise. The value is `0`
+    /// when `page_size * num_heads` is zero or does not fit in `usize` (in the
+    /// latter case the true quotient is zero as well).
+    pub fn head_dim(&self) -> Option<usize> {
+        self.num_heads
+            .map(|nh| match self.page_size.checked_mul(nh) {
+                Some(divisor) if divisor > 0 => self.inner_dim / divisor,
+                // Zero divisor, or a divisor too large to represent.
+                _ => 0,
+            })
+    }
+    /// Check if this config supports KvBlockLayout operations.
+    ///
+    /// Returns `true` if `num_heads` is set and the dimensions are valid
+    /// (`page_size * num_heads` is non-zero, fits in `usize`, and evenly
+    /// divides `inner_dim`).
+    pub fn supports_kv_block_layout(&self) -> bool {
+        self.num_heads
+            .and_then(|nh| self.page_size.checked_mul(nh))
+            .is_some_and(|divisor| divisor > 0 && self.inner_dim.is_multiple_of(divisor))
+    }
+    /// Validate that this config supports KvBlockLayout operations.
+    ///
+    /// Succeeds exactly when [`Self::supports_kv_block_layout`] returns `true`.
+    ///
+    /// # Returns
+    /// `Ok(())` if valid, `Err` with details otherwise.
+    ///
+    /// # Errors
+    /// - `num_heads_required_for_kv_block_layout` if `num_heads` is `None`
+    /// - `num_heads_must_be_positive` if `num_heads` is `Some(0)`
+    /// - `inner_dim_must_be_divisible_by_page_size_times_num_heads` if
+    ///   `page_size * num_heads` is zero, overflows `usize`, or does not evenly
+    ///   divide `inner_dim`
+    pub fn validate_for_kv_block_layout(&self) -> Result<(), ValidationError> {
+        let Some(nh) = self.num_heads else {
+            return Err(ValidationError::new(
+                "num_heads_required_for_kv_block_layout",
+            ));
+        };
+        if nh == 0 {
+            return Err(ValidationError::new("num_heads_must_be_positive"));
+        }
+        // Checked multiply: an unrepresentable or zero divisor can never describe a
+        // usable per-head split, so it is reported as a divisibility failure.
+        let divisible = self
+            .page_size
+            .checked_mul(nh)
+            .is_some_and(|divisor| divisor > 0 && self.inner_dim.is_multiple_of(divisor));
+        if !divisible {
+            return Err(ValidationError::new(
+                "inner_dim_must_be_divisible_by_page_size_times_num_heads",
+            ));
+        }
+        Ok(())
+    }
+}
+/// Describes which of the first two tensor dimensions, `shape[0]` and `shape[1]`, is the
+/// block dimension; the other one is the outer dimension.
+///
+/// The outer dimension is typically:
+/// - 1: MLA or K and V stored together,
+/// - 2: K and V stored separately.
+///
+/// The block dimension tells us the number of blocks.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum BlockDimension {
+    /// The block dimension is the first dimension of the tensor, `[n_blocks, outer_dim, inner_dim]`.
+    BlockIsFirstDim,
+    /// The block dimension is the second dimension of the tensor, `[outer_dim, n_blocks, inner_dim]`.
+    /// This replaces v1's `outer_contiguous` being true.
+    BlockIsSecondDim,
+}
+/// Validation function for the `alignment` field: accepts only powers of two.
+///
+/// # Errors
+/// Returns a [`ValidationError`] with code `alignment_must_be_power_of_2` when
+/// `alignment` is not a power of two (this includes `0`).
+pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> {
+    if alignment.is_power_of_two() {
+        Ok(())
+    } else {
+        Err(ValidationError::new("alignment_must_be_power_of_2"))
+    }
+}
+/// Validation function for the `dtype_width_bytes` field.
+///
+/// Accepts powers of two in the inclusive range 2 to 8, i.e. exactly 2, 4 or 8.
+///
+/// # Errors
+/// Returns a [`ValidationError`] with code
+/// `dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes` for any other
+/// width. Despite the code's wording, a width of 8 is accepted and a width of 1
+/// is rejected.
+pub fn validate_dtype_width_bytes(dtype_width_bytes: usize) -> Result<(), ValidationError> {
+    if matches!(dtype_width_bytes, 2 | 4 | 8) {
+        Ok(())
+    } else {
+        Err(ValidationError::new(
+            "dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes",
+        ))
+    }
+}
--- a/lib/kvbm-physical/src/layout/fully_contiguous.rs
+++ b/lib/kvbm-physical/src/layout/fully_contiguous.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Fully contiguous layout implementation.
+//!
+//! This layout stores all blocks in a single contiguous memory allocation
+//! with the shape: [num_blocks, num_layers, outer_dim, page_size, inner_dim].
+use anyhow::{Result, anyhow};
+use validator::Validate;
+use super::serialize::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails};
+use super::{Buffer, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor, MemoryRegion};
+/// Fully contiguous layout where all blocks are in a single allocation.
+///
+/// Strides are derived from the [`LayoutConfig`] at construction time. The address
+/// of a memory region is
+/// `base_addr + block_id * block_stride + layer_id * layer_stride + outer_id * outer_stride`.
+#[derive(Debug)]
+pub struct FullyContiguousLayout {
+    /// Configuration this layout was built from
+    config: LayoutConfig,
+    /// Base address of the allocation
+    base_addr: usize,
+    /// Stride between blocks in bytes (`layer_stride * num_layers`)
+    block_stride: usize,
+    /// Stride between layers in bytes (`outer_stride * outer_dim`)
+    layer_stride: usize,
+    /// Stride between outer dimensions in bytes (equal to `region_size`)
+    outer_stride: usize,
+    /// Size of each memory region (page) in bytes
+    /// (`page_size * inner_dim * dtype_width_bytes`)
+    region_size: usize,
+    /// Owned memory region backing this layout
+    memory: Buffer,
+    /// Format of blocks in memory
+    block_format: BlockFormat,
+    /// KV block layout describing dimension ordering within blocks
+    kv_block_layout: KvBlockLayout,
+}
+/// Builder for creating [`FullyContiguousLayout`] instances.
+///
+/// `config` and `memory` are mandatory; `kv_block_layout` and `block_format`
+/// fall back to their defaults when not set.
+///
+/// # Example
+///
+/// ```ignore
+/// let layout = FullyContiguousLayout::builder()
+///     .config(config)
+///     .memory(buffer)
+///     .kv_block_layout(KvBlockLayout::UniversalTP)
+///     .build()?;
+/// ```
+#[derive(Debug, Default)]
+pub struct FullyContiguousLayoutBuilder {
+    /// Layout configuration; `build()` fails while this is `None`
+    config: Option<LayoutConfig>,
+    /// Backing memory buffer; `build()` fails while this is `None`
+    memory: Option<Buffer>,
+    /// KV block layout passed through to the constructed layout
+    kv_block_layout: KvBlockLayout,
+    /// Block format passed through to the constructed layout
+    block_format: BlockFormat,
+}
+impl FullyContiguousLayoutBuilder {
+    /// Create a new builder with default values: no config, no memory,
+    /// `KvBlockLayout::Unknown` and `BlockFormat::default()`.
+    pub fn new() -> Self {
+        Self {
+            block_format: BlockFormat::default(),
+            kv_block_layout: KvBlockLayout::Unknown,
+            memory: None,
+            config: None,
+        }
+    }
+    /// Set the layout configuration (required).
+    #[expect(dead_code)]
+    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
+        self.config = Some(config);
+        self
+    }
+    /// Set the memory buffer backing this layout (required).
+    #[expect(dead_code)]
+    pub fn memory(&mut self, memory: Buffer) -> &mut Self {
+        self.memory = Some(memory);
+        self
+    }
+    /// Set the KV block layout describing dimension ordering.
+    ///
+    /// Default: `KvBlockLayout::Unknown`
+    #[expect(dead_code)]
+    pub fn kv_block_layout(&mut self, layout: KvBlockLayout) -> &mut Self {
+        self.kv_block_layout = layout;
+        self
+    }
+    /// Set the block format.
+    ///
+    /// Default: `BlockFormat::default()` (Operational)
+    #[expect(dead_code)]
+    pub fn block_format(&mut self, format: BlockFormat) -> &mut Self {
+        self.block_format = format;
+        self
+    }
+    /// Build the [`FullyContiguousLayout`].
+    ///
+    /// The builder is not consumed; the stored config and buffer are cloned.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - `config` is not set
+    /// - `memory` is not set
+    /// - The memory region is too small for the layout
+    /// - The config validation fails
+    #[expect(dead_code)]
+    pub fn build(&self) -> Result<FullyContiguousLayout> {
+        // Missing config is reported before missing memory.
+        let Some(config) = self.config.clone() else {
+            return Err(anyhow!("config is required"));
+        };
+        let Some(memory) = self.memory.clone() else {
+            return Err(anyhow!("memory is required"));
+        };
+        let (kv_block_layout, block_format) = (self.kv_block_layout, self.block_format);
+        FullyContiguousLayout::new_internal(config, memory, kv_block_layout, block_format)
+    }
+}
+impl FullyContiguousLayout {
+    /// Create a builder for `FullyContiguousLayout`.
+    #[expect(dead_code)]
+    pub fn builder() -> FullyContiguousLayoutBuilder {
+        FullyContiguousLayoutBuilder::new()
+    }
+    /// Create a new fully contiguous layout with default KV block layout.
+    ///
+    /// # Arguments
+    /// * `config` - Layout configuration
+    /// * `memory` - Owned memory region that backs this layout
+    ///
+    /// # Returns
+    /// A new FullyContiguousLayout instance with `KvBlockLayout::Unknown`
+    /// and `BlockFormat::default()`.
+    ///
+    /// # Errors
+    /// Same conditions as [`Self::new_internal`].
+    pub(crate) fn new(config: LayoutConfig, memory: Buffer) -> Result<Self> {
+        Self::new_internal(
+            config,
+            memory,
+            KvBlockLayout::Unknown,
+            BlockFormat::default(),
+        )
+    }
+    /// Compute `(region_size, layer_stride, block_stride, required_size)` in bytes.
+    ///
+    /// Returns `None` if any intermediate product does not fit in `usize`.
+    fn checked_strides(config: &LayoutConfig) -> Option<(usize, usize, usize, usize)> {
+        let region_size = config
+            .page_size
+            .checked_mul(config.inner_dim)?
+            .checked_mul(config.dtype_width_bytes)?;
+        let layer_stride = region_size.checked_mul(config.outer_dim)?;
+        let block_stride = layer_stride.checked_mul(config.num_layers)?;
+        let required_size = block_stride.checked_mul(config.num_blocks)?;
+        Some((region_size, layer_stride, block_stride, required_size))
+    }
+    /// Internal constructor with all parameters.
+    ///
+    /// # Errors
+    /// Returns an error if:
+    /// - the config fails validation,
+    /// - the total layout size does not fit in `usize`,
+    /// - the memory region is smaller than the layout requires.
+    fn new_internal(
+        config: LayoutConfig,
+        memory: Buffer,
+        kv_block_layout: KvBlockLayout,
+        block_format: BlockFormat,
+    ) -> Result<Self> {
+        config.validate()?;
+        let base_addr = memory.addr();
+        // Calculate strides with overflow checking. A wrapped product would make the
+        // size check below pass for a buffer that is far too small.
+        let (region_size, layer_stride, block_stride, required_size) =
+            Self::checked_strides(&config).ok_or_else(|| {
+                anyhow!("Layout size overflows usize for config: {:?}", config)
+            })?;
+        let outer_stride = region_size;
+        // Validate that the memory region is large enough
+        if memory.size() < required_size {
+            return Err(anyhow!(
+                "Memory region too small for layout. Required: {} bytes, got: {} bytes",
+                required_size,
+                memory.size()
+            ));
+        }
+        Ok(Self {
+            config,
+            base_addr,
+            block_stride,
+            layer_stride,
+            outer_stride,
+            region_size,
+            memory,
+            block_format,
+            kv_block_layout,
+        })
+    }
+    /// Create a new fully contiguous layout with a specific block format and KV block layout.
+    ///
+    /// # Arguments
+    /// * `config` - Layout configuration
+    /// * `memory` - Owned memory region that backs this layout
+    /// * `block_format` - Format of blocks in memory
+    /// * `kv_block_layout` - KV block layout describing dimension ordering
+    ///
+    /// # Returns
+    /// A new FullyContiguousLayout instance
+    ///
+    /// # Errors
+    /// Same conditions as [`Self::new_internal`].
+    pub(crate) fn new_with_format(
+        config: LayoutConfig,
+        memory: Buffer,
+        block_format: BlockFormat,
+        kv_block_layout: KvBlockLayout,
+    ) -> Result<Self> {
+        // Note the argument order differs from `new_internal`.
+        Self::new_internal(config, memory, kv_block_layout, block_format)
+    }
+    /// Get the block format.
+    #[expect(dead_code)]
+    pub fn block_format(&self) -> BlockFormat {
+        self.block_format
+    }
+    /// Get the KV block layout.
+    #[expect(dead_code)]
+    pub fn kv_block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+    /// Set the KV block layout.
+    #[expect(dead_code)]
+    pub fn set_kv_block_layout(&mut self, layout: KvBlockLayout) {
+        self.kv_block_layout = layout;
+    }
+    /// Calculate the address of a specific memory region.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id`, `layer_id` or `outer_id` is not below the
+    /// corresponding count in the config.
+    fn calculate_address(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<usize> {
+        if block_id >= self.config.num_blocks {
+            return Err(anyhow!(
+                "Block ID {} out of range (count: {})",
+                block_id,
+                self.config.num_blocks
+            ));
+        }
+        if layer_id >= self.config.num_layers {
+            return Err(anyhow!(
+                "Layer ID {} out of range (count: {})",
+                layer_id,
+                self.config.num_layers
+            ));
+        }
+        if outer_id >= self.config.outer_dim {
+            return Err(anyhow!(
+                "Outer ID {} out of range (count: {})",
+                outer_id,
+                self.config.outer_dim
+            ));
+        }
+        // Indices are in range, so the offset is below the total size that
+        // `new_internal` verified fits in `usize`.
+        Ok(self.base_addr
+            + block_id * self.block_stride
+            + layer_id * self.layer_stride
+            + outer_id * self.outer_stride)
+    }
+    /// Get mutable reference to the backing memory buffer for NIXL registration.
+    #[expect(dead_code)]
+    pub fn memory_arc_mut(&mut self) -> &mut Buffer {
+        &mut self.memory
+    }
+}
+impl Layout for FullyContiguousLayout {
+    /// Configuration this layout was built from.
+    fn config(&self) -> &LayoutConfig {
+        &self.config
+    }
+    /// The single backing buffer, exposed as a one-element slice.
+    fn memory_regions(&self) -> &[Buffer] {
+        std::slice::from_ref(&self.memory)
+    }
+    /// Region for `(block_id, layer_id, outer_id)`; errors when any index is out of range.
+    fn memory_region(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<MemoryRegion> {
+        self.calculate_address(block_id, layer_id, outer_id)
+            .map(|addr| MemoryRegion::new(addr, self.region_size))
+    }
+    /// Sizes of the allocations this layout needs: one entry covering all blocks.
+    fn required_allocations(&self) -> Vec<usize> {
+        let total_bytes = self.block_stride * self.config.num_blocks;
+        vec![total_bytes]
+    }
+    /// Always `true` for this layout type.
+    fn is_fully_contiguous(&self) -> bool {
+        true
+    }
+    /// Number of blocks, from the config.
+    fn num_blocks(&self) -> usize {
+        self.config.num_blocks
+    }
+    /// Number of layers, from the config.
+    fn num_layers(&self) -> usize {
+        self.config.num_layers
+    }
+    /// Outer dimension, from the config.
+    fn outer_dim(&self) -> usize {
+        self.config.outer_dim
+    }
+    /// Page size, from the config.
+    fn page_size(&self) -> usize {
+        self.config.page_size
+    }
+    /// Inner dimension, from the config.
+    fn inner_dim(&self) -> usize {
+        self.config.inner_dim
+    }
+    /// Data type width in bytes, from the config.
+    fn dtype_width_bytes(&self) -> usize {
+        self.config.dtype_width_bytes
+    }
+    /// Serialization details carrying this layout's block format and KV block layout.
+    fn serialization_details(&self) -> LayoutTypeDetails {
+        let details = FullyContiguousDetails {
+            block_format: self.block_format,
+            kv_block_layout: self.kv_block_layout,
+        };
+        LayoutTypeDetails::FullyContiguous(details)
+    }
+    /// KV block layout describing dimension ordering within blocks.
+    fn block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+}
+impl super::ContiguousBlockLayout for FullyContiguousLayout {
+    /// Number of blocks, from the config.
+    fn num_blocks(&self) -> usize {
+        self.config.num_blocks
+    }
+    /// Size of one whole block in bytes (the block stride).
+    fn bytes_per_block(&self) -> usize {
+        self.block_stride
+    }
+    /// Region covering the whole of block `block_id`.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id` is not below the number of blocks.
+    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion> {
+        if block_id >= self.config.num_blocks {
+            // The reported value is the block count, not the largest valid id, so it
+            // is labelled "count" like the other range errors in this file.
+            return Err(anyhow!(
+                "Block ID {} out of range (count: {})",
+                block_id,
+                self.config.num_blocks
+            ));
+        }
+        let addr = self.base_addr + block_id * self.block_stride;
+        Ok(MemoryRegion::new(addr, self.block_stride))
+    }
+    /// KV block layout describing dimension ordering within blocks.
+    fn block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+}
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::super::tests::*;
+    use super::*;
+    /// Config shared by the tests: outer_dim 2, page_size 16, inner_dim 128, 2-byte dtype.
+    fn make_config(num_blocks: usize, num_layers: usize) -> LayoutConfig {
+        LayoutConfig::builder()
+            .num_blocks(num_blocks)
+            .num_layers(num_layers)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap()
+    }
+    /// A layout can be created over a mock buffer of exactly the required size.
+    #[test]
+    fn test_fully_contiguous_layout_creation() {
+        let config = make_config(10, 4);
+        let total_bytes = config.required_bytes();
+        assert_eq!(total_bytes, 10 * 4 * 2 * 16 * 128 * 2);
+        let buffer = Buffer::from_arc(MockMemory::new(0x1000, total_bytes));
+        let layout = FullyContiguousLayout::new(config, buffer).unwrap();
+        assert_eq!(layout.num_blocks(), 10);
+        assert!(layout.is_fully_contiguous());
+    }
+    /// Region addresses follow the block / layer / outer strides.
+    #[test]
+    fn test_memory_region() {
+        let config = make_config(2, 2);
+        let total_bytes = config.required_bytes();
+        let buffer = Buffer::from_arc(MockMemory::new(0x1000, total_bytes));
+        let layout = FullyContiguousLayout::new(config.clone(), buffer).unwrap();
+        let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes;
+        // ((block, layer, outer), expected address)
+        let expectations = [
+            ((0, 0, 0), 0x1000),
+            ((0, 0, 1), 0x1000 + region_size),
+            ((0, 1, 0), 0x1000 + 2 * region_size),
+            (
+                (1, 0, 0),
+                0x1000 + (config.outer_dim * config.num_layers * region_size),
+            ),
+        ];
+        for ((block_id, layer_id, outer_id), expected_addr) in expectations {
+            let region = layout.memory_region(block_id, layer_id, outer_id).unwrap();
+            assert_eq!(region.addr, expected_addr);
+            assert_eq!(region.size(), region_size);
+        }
+    }
+}
--- a/lib/kvbm-physical/src/layout/kv_block_layout.rs
+++ b/lib/kvbm-physical/src/layout/kv_block_layout.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! KV Block layout types for describing dimension permutations within blocks.
+//!
+//! This module provides types for describing how dimensions are ordered within
+//! a fully contiguous KV cache block, enabling type-driven kernel selection
+//! for transfers between different layout formats.
+use serde::{Deserialize, Serialize};
+/// Symbolic dimensions that can be permuted within a block.
+///
+/// The head dimension (hd) is always innermost and not included here.
+/// The abbreviation in parentheses on each variant is what its `Display`
+/// implementation prints.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum BlockDim {
+    /// Number of layers (nl)
+    Layer,
+    /// Outer dimension (no) - typically 2 for K/V, 1 for MLA
+    Outer,
+    /// Page size / tokens per block (nt)
+    Page,
+    /// Number of attention heads (nh)
+    Head,
+}
+/// Block layout defined by dimension ordering.
+///
+/// Describes how the 4 permutable dimensions (layer, outer, page, head) are
+/// ordered within a fully contiguous block. The head dimension (hd) is always
+/// innermost and implicit.
+///
+/// The order specifies outer-to-inner dimensions, with head_dim always last.
+///
+/// # Examples
+///
+/// - `UniversalTP`: `[nh, nl, no, nt, hd]` - heads outermost for TP resharding
+/// - `OperationalNHD`: `[nl, no, nt, nh, hd]` - inner is `[nt, nh, hd]`
+///
+/// NOTE(review): this enum derives `Serialize`/`Deserialize`; reordering variants
+/// may change the encoded form for index-based formats — confirm before reordering.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum KvBlockLayout {
+    /// Universal format: `[nh, nl, no, nt, hd]`
+    ///
+    /// Heads are outermost to enable tensor parallelism (TP) resharding.
+    /// Cache saved from one TP configuration can be loaded into another
+    /// by simply slicing the head dimension differently.
+    UniversalTP,
+    /// Pipeline parallelism format: `[nl, nh, no, nt, hd]`
+    ///
+    /// Layers are outermost for pipeline parallelism scenarios.
+    UniversalPP,
+    /// Operational HND format: `[nl, no, nh, nt, hd]`
+    ///
+    /// Inner tensor shape is `[nh, nt, hd]` (heads, tokens, head_dim).
+    OperationalHND,
+    /// Operational NHD format: `[nl, no, nt, nh, hd]`
+    ///
+    /// Inner tensor shape is `[nt, nh, hd]` (tokens, heads, head_dim).
+    /// This is the most common format used by vLLM and other frameworks.
+    OperationalNHD,
+    /// Custom ordering with explicit dimension list.
+    ///
+    /// The array specifies dimensions from outermost to innermost,
+    /// with head_dim always implicitly last. The array is stored as given;
+    /// nothing in this type checks that it is a permutation of the four dimensions.
+    Custom([BlockDim; 4]),
+    /// Unknown layout - fallback when format cannot be determined.
+    ///
+    /// Operations involving Unknown layouts may fail or require explicit
+    /// configuration. This is also the `Default` value.
+    Unknown,
+}
+impl Default for KvBlockLayout {
+    /// Defaults to [`KvBlockLayout::Unknown`]: the actual format is only known
+    /// once runtime detection has determined it.
+    fn default() -> Self {
+        KvBlockLayout::Unknown
+    }
+}
+impl KvBlockLayout {
+    /// Get the dimension ordering as an array.
+    ///
+    /// The 4 dimensions are listed from outermost to innermost; the head
+    /// dimension (hd) is implicit as the innermost one.
+    ///
+    /// # Returns
+    /// `None` for `Unknown` layout, `Some([BlockDim; 4])` otherwise.
+    pub fn dim_order(&self) -> Option<[BlockDim; 4]> {
+        use BlockDim::{Head, Layer, Outer, Page};
+        let order = match *self {
+            Self::Unknown => return None,
+            Self::Custom(order) => order,
+            Self::UniversalTP => [Head, Layer, Outer, Page],
+            Self::UniversalPP => [Layer, Head, Outer, Page],
+            Self::OperationalHND => [Layer, Outer, Head, Page],
+            Self::OperationalNHD => [Layer, Outer, Page, Head],
+        };
+        Some(order)
+    }
+    /// Check if two layouts require transformation (not just copy).
+    ///
+    /// - Both known: `true` exactly when their dimension orderings differ.
+    /// - Both `Unknown`: `false` (treated as compatible), and a warning is logged
+    ///   so these cases can be tracked and fixed.
+    /// - One `Unknown`, one known: `true` (conservative).
+    pub fn requires_transform(&self, other: &Self) -> bool {
+        match (self.dim_order(), other.dim_order()) {
+            (None, None) => {
+                // Compatible, but flagged so the missing layout information gets fixed.
+                tracing::warn!("Unknown→Unknown KvBlockLayout comparison - this should be fixed");
+                false
+            }
+            (Some(lhs), Some(rhs)) => lhs != rhs,
+            // Exactly one side is Unknown.
+            (None, Some(_)) | (Some(_), None) => true,
+        }
+    }
+    /// Check if this is an operational layout (NHD or HND).
+    ///
+    /// Operational layouts are used for direct computation and have
+    /// layer/outer as the outermost dimensions.
+    pub fn is_operational(&self) -> bool {
+        match self {
+            Self::OperationalNHD | Self::OperationalHND => true,
+            Self::UniversalTP | Self::UniversalPP | Self::Custom(_) | Self::Unknown => false,
+        }
+    }
+    /// Check if this is a universal layout (TP or PP).
+    ///
+    /// Universal layouts are optimized for storage and transfer,
+    /// with different parallelism-friendly orderings.
+    pub fn is_universal(&self) -> bool {
+        match self {
+            Self::UniversalTP | Self::UniversalPP => true,
+            Self::OperationalHND | Self::OperationalNHD | Self::Custom(_) | Self::Unknown => false,
+        }
+    }
+    /// Get the layout name as a string identifier.
+    ///
+    /// Every `Custom` ordering maps to the same name, `"custom"`.
+    pub fn name(&self) -> &'static str {
+        match self {
+            Self::Unknown => "unknown",
+            Self::Custom(_) => "custom",
+            Self::OperationalNHD => "operational_nhd",
+            Self::OperationalHND => "operational_hnd",
+            Self::UniversalPP => "universal_pp",
+            Self::UniversalTP => "universal_tp",
+        }
+    }
+    /// Try to create a KvBlockLayout from an InnerShape.
+    ///
+    /// This provides compatibility with the existing InnerShape enum.
+    pub(crate) fn from_inner_shape(inner_shape: super::InnerShape) -> Self {
+        use super::InnerShape;
+        match inner_shape {
+            InnerShape::Unknown => Self::Unknown,
+            InnerShape::HND => Self::OperationalHND,
+            InnerShape::NHD => Self::OperationalNHD,
+        }
+    }
+    /// Convert to InnerShape if this is an operational or unknown layout.
+    ///
+    /// Returns `None` for universal or custom layouts.
+    pub(crate) fn to_inner_shape(self) -> Option<super::InnerShape> {
+        use super::InnerShape;
+        match self {
+            Self::UniversalTP | Self::UniversalPP | Self::Custom(_) => None,
+            Self::Unknown => Some(InnerShape::Unknown),
+            Self::OperationalHND => Some(InnerShape::HND),
+            Self::OperationalNHD => Some(InnerShape::NHD),
+        }
+    }
+}
+impl std::fmt::Display for KvBlockLayout {
+    /// Writes a human-readable label; named layouts include their dimension order,
+    /// `Custom` prints the `Debug` form of its ordering.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let label = match self {
+            Self::Custom(order) => return write!(f, "Custom {:?}", order),
+            Self::UniversalTP => "Universal TP [nh, nl, no, nt, hd]",
+            Self::UniversalPP => "Universal PP [nl, nh, no, nt, hd]",
+            Self::OperationalHND => "Operational HND [nl, no, nh, nt, hd]",
+            Self::OperationalNHD => "Operational NHD [nl, no, nt, nh, hd]",
+            Self::Unknown => "Unknown",
+        };
+        f.write_str(label)
+    }
+}
+impl std::fmt::Display for BlockDim {
+    /// Writes the two-letter abbreviation of the dimension.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let abbreviation = match self {
+            Self::Head => "nh",
+            Self::Page => "nt",
+            Self::Outer => "no",
+            Self::Layer => "nl",
+        };
+        f.write_str(abbreviation)
+    }
+}
+// ============================================================================
+// KvBlocks - Collection wrapper for blocks with shared layout
+// ============================================================================
+use crate::BlockId;
+use crate::layout::PhysicalLayout;
+use std::sync::Arc;
+/// A collection of blocks with a shared layout configuration and block layout type.
+///
+/// `KvBlocks` provides a convenient way to group blocks that should be treated
+/// uniformly in transfer operations. All blocks in the collection share:
+/// - The same [`PhysicalLayout`] (memory organization)
+/// - The same [`KvBlockLayout`] interpretation (dimension ordering)
+///
+/// This enables efficient batch transfers with optional layout override.
+///
+/// # Example
+///
+/// ```ignore
+/// // Create blocks with universal layout override
+/// let blocks = KvBlocks::new(
+///     physical_layout.clone(),
+///     vec![0, 1, 2, 3],  // block IDs
+///     Some(KvBlockLayout::UniversalTP),
+/// )?;
+///
+/// // Use in transfers - the override tells the transfer system
+/// // to interpret these blocks as universal format
+/// ```
+#[derive(Debug, Clone)]
+pub struct KvBlocks {
+    /// The physical layout containing these blocks
+    layout: Arc<PhysicalLayout>,
+    /// Block IDs within the layout; each is checked against the layout's block
+    /// count at construction
+    block_ids: Vec<BlockId>,
+    /// Optional layout override (None = use layout's native block_layout).
+    /// An override equal to the native layout is stored as `None`.
+    kv_layout_override: Option<KvBlockLayout>,
+}
+impl KvBlocks {
+    /// Create a new KvBlocks collection.
+    ///
+    /// # Arguments
+    /// * `layout` - The physical layout containing the blocks
+    /// * `block_ids` - Block IDs to include in this collection
+    /// * `kv_layout_override` - Optional override for the block layout interpretation.
+    ///   If `None`, uses the layout's native `block_layout()`.
+    ///   If `Some`, overrides the interpretation for transfers.
+    ///
+    /// # Validation
+    /// - For layer-separate layouts, only operational layouts (NHD/HND) are valid overrides
+    /// - For fully contiguous layouts, any layout is valid
+    /// - If the override matches the native layout, it is normalized to None
+    pub fn new(
+        layout: Arc<PhysicalLayout>,
+        block_ids: Vec<BlockId>,
+        kv_layout_override: Option<KvBlockLayout>,
+    ) -> anyhow::Result<Self> {
+        // Validate block IDs are in range
+        let num_blocks = layout.layout().num_blocks();
+        for &id in &block_ids {
+            if id >= num_blocks {
+                return Err(anyhow::anyhow!(
+                    "Block ID {} out of range (layout has {} blocks)",
+                    id,
+                    num_blocks
+                ));
+            }
+        }
+        // Validate layout override compatibility
+        if let Some(ref override_layout) = kv_layout_override {
+            // Layer-separate layouts can only use operational formats
+            if !layout.layout().is_fully_contiguous() && !override_layout.is_operational() {
+                return Err(anyhow::anyhow!(
+                    "Layer-separate layouts only support operational block layouts (NHD/HND), got {:?}",
+                    override_layout
+                ));
+            }
+        }
+        // Normalize: if override matches native layout, set to None
+        let normalized_override = kv_layout_override.and_then(|override_layout| {
+            if override_layout == layout.layout().block_layout() {
+                None
+            } else {
+                Some(override_layout)
+            }
+        });
+        Ok(Self {
+            layout,
+            block_ids,
+            kv_layout_override: normalized_override,
+        })
+    }
+    /// Create a KvBlocks collection without layout override.
+    #[expect(dead_code)]
+    pub fn from_layout(
+        layout: Arc<PhysicalLayout>,
+        block_ids: Vec<BlockId>,
+    ) -> anyhow::Result<Self> {
+        Self::new(layout, block_ids, None)
+    }
+    /// Get the physical layout.
+    #[expect(dead_code)]
+    pub fn layout(&self) -> &Arc<PhysicalLayout> {
+        &self.layout
+    }
+    /// Get the block IDs.
+    #[expect(dead_code)]
+    pub fn block_ids(&self) -> &[BlockId] {
+        &self.block_ids
+    }
+    /// Get the effective block layout (override or native).
+    pub fn effective_block_layout(&self) -> KvBlockLayout {
+        self.kv_layout_override
+            .unwrap_or_else(|| self.layout.layout().block_layout())
+    }
+    /// Get the layout override if set.
+    #[expect(dead_code)]
+    pub fn layout_override(&self) -> Option<KvBlockLayout> {
+        self.kv_layout_override
+    }
+    /// Check if this collection has a layout override.
+    #[expect(dead_code)]
+    pub fn has_override(&self) -> bool {
+        self.kv_layout_override.is_some()
+    }
+    /// Get the number of blocks in this collection.
+    ///
+    /// This is the length of the stored block ID list.
+    #[expect(dead_code)]
+    pub fn len(&self) -> usize {
+        self.block_ids.len()
+    }
+    /// Check if the collection is empty.
+    ///
+    /// Returns `true` when no block IDs are stored.
+    #[expect(dead_code)]
+    pub fn is_empty(&self) -> bool {
+        self.block_ids.is_empty()
+    }
+    /// Decide whether moving data from this collection into `dst` needs a
+    /// layout transformation.
+    ///
+    /// Both sides are compared by their effective block layouts (override or
+    /// native). A `true` result means a transformation kernel is needed instead
+    /// of a plain copy.
+    #[expect(dead_code)]
+    pub fn requires_transform_to(&self, dst: &KvBlocks) -> bool {
+        let src_layout = self.effective_block_layout();
+        let dst_layout = dst.effective_block_layout();
+        src_layout.requires_transform(&dst_layout)
+    }
+}
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::*;
+    /// Known layouts report their dimension order; `Unknown` reports none.
+    #[test]
+    fn test_dim_order() {
+        let universal_tp = KvBlockLayout::UniversalTP.dim_order();
+        assert_eq!(
+            universal_tp,
+            Some([
+                BlockDim::Head,
+                BlockDim::Layer,
+                BlockDim::Outer,
+                BlockDim::Page
+            ])
+        );
+        let operational_nhd = KvBlockLayout::OperationalNHD.dim_order();
+        assert_eq!(
+            operational_nhd,
+            Some([
+                BlockDim::Layer,
+                BlockDim::Outer,
+                BlockDim::Page,
+                BlockDim::Head
+            ])
+        );
+        assert_eq!(KvBlockLayout::Unknown.dim_order(), None);
+    }
+    /// Table of (source, destination, transform expected) combinations.
+    #[test]
+    fn test_requires_transform() {
+        let cases = [
+            // Same layout - no transform
+            (
+                KvBlockLayout::OperationalNHD,
+                KvBlockLayout::OperationalNHD,
+                false,
+            ),
+            // Different layouts - transform required
+            (
+                KvBlockLayout::OperationalNHD,
+                KvBlockLayout::UniversalTP,
+                true,
+            ),
+            (
+                KvBlockLayout::OperationalHND,
+                KvBlockLayout::OperationalNHD,
+                true,
+            ),
+            // Unknown paired with a known layout requires transform (conservative)
+            (KvBlockLayout::Unknown, KvBlockLayout::OperationalNHD, true),
+            (KvBlockLayout::OperationalNHD, KvBlockLayout::Unknown, true),
+            // Unknown→Unknown is compatible (but emits warning)
+            (KvBlockLayout::Unknown, KvBlockLayout::Unknown, false),
+        ];
+        for (src, dst, expected) in cases {
+            assert_eq!(
+                src.requires_transform(&dst),
+                expected,
+                "unexpected result for {:?} -> {:?}",
+                src,
+                dst
+            );
+        }
+    }
+    #[test]
+    fn test_is_operational() {
+        for layout in [KvBlockLayout::OperationalNHD, KvBlockLayout::OperationalHND] {
+            assert!(layout.is_operational(), "{:?} should be operational", layout);
+        }
+        for layout in [KvBlockLayout::UniversalTP, KvBlockLayout::Unknown] {
+            assert!(
+                !layout.is_operational(),
+                "{:?} should not be operational",
+                layout
+            );
+        }
+    }
+    #[test]
+    fn test_is_universal() {
+        for layout in [KvBlockLayout::UniversalTP, KvBlockLayout::UniversalPP] {
+            assert!(layout.is_universal(), "{:?} should be universal", layout);
+        }
+        assert!(!KvBlockLayout::OperationalNHD.is_universal());
+    }
+    #[test]
+    fn test_default() {
+        let fallback = KvBlockLayout::default();
+        assert_eq!(fallback, KvBlockLayout::Unknown);
+    }
+    /// Both a built-in and a custom layout must survive a JSON round trip.
+    #[test]
+    fn test_serialization() {
+        // Encode to JSON and decode again.
+        fn json_round_trip(layout: &KvBlockLayout) -> KvBlockLayout {
+            let encoded = serde_json::to_string(layout).unwrap();
+            serde_json::from_str(&encoded).unwrap()
+        }
+        let builtin = KvBlockLayout::UniversalTP;
+        assert_eq!(builtin, json_round_trip(&builtin));
+        // Test custom layout
+        let custom = KvBlockLayout::Custom([
+            BlockDim::Head,
+            BlockDim::Page,
+            BlockDim::Layer,
+            BlockDim::Outer,
+        ]);
+        assert_eq!(custom, json_round_trip(&custom));
+    }
+    #[test]
+    fn test_inner_shape_conversion() {
+        use super::super::InnerShape;
+        // InnerShape -> KvBlockLayout
+        let from_nhd = KvBlockLayout::from_inner_shape(InnerShape::NHD);
+        assert_eq!(from_nhd, KvBlockLayout::OperationalNHD);
+        let from_hnd = KvBlockLayout::from_inner_shape(InnerShape::HND);
+        assert_eq!(from_hnd, KvBlockLayout::OperationalHND);
+        // KvBlockLayout -> InnerShape
+        let back_to_nhd = KvBlockLayout::OperationalNHD.to_inner_shape();
+        assert_eq!(back_to_nhd, Some(InnerShape::NHD));
+        assert_eq!(KvBlockLayout::UniversalTP.to_inner_shape(), None);
+    }
+}
--- a/lib/kvbm-physical/src/layout/layer_separate.rs
+++ b/lib/kvbm-physical/src/layout/layer_separate.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Layer-separate layout implementation.
+//!
+//! This layout stores each layer in its own allocation, which is the typical
+//! vLLM layout. Each layer can be either block-contiguous or outer-contiguous:
+//! - Block-contiguous: [num_blocks, outer_dim, page_size, inner_dim]
+//! - Outer-contiguous: [outer_dim, num_blocks, page_size, inner_dim]
+use anyhow::{Result, anyhow};
+use validator::Validate;
+use super::serialize::{LayerSeparateDetails, LayoutTypeDetails};
+use super::{
+    BlockDimension, Buffer, InnerShape, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor,
+    MemoryRegion,
+};
+/// Layer-separate layout where each layer has its own allocation.
+///
+/// Addresses are resolved as
+/// `layer_base_addrs[layer] + block * block_stride + outer * outer_stride`.
+#[derive(Debug)]
+pub struct LayerSeparateLayout {
+    /// Layout configuration (block/layer/outer counts, page size, inner dim, dtype width)
+    config: LayoutConfig,
+    /// Base addresses for each layer
+    layer_base_addrs: Vec<usize>,
+    /// Dimension ordering within each layer: block-first
+    /// (`[num_blocks, outer_dim, ...]`) or block-second (`[outer_dim, num_blocks, ...]`)
+    block_dim: BlockDimension,
+    /// Stride between blocks in bytes
+    block_stride: usize,
+    /// Stride between outer dimensions in bytes
+    outer_stride: usize,
+    /// Size of each memory region (page) in bytes:
+    /// `page_size * inner_dim * dtype_width_bytes`
+    region_size: usize,
+    /// Owned memory regions backing this layout (one per layer)
+    memory_regions: Vec<Buffer>,
+    /// KV block layout for inner tensor format. Intended to be operational
+    /// (NHD or HND); it is `KvBlockLayout::Unknown` when no inner shape was
+    /// supplied at construction.
+    kv_block_layout: KvBlockLayout,
+}
+/// Builder for creating [`LayerSeparateLayout`] instances.
+///
+/// `config`, `memory` and `block_dim` are mandatory; the inner shape is
+/// optional and leaves the KV block layout as `KvBlockLayout::Unknown` when
+/// not set.
+///
+/// # Example
+///
+/// ```ignore
+/// let layout = LayerSeparateLayout::builder()
+///     .config(config)
+///     .memory(memory_regions)
+///     .block_dim(BlockDimension::BlockIsFirstDim)
+///     .inner_shape(InnerShape::NHD)
+///     .build()?;
+/// ```
+#[derive(Debug, Default)]
+pub struct LayerSeparateLayoutBuilder {
+    /// Layout configuration; required by `build`
+    config: Option<LayoutConfig>,
+    /// Backing buffers, one per layer; required by `build`
+    memory: Option<Vec<Buffer>>,
+    /// Block dimension ordering; required by `build`
+    block_dim: Option<BlockDimension>,
+    /// KV block layout derived from the inner shape
+    kv_block_layout: KvBlockLayout,
+}
+impl LayerSeparateLayoutBuilder {
+    /// Start an empty builder.
+    ///
+    /// Nothing is configured yet and the KV block layout is
+    /// `KvBlockLayout::Unknown` (the derived default).
+    pub fn new() -> Self {
+        Self::default()
+    }
+    /// Provide the layout configuration (required).
+    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
+        self.config = Some(config);
+        self
+    }
+    /// Provide the buffers backing the layout, one per layer (required).
+    pub fn memory(&mut self, memory: Vec<Buffer>) -> &mut Self {
+        self.memory = Some(memory);
+        self
+    }
+    /// Choose whether the block or the outer dimension comes first (required).
+    pub fn block_dim(&mut self, block_dim: BlockDimension) -> &mut Self {
+        self.block_dim = Some(block_dim);
+        self
+    }
+    /// Select the inner tensor shape, stored as the matching KV block layout.
+    ///
+    /// Only operational layouts (NHD, HND) are valid for layer-separate layouts.
+    ///
+    /// - `InnerShape::NHD` -> `KvBlockLayout::OperationalNHD`
+    /// - `InnerShape::HND` -> `KvBlockLayout::OperationalHND`
+    /// - `InnerShape::Unknown` -> `KvBlockLayout::Unknown`
+    ///
+    /// When this is never called the layout stays `KvBlockLayout::Unknown`.
+    pub fn inner_shape(&mut self, shape: InnerShape) -> &mut Self {
+        let translated = KvBlockLayout::from_inner_shape(shape);
+        self.kv_block_layout = translated;
+        self
+    }
+    /// Assemble the [`LayerSeparateLayout`] from the values collected so far.
+    ///
+    /// The builder is only borrowed, so the configuration and buffer list are
+    /// cloned and the builder can be reused.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - `config` is not set
+    /// - `memory` is not set
+    /// - `block_dim` is not set
+    /// - The memory region count doesn't match `num_layers`
+    /// - Any memory region is too small for the layout
+    /// - The config validation fails
+    pub fn build(&self) -> Result<LayerSeparateLayout> {
+        let Some(config) = self.config.clone() else {
+            return Err(anyhow!("config is required"));
+        };
+        let Some(memory) = self.memory.clone() else {
+            return Err(anyhow!("memory is required"));
+        };
+        let Some(block_dim) = self.block_dim else {
+            return Err(anyhow!("block_dim is required"));
+        };
+        LayerSeparateLayout::new_internal(config, memory, block_dim, self.kv_block_layout)
+    }
+}
+impl LayerSeparateLayout {
+    /// Create a builder for `LayerSeparateLayout`.
+    pub fn builder() -> LayerSeparateLayoutBuilder {
+        LayerSeparateLayoutBuilder::new()
+    }
+    /// Create a new layer-separate layout with default KV block layout.
+    ///
+    /// # Arguments
+    /// - `config` - Layout configuration
+    /// - `memory` - Vector of owned memory regions (one per layer)
+    /// - `block_dim` - Whether block or outer dimension is first
+    ///
+    /// # Returns
+    /// A new LayerSeparateLayout instance with `KvBlockLayout::Unknown`
+    ///
+    /// # Errors
+    /// Same conditions as the internal constructor: invalid config, wrong
+    /// number of memory regions, a region that is too small, or a size
+    /// computation that does not fit in `usize`.
+    pub(crate) fn new(
+        config: LayoutConfig,
+        memory: Vec<Buffer>,
+        block_dim: BlockDimension,
+    ) -> Result<Self> {
+        Self::new_internal(config, memory, block_dim, KvBlockLayout::Unknown)
+    }
+    /// Multiply `factors` together, failing instead of wrapping on overflow.
+    ///
+    /// `what` names the quantity being computed and is only used in the error
+    /// message. A zero factor yields `Ok(0)` regardless of the other factors.
+    fn checked_size(what: &str, factors: &[usize]) -> Result<usize> {
+        if factors.contains(&0) {
+            return Ok(0);
+        }
+        factors
+            .iter()
+            .try_fold(1usize, |acc, &factor| acc.checked_mul(factor))
+            .ok_or_else(|| anyhow!("{} overflows usize (factors: {:?})", what, factors))
+    }
+    /// Internal constructor with all parameters.
+    ///
+    /// Validates the configuration, derives the strides for the requested
+    /// dimension ordering, and checks that every per-layer buffer can hold
+    /// `num_blocks * outer_dim` regions.
+    ///
+    /// # Errors
+    /// Returns an error if:
+    /// - the config validation fails
+    /// - the memory region count doesn't match `num_layers`
+    /// - a size or stride computation overflows `usize`
+    /// - any memory region is smaller than the required per-layer size
+    fn new_internal(
+        config: LayoutConfig,
+        memory: Vec<Buffer>,
+        block_dim: BlockDimension,
+        kv_block_layout: KvBlockLayout,
+    ) -> Result<Self> {
+        config.validate()?;
+        if memory.len() != config.num_layers {
+            return Err(anyhow!(
+                "Memory region count ({}) must match num_layers ({})",
+                memory.len(),
+                config.num_layers
+            ));
+        }
+        // All size math is checked: a wrapped product would make the buffer
+        // size validation below accept regions that are actually too small.
+        let region_size = Self::checked_size(
+            "Region size",
+            &[config.page_size, config.inner_dim, config.dtype_width_bytes],
+        )?;
+        let required_size = Self::checked_size(
+            "Per-layer allocation size",
+            &[config.num_blocks, config.outer_dim, region_size],
+        )?;
+        // Calculate strides
+        let (block_stride, outer_stride) = if block_dim == BlockDimension::BlockIsSecondDim {
+            // Layout: [outer_dim, num_blocks, page_size, inner_dim]
+            let outer_stride =
+                Self::checked_size("Outer stride", &[region_size, config.num_blocks])?;
+            (region_size, outer_stride)
+        } else {
+            // Layout: [num_blocks, outer_dim, page_size, inner_dim]
+            let block_stride =
+                Self::checked_size("Block stride", &[region_size, config.outer_dim])?;
+            (block_stride, region_size)
+        };
+        // Extract base addresses and validate sizes
+        let mut layer_base_addrs = Vec::with_capacity(memory.len());
+        for (i, mem) in memory.iter().enumerate() {
+            if mem.size() < required_size {
+                return Err(anyhow!(
+                    "Memory region {} too small for layout. Required: {} bytes, got: {} bytes",
+                    i,
+                    required_size,
+                    mem.size()
+                ));
+            }
+            layer_base_addrs.push(mem.addr());
+        }
+        Ok(Self {
+            config,
+            layer_base_addrs,
+            block_dim,
+            block_stride,
+            outer_stride,
+            region_size,
+            memory_regions: memory,
+            kv_block_layout,
+        })
+    }
+    /// Calculate the address of a specific memory region.
+    ///
+    /// The result is the layer's base address plus
+    /// `block_id * block_stride + outer_id * outer_stride`.
+    ///
+    /// # Errors
+    /// Returns an error when `block_id`, `layer_id` or `outer_id` is not below
+    /// the corresponding count in the config. The "max" value printed in the
+    /// message is that count, i.e. the exclusive upper bound.
+    fn calculate_address(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<usize> {
+        if block_id >= self.config.num_blocks {
+            return Err(anyhow!(
+                "Block ID {} out of range (max: {})",
+                block_id,
+                self.config.num_blocks
+            ));
+        }
+        if layer_id >= self.config.num_layers {
+            return Err(anyhow!(
+                "Layer ID {} out of range (max: {})",
+                layer_id,
+                self.config.num_layers
+            ));
+        }
+        if outer_id >= self.config.outer_dim {
+            return Err(anyhow!(
+                "Outer ID {} out of range (max: {})",
+                outer_id,
+                self.config.outer_dim
+            ));
+        }
+        // `layer_id` was range-checked above and there is one base address per layer.
+        let base_addr = self.layer_base_addrs[layer_id];
+        let offset = block_id * self.block_stride + outer_id * self.outer_stride;
+        Ok(base_addr + offset)
+    }
+    /// Get the block dimension ordering used by this layout.
+    #[expect(dead_code)]
+    pub fn block_dim(&self) -> BlockDimension {
+        self.block_dim
+    }
+    /// Get mutable reference to the memory regions for NIXL registration.
+    #[expect(dead_code)]
+    pub fn memory_regions_mut(&mut self) -> &mut [Buffer] {
+        &mut self.memory_regions
+    }
+    /// Get the KV block layout.
+    #[expect(dead_code)]
+    pub fn kv_block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+    /// Set the KV block layout from an inner shape.
+    ///
+    /// Note: Only operational layouts (NHD, HND) are valid for layer-separate layouts.
+    /// The shape is translated as-is; no validation is performed here.
+    #[expect(dead_code)]
+    pub fn set_kv_block_layout(&mut self, inner_shape: InnerShape) {
+        self.kv_block_layout = KvBlockLayout::from_inner_shape(inner_shape);
+    }
+}
+impl Layout for LayerSeparateLayout {
+    /// Configuration this layout was built from.
+    fn config(&self) -> &LayoutConfig {
+        &self.config
+    }
+    /// Backing buffers, one per layer.
+    fn memory_regions(&self) -> &[Buffer] {
+        self.memory_regions.as_slice()
+    }
+    /// Locate the region for `(block_id, layer_id, outer_id)`.
+    ///
+    /// Fails when any of the three indices is out of range.
+    fn memory_region(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<MemoryRegion> {
+        self.calculate_address(block_id, layer_id, outer_id)
+            .map(|addr| MemoryRegion::new(addr, self.region_size))
+    }
+    /// One allocation per layer, each holding `num_blocks * outer_dim` regions.
+    fn required_allocations(&self) -> Vec<usize> {
+        let cfg = &self.config;
+        let bytes_per_layer = cfg.num_blocks * cfg.outer_dim * self.region_size;
+        vec![bytes_per_layer; cfg.num_layers]
+    }
+    /// Always `false`: every layer lives in its own allocation.
+    fn is_fully_contiguous(&self) -> bool {
+        false
+    }
+    fn num_blocks(&self) -> usize {
+        self.config().num_blocks
+    }
+    fn num_layers(&self) -> usize {
+        self.config().num_layers
+    }
+    fn outer_dim(&self) -> usize {
+        self.config().outer_dim
+    }
+    fn page_size(&self) -> usize {
+        self.config().page_size
+    }
+    fn inner_dim(&self) -> usize {
+        self.config().inner_dim
+    }
+    fn dtype_width_bytes(&self) -> usize {
+        self.config().dtype_width_bytes
+    }
+    /// Describe this layout as a layer-separate layout for serialization,
+    /// carrying the dimension ordering and the KV block layout.
+    fn serialization_details(&self) -> LayoutTypeDetails {
+        let details = LayerSeparateDetails {
+            block_dim: self.block_dim,
+            kv_block_layout: self.kv_block_layout,
+        };
+        LayoutTypeDetails::LayerSeparate(details)
+    }
+    /// KV block layout describing the inner tensor format.
+    fn block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+}
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::super::tests::*;
+    use super::*;
+    /// Block-first ordering: 10 blocks, 4 layers, each layer in its own mock buffer.
+    #[test]
+    fn test_layer_separate_block_contiguous() {
+        let config = LayoutConfig::builder()
+            .num_blocks(10)
+            .num_layers(4)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
+        let per_layer_size = 10 * 2 * 16 * 128 * 2;
+        let base = 0x1000;
+        let memory: Vec<Buffer> = (0..4)
+            .map(|layer| {
+                let addr = base + layer * per_layer_size;
+                Buffer::from_arc(MockMemory::new(addr, per_layer_size))
+            })
+            .collect();
+        let layout =
+            LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap();
+        assert_eq!(layout.num_blocks(), 10);
+        assert!(!layout.is_fully_contiguous());
+        let allocations = layout.required_allocations();
+        assert_eq!(allocations.len(), 4);
+    }
+    /// Block-second ordering with the same dimensions.
+    #[test]
+    fn test_layer_separate_outer_contiguous() {
+        let config = LayoutConfig::builder()
+            .num_blocks(10)
+            .num_layers(4)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+        let per_layer_size = 10 * 2 * 16 * 128 * 2;
+        let base = 0x1000;
+        let memory: Vec<Buffer> = (0..4)
+            .map(|layer| {
+                let addr = base + layer * per_layer_size;
+                Buffer::from_arc(MockMemory::new(addr, per_layer_size))
+            })
+            .collect();
+        let layout =
+            LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsSecondDim).unwrap();
+        assert_eq!(layout.num_blocks(), 10);
+        assert!(!layout.is_fully_contiguous());
+    }
+    /// Region lookup resolves to the layer base plus the outer offset.
+    #[test]
+    fn test_memory_region() {
+        let config = LayoutConfig::builder()
+            .num_blocks(2)
+            .num_layers(2)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+        let per_layer_size = 2 * 2 * 16 * 128 * 2;
+        let base = 0x1000;
+        let memory: Vec<Buffer> = (0..2)
+            .map(|layer| {
+                let addr = base + layer * per_layer_size;
+                Buffer::from_arc(MockMemory::new(addr, per_layer_size))
+            })
+            .collect();
+        let layout =
+            LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap();
+        // Test accessing specific memory regions
+        let region_size = 16 * 128 * 2;
+        // Block 0, Layer 0, Outer 0 - should be at layer 0's base address
+        let first = layout.memory_region(0, 0, 0).unwrap();
+        assert_eq!(first.addr, 0x1000);
+        assert_eq!(first.size, region_size);
+        // Block 0, Layer 1, Outer 0 - should be at layer 1's base address
+        let next_layer = layout.memory_region(0, 1, 0).unwrap();
+        assert_eq!(next_layer.addr, 0x1000 + per_layer_size);
+        assert_eq!(next_layer.size, region_size);
+        // Block 0, Layer 0, Outer 1 - should be offset within layer 0
+        let next_outer = layout.memory_region(0, 0, 1).unwrap();
+        assert_eq!(next_outer.addr, 0x1000 + region_size);
+        assert_eq!(next_outer.size, region_size);
+    }
+}
--- a/lib/kvbm-physical/src/layout/mod.rs
+++ b/lib/kvbm-physical/src/layout/mod.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Decoupled layout system for block management.
+//!
+//! This module provides a simplified layout abstraction that:
+//! - Maps block IDs to physical memory regions (address + size)
+//! - Decouples memory regions from storage type information
+//! - Specifies allocation requirements without performing allocation
+//! - Uses trait objects for memory ownership
+pub(crate) mod builder;
+mod config;
+mod fully_contiguous;
+mod kv_block_layout;
+mod layer_separate;
+mod physical;
+mod serialize;
+mod validation;
+#[cfg(all(test, feature = "testing-kvbm"))]
+pub(super) mod tests;
+// #[cfg(test)]
+// mod integration_tests;
+pub use builder::PhysicalLayoutBuilder;
+pub use config::{BlockDimension, LayoutConfig};
+pub(crate) use fully_contiguous::FullyContiguousLayout;
+pub use kv_block_layout::{BlockDim, KvBlockLayout};
+pub(crate) use layer_separate::LayerSeparateLayout;
+pub use physical::NixlMetadata;
+pub use physical::PhysicalLayout;
+pub(crate) use serialize::LayoutDescriptor;
+pub use serialize::{BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutTypeDetails};
+// mod registration;
+// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager};
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+pub(crate) use dynamo_memory::MemoryDescriptor;
+pub use dynamo_memory::{Buffer, MemoryRegion};
+/// Core layout trait for mapping block IDs to memory regions.
+///
+/// Layouts specify how KV cache blocks are organized in memory without
+/// performing allocation themselves. They provide:
+/// - Memory region lookup for specific blocks
+/// - Allocation requirements for external allocators
+/// - Metadata about block organization
+pub trait Layout: Send + Sync + std::fmt::Debug {
+    /// Get the configuration for this layout.
+    fn config(&self) -> &LayoutConfig;
+    /// Get the root memory regions backing this layout.
+    ///
+    /// These regions correspond to the concrete allocations that store the layout's data.
+    /// Implementations that derive memory procedurally can return an empty slice.
+    fn memory_regions(&self) -> &[Buffer];
+    /// Get the memory region for a specific block_id, layer_id, outer_id.
+    ///
+    /// Returns a [MemoryRegion] for the contiguous region specified by the given block_id,
+    /// layer_id, outer_id.
+    ///
+    /// # Arguments
+    /// * `block_id` - The ID of the block to query (0..num_blocks)
+    /// * `layer_id` - The ID of the layer to query (0..num_layers)
+    /// * `outer_id` - The ID of the outer dimension to query (0..outer_dim)
+    ///
+    /// # Errors
+    /// Implementations are expected to return an error when an index is
+    /// outside the ranges listed above.
+    fn memory_region(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<MemoryRegion>;
+    /// Get the allocation requirements for this layout.
+    ///
+    /// Returns a vector of allocation sizes needed to back this layout.
+    /// For fully contiguous layouts, this will be a single size.
+    /// For layer-separate layouts, this will contain one size per layer.
+    ///
+    /// # Returns
+    /// Vector of allocation sizes in bytes.
+    fn required_allocations(&self) -> Vec<usize>;
+    /// Check if this layout uses fully contiguous memory.
+    ///
+    /// Fully contiguous layouts have all blocks in a single allocation,
+    /// which enables certain optimizations.
+    fn is_fully_contiguous(&self) -> bool;
+    /// Get the total number of blocks in this layout.
+    fn num_blocks(&self) -> usize;
+    /// Get the number of layers per block.
+    fn num_layers(&self) -> usize;
+    /// Get the outer dimension size.
+    ///
+    /// In typical KV cache layouts, this is often 2 (for K and V),
+    /// but can be 1 for architectures like MLA.
+    fn outer_dim(&self) -> usize;
+    /// Get the page size (often corresponds to block size in tokens).
+    fn page_size(&self) -> usize;
+    /// Get the inner dimension size.
+    ///
+    /// This is typically the hidden size divided by tensor parallel size.
+    fn inner_dim(&self) -> usize;
+    /// Get the data type width in bytes.
+    fn dtype_width_bytes(&self) -> usize;
+    /// Get serialization details for this layout type.
+    ///
+    /// This provides the layout-type-specific information needed to serialize
+    /// and reconstruct the layout on a remote node.
+    fn serialization_details(&self) -> serialize::LayoutTypeDetails;
+    /// Get the KV block layout describing how dimensions are permuted within blocks.
+    ///
+    /// Returns the internal tensor ordering for blocks in this layout.
+    /// For layer-separate layouts, this describes the inner tensor format.
+    /// For fully contiguous layouts, this describes the full block format.
+    fn block_layout(&self) -> KvBlockLayout;
+}
+/// Inner shape format for tensor layout.
+///
+/// Describes the ordering of the innermost tensor dimensions of a page.
+// The variant names mirror the conventional NHD/HND layout acronyms, hence
+// the lint allowance below.
+#[allow(clippy::upper_case_acronyms)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) enum InnerShape {
+    /// Unknown shape - fallback when we can't determine the format
+    Unknown,
+    /// NHD format: [block_size, num_heads, head_dim]
+    /// Common for attention layers where N=tokens, H=heads, D=dimension
+    NHD,
+    /// HND format: [num_heads, block_size, head_dim]
+    /// Alternative layout with heads first
+    HND,
+}
+/// Trait for layouts that provide contiguous per-block memory regions.
+///
+/// This trait enables direct access to entire blocks as contiguous memory,
+/// without requiring layer/outer indexing. It is implemented by
+/// [`FullyContiguousLayout`] but NOT by [`LayerSeparateLayout`] (which
+/// stores each layer separately).
+///
+/// Use this trait when you need to:
+/// - Access raw block memory for transformation kernels
+/// - Reinterpret block memory under different [`KvBlockLayout`] formats
+/// - Perform whole-block operations without layer decomposition
+pub trait ContiguousBlockLayout: Send + Sync + std::fmt::Debug {
+    /// Get the total number of blocks in this layout.
+    fn num_blocks(&self) -> usize;
+    /// Get the size of each block in bytes.
+    fn bytes_per_block(&self) -> usize;
+    /// Get the contiguous memory region for a specific block.
+    ///
+    /// # Arguments
+    /// * `block_id` - The ID of the block to query (0..num_blocks)
+    ///
+    /// # Returns
+    /// A [`MemoryRegion`] covering the entire block's memory.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id` is out of range.
+    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion>;
+    /// Get the KV block layout for this contiguous layout.
+    ///
+    /// This describes the dimension ordering of a whole block.
+    fn block_layout(&self) -> KvBlockLayout;
+}