feat: kvbm-physical (#6490)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: kvbm-physical (#6490)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
9ab148dc · Ryan Olson · GitHub · 7546c193 · 9ab148dc · 9ab148dc
Unverified Commit 9ab148dc authored Mar 01, 2026 by Ryan Olson Committed by GitHub Mar 01, 2026
20 changed files
--- a/.github/workflows/container-validation-dynamo.yml
+++ b/.github/workflows/container-validation-dynamo.yml
@@ -154,6 +154,8 @@ jobs:
                      cargo fmt -- --check && \
                      cargo clippy --features block-manager,media-ffmpeg,testing-nixl,integration --no-deps --all-targets -- -D warnings && \
                      cargo test --locked --all-targets --features=block-manager,media-ffmpeg,testing-nixl,integration -- --nocapture && \
+                      cargo clippy -p kvbm-physical --no-deps --all-targets -- -D warnings && \
+                      cargo test --locked -p kvbm-physical --features testing-kvbm -- --nocapture --test-threads=4 && \
                      /workspace/container/use-sccache.sh show-stats "Rust Checks"'

  test-parallel:

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3877,6 +3877,14 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "kvbm-common"
+version = "1.0.0"
+dependencies = [
+ "dynamo-tokens",
+ "serde",
+]
+
 [[package]]
 name = "kvbm-kernels"
 version = "1.0.0"
@@ -3894,7 +3902,6 @@ version = "1.0.0"
 dependencies = [
 "anyhow",
 "async-stream",
- "bincode 2.0.1",
 "bytes",
 "derive_builder",
 "dynamo-tokens",
@@ -3914,6 +3921,33 @@ dependencies = [
 "xxhash-rust",
 ]

+[[package]]
+name = "kvbm-physical"
+version = "1.0.0"
+dependencies = [
+ "aligned-vec",
+ "anyhow",
+ "bincode 2.0.1",
+ "blake3",
+ "cudarc",
+ "derive-getters",
+ "derive_builder",
+ "dynamo-memory",
+ "futures",
+ "kvbm-common",
+ "kvbm-kernels",
+ "parking_lot",
+ "rstest 0.26.1",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "uuid",
+ "validator",
+ "velo-events",
+]
+
 [[package]]
 name = "lalrpop-util"
 version = "0.20.2"
@@ -3999,12 +4033,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"

 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
+ "plain",
 "redox_syscall 0.7.3",
 ]

@@ -5474,6 +5509,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"

+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
 [[package]]
 name = "plotters"
 version = "0.3.7"

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,8 +10,10 @@ members = [
    "lib/mocker",
    "lib/kv-router",
    "lib/memory",
+    "lib/kvbm-common",
    "lib/kvbm-kernels",
    "lib/kvbm-logical",
+    "lib/kvbm-physical",
    "lib/async-openai",
    "lib/parsers",
    "lib/bench",
@@ -45,11 +47,13 @@ dynamo-async-openai = { path = "lib/async-openai", version = "1.0.0", features =
 dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }

 # kvbm
+kvbm-common = { path = "lib/kvbm-common", version = "1.0.0" }
 kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" }
 kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" }
+kvbm-physical = { path = "lib/kvbm-physical", version = "1.0.0" }

 # velo
-velo-events = { path = "lib/velo-events", version = "0.9.0" }
+velo-events = { path = "lib/velo-events", version = "1.0.0" }

 # External dependencies
 anyhow = { version = "1" }
@@ -66,7 +70,7 @@ chrono = { version = "0.4", default-features = false, features = [
    "now",
    "serde",
 ] }
-cudarc = { version = "0.19.2", features = ["cuda-12020"] }
+cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"] }
 dashmap = { version = "6.1" }
 derive_builder = { version = "0.20" }
 derive-getters = { version = "0.5" }

--- a/lib/bindings/kvbm/Cargo.lock
+++ b/lib/bindings/kvbm/Cargo.lock
@@ -3015,9 +3015,9 @@ dependencies = [

 [[package]]
 name = "js-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
+checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
 dependencies = [
 "once_cell",
 "wasm-bindgen",
@@ -3329,12 +3329,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"

 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
+ "plain",
 "redox_syscall 0.7.3",
 ]

@@ -4701,6 +4702,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"

+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
 [[package]]
 name = "png"
 version = "0.18.1"
@@ -7422,9 +7429,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
 dependencies = [
 "cfg-if 1.0.4",
 "once_cell",
@@ -7435,9 +7442,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.63"
+version = "0.4.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
+checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
 dependencies = [
 "cfg-if 1.0.4",
 "futures-util",
@@ -7449,9 +7456,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
+checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -7459,9 +7466,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
+checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
 dependencies = [
 "bumpalo",
 "proc-macro2",
@@ -7472,9 +7479,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
+checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
 dependencies = [
 "unicode-ident",
 ]
@@ -7528,9 +7535,9 @@ dependencies = [

 [[package]]
 name = "web-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
+checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
 dependencies = [
 "js-sys",
 "wasm-bindgen",

--- a/lib/bindings/kvbm/Cargo.toml
+++ b/lib/bindings/kvbm/Cargo.toml
@@ -56,6 +56,6 @@ pyo3-async-runtimes = { version = "0.23.0", default-features = false, features =
 ] }

 dlpark = { version = "0.5", features = ["pyo3", "half"], optional = true }
-cudarc = { version = "0.19.2", features = ["cuda-12020"], optional = true }
+cudarc = { version = "0.19.2", features = ["cuda-version-from-build-system", "fallback-latest"], optional = true }

 [dev-dependencies]
--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -3075,9 +3075,9 @@ dependencies = [

 [[package]]
 name = "js-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14dc6f6450b3f6d4ed5b16327f38fed626d375a886159ca555bd7822c0c3a5a6"
+checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
 dependencies = [
 "once_cell",
 "wasm-bindgen",
@@ -3368,12 +3368,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"

 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
+ "plain",
 "redox_syscall 0.7.3",
 ]

@@ -4749,6 +4750,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"

+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
 [[package]]
 name = "png"
 version = "0.18.1"
@@ -7497,9 +7504,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60722a937f594b7fde9adb894d7c092fc1bb6612897c46368d18e7a20208eff2"
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
 dependencies = [
 "cfg-if 1.0.4",
 "once_cell",
@@ -7510,9 +7517,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.63"
+version = "0.4.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a89f4650b770e4521aa6573724e2aed4704372151bd0de9d16a3bbabb87441a"
+checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
 dependencies = [
 "cfg-if 1.0.4",
 "futures-util",
@@ -7524,9 +7531,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac8c6395094b6b91c4af293f4c79371c163f9a6f56184d2c9a85f5a95f3950"
+checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -7534,9 +7541,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab3fabce6159dc20728033842636887e4877688ae94382766e00b180abac9d60"
+checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
 dependencies = [
 "bumpalo",
 "proc-macro2",
@@ -7547,9 +7554,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.113"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de0e091bdb824da87dc01d967388880d017a0a9bc4f3bdc0d86ee9f9336e3bb5"
+checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
 dependencies = [
 "unicode-ident",
 ]
@@ -7603,9 +7610,9 @@ dependencies = [

 [[package]]
 name = "web-sys"
-version = "0.3.90"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "705eceb4ce901230f8625bd1d665128056ccbe4b7408faa625eec1ba80f59a97"
+checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
 dependencies = [
 "js-sys",
 "wasm-bindgen",

--- a/lib/kvbm-common/Cargo.toml
+++ b/lib/kvbm-common/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+[package]
+name = "kvbm-common"
+version.workspace = true
+edition.workspace = true
+description.workspace = true
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+keywords.workspace = true
+
+[dependencies]
+dynamo-tokens = { workspace = true }
+serde = { workspace = true }
--- a/lib/kvbm-common/src/lib.rs
+++ b/lib/kvbm-common/src/lib.rs
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use serde::{Deserialize, Serialize};
+
+pub type BlockId = usize;
+pub type SequenceHash = dynamo_tokens::PositionalLineageHash;
+
+pub use dynamo_tokens as tokens;
+
+/// Logical layout handle naming the storage tier (G1 through G4) that a layout belongs to.
+///
+/// KVBM manages G1, G2 and G3 layouts directly; G4 is managed by an external service.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum LogicalLayoutHandle {
+    /// GPU / device memory.
+    /// G1 is fixed-size and managed by either the framework or the local instance of KVBM.
+    G1,
+    /// CPU / host memory.
+    /// G2 is fixed-size and managed by the local instance of KVBM.
+    G2,
+    /// Disk storage (local or attached storage).
+    /// G3 is fixed-size and managed by the local instance of KVBM.
+    G3,
+    /// Blocks held in an external service,
+    /// outside the control of the KVBM system.
+    G4,
+}
--- a/lib/kvbm-kernels/tests/memcpy_batch.rs
+++ b/lib/kvbm-kernels/tests/memcpy_batch.rs
@@ -20,10 +20,9 @@ use cudarc::runtime::sys as cuda_runtime;
 use kvbm_kernels::{MemcpyBatchMode, is_memcpy_batch_available, is_using_stubs, memcpy_batch};

 // Direct FFI for cudaMallocHost / cudaFreeHost.
-// We bypass cudarc's runtime::sys because cudarc eagerly resolves ALL runtime
-// symbols on first use, and CUDA 13.x removed `cudaGetDeviceProperties_v2`
-// which causes a panic.  Our test binary links against libcudart directly
-// (through kvbm-kernels' build.rs), so these symbols are always available.
+// We link against libcudart directly (through kvbm-kernels' build.rs),
+// so these symbols are always available without going through cudarc's
+// dynamic loader.
 unsafe extern "C" {
    fn cudaMallocHost(ptr: *mut *mut c_void, size: usize) -> u32;
    fn cudaFreeHost(ptr: *mut c_void) -> u32;

--- a/lib/kvbm-logical/Cargo.toml
+++ b/lib/kvbm-logical/Cargo.toml
@@ -15,7 +15,6 @@ dynamo-tokens = { workspace = true }
 anyhow = { workspace = true }
 async-stream = "0.3"
 bytes = "1.10"
-bincode = { version = "2.0.1", features = ["serde", "derive"] }
 derive_builder = { workspace = true }
 futures = { workspace = true }
 lru = "0.16"

--- a/lib/kvbm-logical/src/lib.rs
+++ b/lib/kvbm-logical/src/lib.rs
@@ -15,9 +15,6 @@ pub mod tinylfu;
 #[cfg(any(test, feature = "testing"))]
 pub mod testing;

-use bincode::{Decode, Encode};
-use serde::{Deserialize, Serialize};
-
 // Re-export common types and traits
 pub use blocks::{
    BlockError, BlockMetadata, CompleteBlock, ImmutableBlock, MutableBlock, WeakBlock,
@@ -37,19 +34,3 @@ impl KvbmSequenceHashProvider for dynamo_tokens::TokenBlock {
        self.positional_lineage_hash()
    }
 }
-
-/// Logical layout handle type encoding the layout ID.
-///
-/// KVBM manages G1, G2 and G3 layouts directly. G4 is managed by an external service.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Encode, Decode, Serialize, Deserialize)]
-pub enum LogicalLayoutHandle {
-    /// Representation of GPU / Device Memory
-    G1,
-    /// Representation of CPU / Host Memory
-    G2,
-    /// Representation of Disk Storage
-    G3,
-    /// Representation of Blocks held in an external service
-    /// outside the control of the KVBM system.
-    G4,
-}
--- a/lib/kvbm-physical/Cargo.toml
+++ b/lib/kvbm-physical/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+[package]
+name = "kvbm-physical"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+
+[dependencies]
+dynamo-memory = { workspace = true }
+kvbm-common = { workspace = true }
+kvbm-kernels = { workspace = true }
+velo-events = { workspace = true }
+
+aligned-vec = "0.6.4"
+anyhow = { workspace = true }
+bincode = { version = "2.0.0", features = ["serde", "derive"] }
+blake3 = { version = "1" }
+cudarc = { workspace = true }
+derive_builder = { workspace = true }
+futures = { workspace = true }
+derive-getters = { version = "0.5" }
+parking_lot = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+thiserror = { workspace = true }
+tokio = { workspace = true }
+tracing = { workspace = true }
+uuid = { workspace = true }
+validator = { workspace = true }
+
+[features]
+default = []
+collectives = []
+testing-kvbm = []
+testing-nixl-gds = []
+
+[dev-dependencies]
+dynamo-memory = { workspace = true, features = ["unsafe-slices"] }
+rstest = "0.26"
--- a/lib/kvbm-physical/README.md
+++ b/lib/kvbm-physical/README.md
+# kvbm-physical
+
+Physical layout and transfer management for KV cache block storage.
+
+`kvbm-physical` provides the low-level building blocks for mapping KV cache blocks to memory, registering them for RDMA transfers via NIXL, and executing transfers between heterogeneous storage tiers (GPU, host, disk, remote).
+
+## Modules
+
+### `layout` — Block-to-memory mapping
+
+Abstractions for how KV cache blocks are organized in memory.
+
+- **`Layout` trait** — Core abstraction mapping `(block_id, layer_id, outer_id)` to a `MemoryRegion`. Implementations include fully contiguous (single allocation) and layer-separate (one allocation per layer) variants.
+- **`KvBlockLayout`** — Describes dimension ordering within a block. Five named formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`, `Custom`) plus `Unknown`. Provides `requires_transform()`, `is_operational()`, and `is_universal()` for kernel selection.
+- **`PhysicalLayout`** — Wraps a `Layout` with its physical storage location (`StorageKind`) and NIXL registration metadata (`NixlMetadata`). Constructed via a type-state builder: Config &rarr; Layout type &rarr; Memory allocation &rarr; `build()`.
+- **`LayoutConfig`** — Block dimensions: `num_blocks`, `num_layers`, `outer_dim`, `page_size`, `inner_dim`, `dtype_width_bytes`, optional `num_heads`.
+- **`KvBlocks`** — Groups block IDs with a shared `PhysicalLayout` and optional `KvBlockLayout` override for cross-format transfers.
+
+### `manager` — Layout registration and transfer orchestration
+
+- **`TransferManager`** — Primary API. Registers layouts, exports/imports RDMA metadata between workers, and executes transfers by handle.
+- **`LayoutHandle`** — Compact `u128` encoding `(worker_id, layout_id)`. Identifies a registered layout within a specific worker; not symmetric across workers.
+- **`LogicalLayoutDescriptor`** — Bridges a `LayoutHandle` to a `LogicalLayoutHandle` (G1/G2/G3/G4 tier). Enables callers to say "copy from G1 to G2" while `TransferManager` resolves worker-specific physical handles.
+- **`SerializedLayout`** — Wire format for RDMA metadata exchange. Packs worker address, NIXL metadata, and layout descriptors into a bincode blob.
+- **`WorkerAddress`** — `(worker_id, nixl_agent_name)` pair identifying a worker on the network.
+
+### `transfer` — Transfer configuration and execution
+
+- **`TransferConfig` / builder** — Configures event system, NIXL backends, CUDA device, capabilities, and memory pool before building a `TransferManager`.
+- **`TransferOptions`** — Per-transfer configuration: `layer_range`, `nixl_write_notification`, `bounce_buffer`, caller-provided `cuda_stream`, and src/dst `kv_layout` overrides.
+- **`TransferPreferences`** — Strategy hints via `NativeVsNixlPolicy` (PreferNative / PreferNixl / Automatic).
+- **`TransferCompleteNotification`** — `Either<Ready, EventAwaiter>` implementing `IntoFuture`. Zero-cost for synchronous completions. `aggregate()` composes multiple notifications. `could_yield()` checks if awaiting will suspend.
+- **`BounceBuffer`** — Staging area for two-hop transfers (e.g., Device &rarr; Host &rarr; Remote).
+- **Checksum utilities** — BLAKE3 block/layer checksums for transfer verification.
+- **Fill utilities** — Constant/sequential patterns for testing and initialization.
+
+## Quick Start
+
+```rust,ignore
+use kvbm_physical::{TransferManager, TransferOptions};
+use kvbm_physical::layout::{LayoutConfig, PhysicalLayout};
+
+// 1. Build the TransferManager (creates NIXL agent, CUDA streams, event system)
+let manager = TransferManager::builder()
+    .nixl_backend("ucx")
+    .cuda_device_id(0)
+    .build()?;
+
+// 2. Configure a layout
+let config = LayoutConfig::builder()
+    .num_blocks(64)
+    .num_layers(32)
+    .outer_dim(2)
+    .page_size(16)
+    .inner_dim(128)
+    .dtype_width_bytes(2)
+    .build()?;
+
+// 3. Build a physical layout (type-state builder: config -> layout type -> memory -> build)
+let gpu_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
+    .with_config(config.clone())
+    .fully_contiguous()
+    .allocate_device(0)
+    .build()?;
+
+let host_layout = PhysicalLayout::builder(manager.nixl_agent().clone())
+    .with_config(config)
+    .fully_contiguous()
+    .allocate_pinned(Some(0))
+    .build()?;
+
+// 4. Register layouts to get handles
+let gpu_handle = manager.register_layout(gpu_layout)?;
+let host_handle = manager.register_layout(host_layout)?;
+
+// 5. Execute a transfer and await completion
+let notification = manager.execute_transfer(
+    gpu_handle,
+    &[0, 1, 2, 3],        // source block IDs
+    host_handle,
+    &[0, 1, 2, 3],        // destination block IDs
+    TransferOptions::new(),
+)?;
+notification.await?;
+```
+
+## Testing
+
+Nearly all functional tests in `kvbm-physical` require a real NIXL installation and a CUDA GPU. They are gated behind a single feature flag:
+
+- **`testing-kvbm`** — enables tests requiring NIXL and CUDA (creates NixlAgent instances and allocates device memory / launches kernels)
+
+### Running tests
+
+```bash
+# Without GPU/NIXL — only the sentinel test runs (confirms skipping)
+cargo test -p kvbm-physical
+
+# With GPU + NIXL available
+cargo test -p kvbm-physical --features testing-kvbm
+```
+
+When the `testing-kvbm` feature is not enabled, a single **sentinel test** runs and prints a reminder message. This ensures `cargo test` never silently passes with zero tests.
+
+### What the sentinel test looks like
+
+```
+running 1 test
+test sentinel::all_functional_tests_skipped___enable_testing_nixl_and_testing_cuda ... ok
+```
+
+The `test_version_check_on_deserialization` test in `layout::tests` is the only functional test that runs without feature flags, as it does not require NIXL or CUDA.
+
+## Documentation
+
+- [v1 Migration Guide](docs/v1_migration.md) — Migration from `dynamo-llm::block_manager` to `kvbm-physical`
--- a/lib/kvbm-physical/docs/v1_migration.md
+++ b/lib/kvbm-physical/docs/v1_migration.md
+# Migration Guide: block_manager to kvbm-physical
+
+Guide for migrating from `dynamo-llm::block_manager` (v1) to `kvbm-physical`.
+
+## Overview
+
+`kvbm-physical` is a ground-up rewrite of the physical transfer layer from `lib/llm/src/block_manager/`. The core data flow is the same (register layouts, exchange metadata, execute transfers), but `kvbm-physical` adds block format awareness, richer transfer options, and a cleaner separation between logical tiers and physical handles.
+
+Both implementations use the same `vectorized_copy` CUDA kernel. The original embeds it in a `.fatbin` (`lib/llm/src/block_manager/block/transfer/kernels/vectorized_copy.fatbin`) loaded via `cuModuleLoadData`. `kvbm-physical` wraps the same kernel via the `kvbm-kernels` crate with explicit Rust FFI for transparency and testability.
+
+## Type mapping table
+
+| Original (block_manager) | kvbm-physical | Notes |
+|--------------------------|---------------|-------|
+| `TransportManager` | `TransferManager` | Same role, richer API |
+| `LayoutHandle` | `LayoutHandle` | Same concept; encoding changed — see LayoutHandle docs for details |
+| `PhysicalLayout` + builder | `PhysicalLayout` + builder | Same pattern; adds `with_external_device_regions()` |
+| `LayoutConfig` | `LayoutConfig` | Same fields + optional `num_heads` |
+| `TransferOptions` | `TransferOptions` | Adds `cuda_stream`, `src_kv_layout`, `dst_kv_layout` |
+| `TransferCapabilities` | `TransferCapabilities` | Same |
+| `TransferPreferences` | `TransferPreferences` | Same |
+| `SerializedLayout` | `SerializedLayout` | Same wire format concept |
+| `WorkerAddress` | `WorkerAddress` | Same |
+| `TransferCompleteNotification` (oneshot) | `TransferCompleteNotification` (`Either`/`EventAwaiter`) | Zero-cost sync path |
+| `BounceBufferSpec` (trait object) | `BounceBuffer` (concrete struct) | Simpler, no heap allocation |
+| N/A | `LogicalLayoutDescriptor` | **New** — tier bridging |
+| N/A | `KvBlockLayout` | **New** — block format awareness |
+| N/A | `KvBlocks` | **New** — grouped blocks with layout override |
+| `CudaBlockingH2D` / `CudaBlockingD2H` | Removed | Async-only; `.await` for sync behavior |
+| `OperationalCopyBackend` | Removed | Replaced by `kvbm_kernels` direct FFI |
+
+## What kvbm-physical adds
+
+### LogicalLayoutDescriptor
+
+Bridges `LayoutHandle` (physical) to `LogicalLayoutHandle` (G1/G2/G3/G4 tier). This is the key new abstraction for multi-worker coordination: callers say "copy from G1 to G2" while `TransferManager` resolves worker-specific handles.
+
+```rust,ignore
+// Build descriptor for RDMA exchange
+let descriptor = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
+```
+
+### KvBlockLayout
+
+Four named block formats (`UniversalTP`, `UniversalPP`, `OperationalHND`, `OperationalNHD`) plus `Custom` and `Unknown`. Enables type-driven kernel selection for transfers between different dimension orderings.
+
+```rust,ignore
+let needs_permute = src_layout.requires_transform(&dst_layout);
+```
+
+### kvbm-kernels FFI
+
+The `kvbm_kernels` crate provides `memcpy_batch`, which uses the CUDA 12.9+ batch API with automatic fallback to individual copies. This replaces the fatbin-loading approach with direct Rust FFI.
+
+### Stream pooling
+
+4 H2D + 4 D2H streams with round-robin selection, replacing the original 1+1 stream pair. Reduces contention for concurrent transfers.
+
+### Caller-provided CUDA stream
+
+`TransferOptions::cuda_stream` lets the caller pass in a stream. The executor skips event recording; the caller manages synchronization. Useful for layer-wise transfers where all layers must execute on the same stream.
+
+```rust,ignore
+let stream = manager.context().acquire_h2d_stream();
+let options = TransferOptions::builder()
+    .cuda_stream(stream.clone())
+    .build()?;
+```
+
+### CudaMemPool
+
+Device memory pool for kernel temporary allocations (permute buffers, etc.). Configured via `TransferConfig`:
+
+```rust,ignore
+TransferManager::builder()
+    .cuda_pool_reserve_size(64 * 1024 * 1024)         // 64 MiB pre-allocated
+    .cuda_pool_release_threshold(Some(64 * 1024 * 1024)) // free above this
+    .build()?;
+```
+
+### TransferCompleteNotification::aggregate()
+
+Compose multiple transfer notifications into one that completes when all are done. Optimizes away the aggregation when all inputs are already complete.
+
+```rust,ignore
+let combined = TransferCompleteNotification::aggregate(
+    vec![n1, n2, n3],
+    manager.context().event_system(),
+    &tokio::runtime::Handle::current(),
+)?;
+combined.await?;
+```
+
+### src/dst kv_layout overrides
+
+`TransferOptions` now supports overriding the source and destination block layout interpretation, enabling cross-format transfers without modifying the registered layout.
+
+```rust,ignore
+let options = TransferOptions::builder()
+    .src_kv_layout(KvBlockLayout::OperationalNHD)
+    .dst_kv_layout(KvBlockLayout::UniversalTP)
+    .build()?;
+```
+
+## What was intentionally removed
+
+### Blocking CUDA strategies
+
+`CudaBlockingH2D` and `CudaBlockingD2H` are removed. All transfers are async. For synchronous behavior, just `.await` immediately:
+
+```rust,ignore
+// v1 (blocking)
+let result = blocking_h2d_transfer(...);
+
+// kvbm-physical (async, but can be used synchronously)
+let notification = manager.execute_transfer(...)?;
+notification.await?;
+```
+
+### OperationalCopyBackend enum
+
+The `OperationalCopyBackend` enum (which selected between different kernel loading strategies) is removed. `kvbm-physical` uses `kvbm_kernels` direct FFI exclusively, making kernel dispatch transparent.
+
+### Trait object bounce buffer
+
+`BounceBufferSpec` (a trait object requiring heap allocation) is replaced by `BounceBuffer`, a concrete struct wrapping a `LayoutHandle` + block IDs:
+
+```rust,ignore
+// v1
+struct MyBounce { layout: PhysicalLayout, blocks: Vec<BlockId> }
+impl BounceBufferSpec for MyBounce { ... }
+
+// kvbm-physical
+let bounce = BounceBuffer::from_handle(host_handle, vec![0, 1, 2, 3]);
+```
+
+## Migration steps
+
+### 1. Replace TransportManager with TransferManager
+
+The builder pattern is the same. `TransferManager::builder()` returns the same kind of fluent builder.
+
+```rust,ignore
+// v1
+let manager = TransportManager::builder()
+    .worker_id(0)
+    .nixl_backend("ucx")
+    .cuda_device_id(0)
+    .build()?;
+
+// kvbm-physical
+let manager = TransferManager::builder()
+    .nixl_backend("ucx")
+    .cuda_device_id(0)
+    .build()?;
+// worker_id is now derived from the event system
+```
+
+### 2. Replace TransferOptions
+
+Add new fields as needed. Existing `layer_range` and `nixl_write_notification` work the same way.
+
+```rust,ignore
+// v1
+let options = TransferOptions::builder()
+    .layer_range(0..16)
+    .build()?;
+
+// kvbm-physical (same, with optional new fields)
+let options = TransferOptions::builder()
+    .layer_range(0..16)
+    .cuda_stream(stream)        // new: caller-managed stream
+    .src_kv_layout(layout)      // new: format override
+    .build()?;
+```
+
+### 3. Replace BounceBufferSpec with BounceBuffer
+
+```rust,ignore
+// v1 — trait object
+let spec: Box<dyn BounceBufferSpec> = Box::new(MyBounce::new(layout, blocks));
+options.bounce_buffer(spec);
+
+// kvbm-physical — concrete type
+let bounce = BounceBuffer::from_handle(host_handle, block_ids);
+let options = TransferOptions::builder()
+    .bounce_buffer(bounce)
+    .build()?;
+```
+
+### 4. Replace TransferCompleteNotification await pattern
+
+The notification now implements `IntoFuture` directly instead of wrapping a oneshot channel.
+
+```rust,ignore
+// v1
+let notification = manager.execute_transfer(...)?;
+notification.recv().await??;
+
+// kvbm-physical
+let notification = manager.execute_transfer(...)?;
+notification.await?;
+```
+
+### 5. Add LogicalLayoutDescriptor for multi-worker tier resolution
+
+If you coordinate transfers across multiple workers by tier name (G1, G2, etc.), use `LogicalLayoutDescriptor`:
+
+```rust,ignore
+// Build descriptors that include tier information
+let g1_desc = manager.build_logical_descriptor(gpu_handle, LogicalLayoutHandle::G1)?;
+let g2_desc = manager.build_logical_descriptor(host_handle, LogicalLayoutHandle::G2)?;
+
+// Remote workers can now resolve "copy G1 to G2" to the correct physical handles
+```
+
+### 6. Consider KvBlockLayout annotations for cross-format transfers
+
+If your transfers involve blocks stored in different dimension orderings (e.g., operational NHD from the engine vs. universal TP for storage), annotate with `KvBlockLayout`:
+
+```rust,ignore
+let options = TransferOptions::builder()
+    .src_kv_layout(KvBlockLayout::OperationalNHD)
+    .dst_kv_layout(KvBlockLayout::UniversalTP)
+    .build()?;
+```
+
+This tells the executor to select a permute kernel instead of a direct copy.
--- a/lib/kvbm-physical/src/layout/builder.rs
+++ b/lib/kvbm-physical/src/layout/builder.rs
--- a/lib/kvbm-physical/src/layout/config.rs
+++ b/lib/kvbm-physical/src/layout/config.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use derive_builder::Builder;
+use serde::{Deserialize, Serialize};
+use validator::{Validate, ValidationError};
+
+/// Configuration for block layouts.
+///
+/// Holds the block count and the per-block dimensions from which byte sizes
+/// are derived (see `required_bytes()` and `bytes_per_block()`).
+///
+/// The `#[validate]` attributes on fields are checked during layout construction
+/// (e.g., `FullyContiguousLayout::new_internal()`, `LayerSeparateLayout::new_internal()`),
+/// not at builder `.build()` time.
+#[derive(Debug, Clone, Builder, Validate, Serialize, Deserialize, PartialEq, Eq)]
+pub struct LayoutConfig {
+    /// Number of blocks (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub num_blocks: usize,
+
+    /// Number of layers (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub num_layers: usize,
+
+    /// Size of the outer dimension (validated to be 1 or 2).
+    #[validate(range(min = 1, max = 2))]
+    pub outer_dim: usize,
+
+    /// Page size (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub page_size: usize,
+
+    /// Inner dimension (validated to be at least 1).
+    #[validate(range(min = 1))]
+    pub inner_dim: usize,
+
+    /// Alignment; validated to be a power of two. The builder defaults it to 1.
+    #[validate(custom(function = "validate_power_of_2"))]
+    #[builder(default = "1")]
+    pub alignment: usize,
+
+    /// Width of the data type in bytes; validated to be 2, 4 or 8.
+    /// The builder defaults it to 2.
+    #[validate(custom(function = "validate_dtype_width_bytes"))]
+    #[builder(default = "2")]
+    pub dtype_width_bytes: usize,
+
+    /// Number of attention heads (optional).
+    ///
+    /// When provided, enables KvBlockLayout support for universal formats.
+    /// The head dimension can be computed as: `inner_dim / (page_size * num_heads)`.
+    ///
+    /// Required for:
+    /// - Universal layout transformations
+    /// - Per-head memory region access
+    ///
+    /// The builder defaults it to `None`, and a missing value deserializes to `None`.
+    #[builder(default = "None")]
+    #[serde(default)]
+    pub num_heads: Option<usize>,
+}
+
+impl LayoutConfig {
+    /// Builder for LayoutConfig
+    pub fn builder() -> LayoutConfigBuilder {
+        LayoutConfigBuilder::default()
+    }
+
+    /// Get the total number of bytes covered by all blocks.
+    ///
+    /// This is `num_blocks * num_layers * outer_dim * page_size * inner_dim *
+    /// dtype_width_bytes`, saturating at `usize::MAX`. `alignment` is not
+    /// part of the computation.
+    pub fn required_bytes(&self) -> usize {
+        self.num_blocks
+            .saturating_mul(self.num_layers)
+            .saturating_mul(self.outer_dim)
+            .saturating_mul(self.page_size)
+            .saturating_mul(self.inner_dim)
+            .saturating_mul(self.dtype_width_bytes)
+    }
+
+    /// Get the number of bytes per block.
+    ///
+    /// This is the total size of a single block across all layers and outer dimensions,
+    /// saturating at `usize::MAX`.
+    pub fn bytes_per_block(&self) -> usize {
+        self.num_layers
+            .saturating_mul(self.outer_dim)
+            .saturating_mul(self.page_size)
+            .saturating_mul(self.inner_dim)
+            .saturating_mul(self.dtype_width_bytes)
+    }
+
+    /// Compute `page_size * num_heads`, the divisor shared by the KvBlockLayout helpers.
+    ///
+    /// Returns `None` when the product is zero or does not fit in `usize`, so
+    /// callers never divide by zero and never act on a wrapped value.
+    fn kv_block_divisor(&self, num_heads: usize) -> Option<usize> {
+        self.page_size
+            .checked_mul(num_heads)
+            .filter(|&divisor| divisor != 0)
+    }
+
+    /// Get the head dimension if `num_heads` is specified.
+    ///
+    /// Computes `inner_dim / (page_size * num_heads)`.
+    ///
+    /// # Returns
+    /// `Some(head_dim)` if `num_heads` is set, `None` otherwise. The value is
+    /// `Some(0)` when `page_size * num_heads` is zero or overflows `usize`.
+    pub fn head_dim(&self) -> Option<usize> {
+        // NOTE(review): the layouts in this crate size a page as
+        // `page_size * inner_dim * dtype_width_bytes`, which suggests `inner_dim`
+        // is a per-token quantity. Confirm that dividing by `page_size` is intended.
+        self.num_heads.map(|nh| {
+            self.kv_block_divisor(nh)
+                .map_or(0, |divisor| self.inner_dim / divisor)
+        })
+    }
+
+    /// Check if this config supports KvBlockLayout operations.
+    ///
+    /// Returns `true` if `num_heads` is set and the dimensions are valid
+    /// (`page_size * num_heads` is non-zero, fits in `usize`, and evenly divides
+    /// `inner_dim`). Agrees with [`Self::validate_for_kv_block_layout`].
+    pub fn supports_kv_block_layout(&self) -> bool {
+        self.num_heads
+            .and_then(|nh| self.kv_block_divisor(nh))
+            .is_some_and(|divisor| self.inner_dim.is_multiple_of(divisor))
+    }
+
+    /// Validate that this config supports KvBlockLayout operations.
+    ///
+    /// # Returns
+    /// `Ok(())` if valid, `Err` with details otherwise.
+    ///
+    /// # Errors
+    /// - `num_heads_required_for_kv_block_layout` when `num_heads` is `None`
+    /// - `num_heads_must_be_positive` when `num_heads` is `Some(0)`
+    /// - `inner_dim_must_be_divisible_by_page_size_times_num_heads` when
+    ///   `page_size * num_heads` is zero, overflows `usize`, or does not evenly
+    ///   divide `inner_dim`
+    pub fn validate_for_kv_block_layout(&self) -> Result<(), ValidationError> {
+        let Some(nh) = self.num_heads else {
+            return Err(ValidationError::new(
+                "num_heads_required_for_kv_block_layout",
+            ));
+        };
+
+        if nh == 0 {
+            return Err(ValidationError::new("num_heads_must_be_positive"));
+        }
+
+        // A zero or overflowing divisor is rejected here as well, so this method
+        // and `supports_kv_block_layout` can never disagree.
+        let divisible = self
+            .kv_block_divisor(nh)
+            .is_some_and(|divisor| self.inner_dim.is_multiple_of(divisor));
+        if !divisible {
+            return Err(ValidationError::new(
+                "inner_dim_must_be_divisible_by_page_size_times_num_heads",
+            ));
+        }
+
+        Ok(())
+    }
+}
+
+/// Of the first two dimensions of the tensor, `shape[0]` and `shape[1]`, one corresponds to the
+/// block dimension, while the other corresponds to the outer dimension.
+///
+/// The outer dimension is typically:
+/// - 1: MLA or K and V stored together,
+/// - 2: K and V stored separately,
+///
+/// The block dimension tells us the number of blocks.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum BlockDimension {
+    /// The block dimension is the first dimension of the tensor, `[n_blocks, outer_dim, inner_dim]`
+    BlockIsFirstDim,
+
+    /// The block dimension is the second dimension of the tensor, `[outer_dim, n_blocks, inner_dim]`
+    /// This replaces the case where v1's `outer_contiguous` is true.
+    BlockIsSecondDim,
+}
+
+/// Validator for the `alignment` field: accepts only powers of two.
+///
+/// # Errors
+/// Returns `alignment_must_be_power_of_2` for any other value, including `0`.
+pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> {
+    if alignment.is_power_of_two() {
+        Ok(())
+    } else {
+        Err(ValidationError::new("alignment_must_be_power_of_2"))
+    }
+}
+
+/// Validator for the `dtype_width_bytes` field: accepts exactly 2, 4 or 8.
+///
+/// These are the powers of two inside `2..=8`; a width of 1 is rejected, and
+/// 8 is accepted even though the error code reads "less than 8 bytes".
+///
+/// # Errors
+/// Returns `dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes`
+/// for every other value.
+pub fn validate_dtype_width_bytes(dtype_width_bytes: usize) -> Result<(), ValidationError> {
+    match dtype_width_bytes {
+        2 | 4 | 8 => Ok(()),
+        _ => Err(ValidationError::new(
+            "dtype_width_bytes_must_be_power_of_two_and_less_than_8_bytes",
+        )),
+    }
+}
--- a/lib/kvbm-physical/src/layout/fully_contiguous.rs
+++ b/lib/kvbm-physical/src/layout/fully_contiguous.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Fully contiguous layout implementation.
+//!
+//! This layout stores all blocks in a single contiguous memory allocation
+//! with the shape: [num_blocks, num_layers, outer_dim, page_size, inner_dim].
+
+use anyhow::{Result, anyhow};
+use validator::Validate;
+
+use super::serialize::{BlockFormat, FullyContiguousDetails, LayoutTypeDetails};
+use super::{Buffer, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor, MemoryRegion};
+
+/// Fully contiguous layout where all blocks are in a single allocation.
+///
+/// The strides below are derived from the config once, at construction time.
+#[derive(Debug)]
+pub struct FullyContiguousLayout {
+    config: LayoutConfig,
+    /// Base address of the allocation
+    base_addr: usize,
+    /// Stride between blocks in bytes (`layer_stride * num_layers`)
+    block_stride: usize,
+    /// Stride between layers in bytes (`outer_stride * outer_dim`)
+    layer_stride: usize,
+    /// Stride between outer dimensions in bytes (equal to `region_size`)
+    outer_stride: usize,
+    /// Size of each memory region (page) in bytes
+    /// (`page_size * inner_dim * dtype_width_bytes`)
+    region_size: usize,
+    /// Owned memory region backing this layout
+    memory: Buffer,
+    /// Format of blocks in memory
+    block_format: BlockFormat,
+    /// KV block layout describing dimension ordering within blocks
+    kv_block_layout: KvBlockLayout,
+}
+
+/// Builder for creating [`FullyContiguousLayout`] instances.
+///
+/// `config` and `memory` must be set before `build()`; `kv_block_layout` and
+/// `block_format` are optional and start at `KvBlockLayout::Unknown` and
+/// `BlockFormat::default()` respectively.
+///
+/// # Example
+///
+/// ```ignore
+/// let layout = FullyContiguousLayout::builder()
+///     .config(config)
+///     .memory(buffer)
+///     .kv_block_layout(KvBlockLayout::UniversalTP)
+///     .build()?;
+/// ```
+#[derive(Debug, Default)]
+pub struct FullyContiguousLayoutBuilder {
+    /// Layout configuration; required.
+    config: Option<LayoutConfig>,
+    /// Memory buffer backing the layout; required.
+    memory: Option<Buffer>,
+    /// Dimension ordering within blocks.
+    kv_block_layout: KvBlockLayout,
+    /// Format of blocks in memory.
+    block_format: BlockFormat,
+}
+
+impl FullyContiguousLayoutBuilder {
+    /// Create an empty builder: no config, no memory,
+    /// `KvBlockLayout::Unknown` and the default block format.
+    pub fn new() -> Self {
+        // The derived `Default` yields exactly these values.
+        Self::default()
+    }
+
+    /// Provide the layout configuration (required).
+    #[expect(dead_code)]
+    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
+        self.config.replace(config);
+        self
+    }
+
+    /// Provide the memory buffer that backs the layout (required).
+    #[expect(dead_code)]
+    pub fn memory(&mut self, memory: Buffer) -> &mut Self {
+        self.memory.replace(memory);
+        self
+    }
+
+    /// Choose the KV block layout describing dimension ordering.
+    ///
+    /// Default: `KvBlockLayout::Unknown`
+    #[expect(dead_code)]
+    pub fn kv_block_layout(&mut self, layout: KvBlockLayout) -> &mut Self {
+        self.kv_block_layout = layout;
+        self
+    }
+
+    /// Choose the block format.
+    ///
+    /// Default: `BlockFormat::default()` (Operational)
+    #[expect(dead_code)]
+    pub fn block_format(&mut self, format: BlockFormat) -> &mut Self {
+        self.block_format = format;
+        self
+    }
+
+    /// Build the [`FullyContiguousLayout`] from the values collected so far.
+    ///
+    /// The builder is only borrowed: the config and buffer are cloned.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - `config` is not set
+    /// - `memory` is not set
+    /// - The memory region is too small for the layout
+    /// - The config validation fails
+    #[expect(dead_code)]
+    pub fn build(&self) -> Result<FullyContiguousLayout> {
+        let Some(config) = self.config.clone() else {
+            return Err(anyhow!("config is required"));
+        };
+        let Some(memory) = self.memory.clone() else {
+            return Err(anyhow!("memory is required"));
+        };
+
+        FullyContiguousLayout::new_internal(config, memory, self.kv_block_layout, self.block_format)
+    }
+}
+
+impl FullyContiguousLayout {
+    /// Create a builder for `FullyContiguousLayout`.
+    #[expect(dead_code)]
+    pub fn builder() -> FullyContiguousLayoutBuilder {
+        FullyContiguousLayoutBuilder::new()
+    }
+
+    /// Create a new fully contiguous layout with default KV block layout.
+    ///
+    /// # Arguments
+    /// * `config` - Layout configuration
+    /// * `memory` - Owned memory region that backs this layout
+    ///
+    /// # Returns
+    /// A new FullyContiguousLayout instance with `KvBlockLayout::Unknown`
+    pub(crate) fn new(config: LayoutConfig, memory: Buffer) -> Result<Self> {
+        Self::new_internal(
+            config,
+            memory,
+            KvBlockLayout::Unknown,
+            BlockFormat::default(),
+        )
+    }
+
+    /// Compute `(region_size, layer_stride, block_stride, required_size)` in bytes.
+    ///
+    /// Returns `None` if any product overflows `usize`.
+    fn checked_sizes(config: &LayoutConfig) -> Option<(usize, usize, usize, usize)> {
+        let region_size = config
+            .page_size
+            .checked_mul(config.inner_dim)?
+            .checked_mul(config.dtype_width_bytes)?;
+        let layer_stride = region_size.checked_mul(config.outer_dim)?;
+        let block_stride = layer_stride.checked_mul(config.num_layers)?;
+        let required_size = block_stride.checked_mul(config.num_blocks)?;
+        Some((region_size, layer_stride, block_stride, required_size))
+    }
+
+    /// Internal constructor with all parameters.
+    ///
+    /// # Errors
+    /// Returns an error if the config fails validation, if the layout size
+    /// overflows `usize`, or if `memory` is smaller than the layout requires.
+    fn new_internal(
+        config: LayoutConfig,
+        memory: Buffer,
+        kv_block_layout: KvBlockLayout,
+        block_format: BlockFormat,
+    ) -> Result<Self> {
+        config.validate()?;
+
+        let base_addr = memory.addr();
+
+        // Checked arithmetic: a wrapped product would make `required_size` too
+        // small and let an undersized buffer pass the check below.
+        let Some((region_size, layer_stride, block_stride, required_size)) =
+            Self::checked_sizes(&config)
+        else {
+            return Err(anyhow!(
+                "Layout size overflows usize for config: {:?}",
+                config
+            ));
+        };
+        let outer_stride = region_size;
+
+        // Validate that the memory region is large enough
+        if memory.size() < required_size {
+            return Err(anyhow!(
+                "Memory region too small for layout. Required: {} bytes, got: {} bytes",
+                required_size,
+                memory.size()
+            ));
+        }
+
+        Ok(Self {
+            config,
+            base_addr,
+            block_stride,
+            layer_stride,
+            outer_stride,
+            region_size,
+            memory,
+            block_format,
+            kv_block_layout,
+        })
+    }
+
+    /// Create a new fully contiguous layout with a specific block format and KV block layout.
+    ///
+    /// # Arguments
+    /// * `config` - Layout configuration
+    /// * `memory` - Owned memory region that backs this layout
+    /// * `block_format` - Format of blocks in memory
+    /// * `kv_block_layout` - KV block layout describing dimension ordering
+    ///
+    /// # Returns
+    /// A new FullyContiguousLayout instance
+    pub(crate) fn new_with_format(
+        config: LayoutConfig,
+        memory: Buffer,
+        block_format: BlockFormat,
+        kv_block_layout: KvBlockLayout,
+    ) -> Result<Self> {
+        // `new_internal` takes the two trailing arguments in the opposite order.
+        Self::new_internal(config, memory, kv_block_layout, block_format)
+    }
+
+    /// Get the block format.
+    #[expect(dead_code)]
+    pub fn block_format(&self) -> BlockFormat {
+        self.block_format
+    }
+
+    /// Get the KV block layout.
+    #[expect(dead_code)]
+    pub fn kv_block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+
+    /// Set the KV block layout.
+    #[expect(dead_code)]
+    pub fn set_kv_block_layout(&mut self, layout: KvBlockLayout) {
+        self.kv_block_layout = layout;
+    }
+
+    /// Calculate the address of a specific memory region.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id`, `layer_id` or `outer_id` is not below
+    /// the corresponding count in the config.
+    fn calculate_address(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<usize> {
+        if block_id >= self.config.num_blocks {
+            return Err(anyhow!(
+                "Block ID {} out of range (count: {})",
+                block_id,
+                self.config.num_blocks
+            ));
+        }
+        if layer_id >= self.config.num_layers {
+            return Err(anyhow!(
+                "Layer ID {} out of range (count: {})",
+                layer_id,
+                self.config.num_layers
+            ));
+        }
+        if outer_id >= self.config.outer_dim {
+            return Err(anyhow!(
+                "Outer ID {} out of range (count: {})",
+                outer_id,
+                self.config.outer_dim
+            ));
+        }
+
+        Ok(self.base_addr
+            + block_id * self.block_stride
+            + layer_id * self.layer_stride
+            + outer_id * self.outer_stride)
+    }
+
+    /// Get a mutable reference to the backing memory buffer for NIXL registration.
+    #[expect(dead_code)]
+    pub fn memory_arc_mut(&mut self) -> &mut Buffer {
+        &mut self.memory
+    }
+}
+
+impl Layout for FullyContiguousLayout {
+    /// The configuration this layout was built from.
+    fn config(&self) -> &LayoutConfig {
+        &self.config
+    }
+
+    /// The single backing buffer, exposed as a one-element slice.
+    fn memory_regions(&self) -> &[Buffer] {
+        std::slice::from_ref(&self.memory)
+    }
+
+    /// The region addressed by `(block_id, layer_id, outer_id)`.
+    ///
+    /// Fails when any of the three indices is out of range.
+    fn memory_region(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<MemoryRegion> {
+        self.calculate_address(block_id, layer_id, outer_id)
+            .map(|addr| MemoryRegion::new(addr, self.region_size))
+    }
+
+    /// One allocation that covers every block.
+    fn required_allocations(&self) -> Vec<usize> {
+        let total_bytes = self.block_stride * self.config.num_blocks;
+        vec![total_bytes]
+    }
+
+    /// Always `true` for this layout type.
+    fn is_fully_contiguous(&self) -> bool {
+        true
+    }
+
+    /// Number of blocks, from the config.
+    fn num_blocks(&self) -> usize {
+        self.config.num_blocks
+    }
+
+    /// Number of layers, from the config.
+    fn num_layers(&self) -> usize {
+        self.config.num_layers
+    }
+
+    /// Outer dimension, from the config.
+    fn outer_dim(&self) -> usize {
+        self.config.outer_dim
+    }
+
+    /// Page size, from the config.
+    fn page_size(&self) -> usize {
+        self.config.page_size
+    }
+
+    /// Inner dimension, from the config.
+    fn inner_dim(&self) -> usize {
+        self.config.inner_dim
+    }
+
+    /// Data type width in bytes, from the config.
+    fn dtype_width_bytes(&self) -> usize {
+        self.config.dtype_width_bytes
+    }
+
+    /// Block format and KV block layout, packaged for serialization.
+    fn serialization_details(&self) -> LayoutTypeDetails {
+        let details = FullyContiguousDetails {
+            block_format: self.block_format,
+            kv_block_layout: self.kv_block_layout,
+        };
+        LayoutTypeDetails::FullyContiguous(details)
+    }
+
+    /// The KV block layout recorded for this layout.
+    fn block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+}
+
+impl super::ContiguousBlockLayout for FullyContiguousLayout {
+    /// Number of blocks, from the config.
+    fn num_blocks(&self) -> usize {
+        self.config.num_blocks
+    }
+
+    /// Size of one whole block in bytes (the block stride).
+    fn bytes_per_block(&self) -> usize {
+        self.block_stride
+    }
+
+    /// The memory region spanning one entire block.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id` is not below the configured block count.
+    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion> {
+        if block_id >= self.config.num_blocks {
+            // The reported value is the block count, not the largest valid ID;
+            // the wording matches `calculate_address`.
+            return Err(anyhow!(
+                "Block ID {} out of range (count: {})",
+                block_id,
+                self.config.num_blocks
+            ));
+        }
+        let addr = self.base_addr + block_id * self.block_stride;
+        Ok(MemoryRegion::new(addr, self.block_stride))
+    }
+
+    /// The KV block layout recorded for this layout.
+    fn block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+}
+
+// Unit tests for `FullyContiguousLayout`; compiled only for test builds with
+// the `testing-kvbm` feature enabled.
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::super::tests::*;
+    use super::*;
+
+    /// A buffer of exactly `required_bytes()` is accepted and the layout
+    /// reports the configured block count.
+    #[test]
+    fn test_fully_contiguous_layout_creation() {
+        let config = LayoutConfig::builder()
+            .num_blocks(10)
+            .num_layers(4)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+
+        let required_bytes = config.required_bytes();
+        assert_eq!(required_bytes, 10 * 4 * 2 * 16 * 128 * 2);
+
+        let memory = Buffer::from_arc(MockMemory::new(0x1000, required_bytes));
+
+        let layout = FullyContiguousLayout::new(config, memory).unwrap();
+        assert_eq!(layout.num_blocks(), 10);
+        assert!(layout.is_fully_contiguous());
+    }
+
+    /// Region addresses advance by one region per outer index, `outer_dim`
+    /// regions per layer, and `outer_dim * num_layers` regions per block.
+    #[test]
+    fn test_memory_region() {
+        let config = LayoutConfig::builder()
+            .num_blocks(2)
+            .num_layers(2)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+
+        let required_size = config.required_bytes();
+        let memory = Buffer::from_arc(MockMemory::new(0x1000, required_size));
+        let layout = FullyContiguousLayout::new(config.clone(), memory).unwrap();
+
+        // Test accessing specific memory regions
+        let region_size = config.page_size * config.inner_dim * config.dtype_width_bytes;
+
+        // Block 0, Layer 0, Outer 0
+        let region = layout.memory_region(0, 0, 0).unwrap();
+        assert_eq!(region.addr, 0x1000);
+        assert_eq!(region.size(), region_size);
+
+        // Block 0, Layer 0, Outer 1
+        let region = layout.memory_region(0, 0, 1).unwrap();
+        assert_eq!(region.addr, 0x1000 + region_size);
+        assert_eq!(region.size(), region_size);
+
+        // Block 0, Layer 1, Outer 0
+        let region = layout.memory_region(0, 1, 0).unwrap();
+        assert_eq!(region.addr, 0x1000 + 2 * region_size);
+        assert_eq!(region.size(), region_size);
+
+        // Block 1, Layer 0, Outer 0
+        let region = layout.memory_region(1, 0, 0).unwrap();
+        assert_eq!(
+            region.addr,
+            0x1000 + (config.outer_dim * config.num_layers * region_size)
+        );
+        assert_eq!(region.size(), region_size);
+    }
+}
--- a/lib/kvbm-physical/src/layout/kv_block_layout.rs
+++ b/lib/kvbm-physical/src/layout/kv_block_layout.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! KV Block layout types for describing dimension permutations within blocks.
+//!
+//! This module provides types for describing how dimensions are ordered within
+//! a fully contiguous KV cache block, enabling type-driven kernel selection
+//! for transfers between different layout formats.
+
+use serde::{Deserialize, Serialize};
+
+/// Symbolic dimensions that can be permuted within a block.
+///
+/// The head dimension (hd) is always innermost and not included here.
+/// Each variant's short name (`nl`, `no`, `nt`, `nh`) is what its `Display`
+/// impl prints.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum BlockDim {
+    /// Number of layers (`nl`)
+    Layer,
+    /// Outer dimension (`no`) - typically 2 for K/V, 1 for MLA
+    Outer,
+    /// Page size / tokens per block (`nt`)
+    Page,
+    /// Number of attention heads (`nh`)
+    Head,
+}
+
+/// Block layout defined by dimension ordering.
+///
+/// Describes how the 4 permutable dimensions (layer, outer, page, head) are
+/// ordered within a fully contiguous block. The head dimension (hd) is always
+/// innermost and implicit.
+///
+/// The order specifies outer-to-inner dimensions, with head_dim always last.
+///
+/// # Examples
+///
+/// - `UniversalTP`: `[nh, nl, no, nt, hd]` - heads outermost for TP resharding
+/// - `OperationalNHD`: `[nl, no, nt, nh, hd]` - inner is `[nt, nh, hd]`
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum KvBlockLayout {
+    /// Universal format: `[nh, nl, no, nt, hd]`
+    ///
+    /// Heads are outermost to enable tensor parallelism (TP) resharding.
+    /// Cache saved from one TP configuration can be loaded into another
+    /// by simply slicing the head dimension differently.
+    UniversalTP,
+
+    /// Pipeline parallelism format: `[nl, nh, no, nt, hd]`
+    ///
+    /// Layers are outermost for pipeline parallelism scenarios.
+    UniversalPP,
+
+    /// Operational HND format: `[nl, no, nh, nt, hd]`
+    ///
+    /// Inner tensor shape is `[nh, nt, hd]` (heads, tokens, head_dim).
+    OperationalHND,
+
+    /// Operational NHD format: `[nl, no, nt, nh, hd]`
+    ///
+    /// Inner tensor shape is `[nt, nh, hd]` (tokens, heads, head_dim).
+    /// This is the most common format used by vLLM and other frameworks.
+    OperationalNHD,
+
+    /// Custom ordering with explicit dimension list.
+    ///
+    /// The array specifies dimensions from outermost to innermost,
+    /// with head_dim always implicitly last.
+    ///
+    /// The variant itself does not enforce that the four entries are distinct.
+    Custom([BlockDim; 4]),
+
+    /// Unknown layout - fallback when format cannot be determined.
+    ///
+    /// Operations involving Unknown layouts may fail or require explicit
+    /// configuration.
+    Unknown,
+}
+
+impl Default for KvBlockLayout {
+    /// Returns [`KvBlockLayout::Unknown`].
+    fn default() -> Self {
+        // Unknown until runtime detection determines the actual format
+        Self::Unknown
+    }
+}
+
+impl KvBlockLayout {
+    /// Dimension ordering of this layout, outermost first.
+    ///
+    /// The head dimension (hd) is not listed; it is implicitly innermost.
+    ///
+    /// # Returns
+    /// `None` for `Unknown` layout, `Some([BlockDim; 4])` otherwise.
+    pub fn dim_order(&self) -> Option<[BlockDim; 4]> {
+        use BlockDim::{Head, Layer, Outer, Page};
+
+        let order = match *self {
+            Self::Unknown => return None,
+            Self::Custom(order) => order,
+            Self::UniversalTP => [Head, Layer, Outer, Page],
+            Self::UniversalPP => [Layer, Head, Outer, Page],
+            Self::OperationalHND => [Layer, Outer, Head, Page],
+            Self::OperationalNHD => [Layer, Outer, Page, Head],
+        };
+        Some(order)
+    }
+
+    /// Decide whether moving data from `self` to `other` needs a transformation
+    /// kernel rather than a plain copy.
+    ///
+    /// - Both known: `true` exactly when the dimension orderings differ.
+    /// - Exactly one `Unknown`: `true` (conservative).
+    /// - Both `Unknown`: `false`, and a warning is logged so the case can be
+    ///   tracked and fixed.
+    pub fn requires_transform(&self, other: &Self) -> bool {
+        let src_order = self.dim_order();
+        let dst_order = other.dim_order();
+
+        if let (Some(src), Some(dst)) = (src_order, dst_order) {
+            return src != dst;
+        }
+
+        // At least one side is Unknown; a known side on either end forces a transform.
+        if src_order.is_some() || dst_order.is_some() {
+            return true;
+        }
+
+        // Unknown→Unknown is treated as compatible, but flagged.
+        tracing::warn!("Unknown→Unknown KvBlockLayout comparison - this should be fixed");
+        false
+    }
+
+    /// `true` for the operational layouts, `OperationalNHD` and `OperationalHND`.
+    ///
+    /// Operational layouts are used for direct computation and have
+    /// layer/outer as the outermost dimensions.
+    pub fn is_operational(&self) -> bool {
+        match self {
+            Self::OperationalNHD | Self::OperationalHND => true,
+            Self::UniversalTP | Self::UniversalPP | Self::Custom(_) | Self::Unknown => false,
+        }
+    }
+
+    /// `true` for the universal layouts, `UniversalTP` and `UniversalPP`.
+    ///
+    /// Universal layouts are optimized for storage and transfer,
+    /// with different parallelism-friendly orderings.
+    pub fn is_universal(&self) -> bool {
+        match self {
+            Self::UniversalTP | Self::UniversalPP => true,
+            Self::OperationalNHD | Self::OperationalHND | Self::Custom(_) | Self::Unknown => false,
+        }
+    }
+
+    /// Stable snake_case identifier for this layout.
+    ///
+    /// Every `Custom` ordering maps to the same name, `"custom"`.
+    pub fn name(&self) -> &'static str {
+        match self {
+            Self::Unknown => "unknown",
+            Self::Custom(_) => "custom",
+            Self::OperationalNHD => "operational_nhd",
+            Self::OperationalHND => "operational_hnd",
+            Self::UniversalPP => "universal_pp",
+            Self::UniversalTP => "universal_tp",
+        }
+    }
+
+    /// Map an `InnerShape` onto the corresponding `KvBlockLayout`.
+    ///
+    /// This provides compatibility with the existing InnerShape enum.
+    pub(crate) fn from_inner_shape(inner_shape: super::InnerShape) -> Self {
+        use super::InnerShape;
+
+        match inner_shape {
+            InnerShape::Unknown => Self::Unknown,
+            InnerShape::HND => Self::OperationalHND,
+            InnerShape::NHD => Self::OperationalNHD,
+        }
+    }
+
+    /// Map this layout back onto an `InnerShape` where one exists.
+    ///
+    /// Operational layouts and `Unknown` have a counterpart; universal and
+    /// custom layouts return `None`.
+    pub(crate) fn to_inner_shape(self) -> Option<super::InnerShape> {
+        use super::InnerShape;
+
+        match self {
+            Self::OperationalNHD => Some(InnerShape::NHD),
+            Self::OperationalHND => Some(InnerShape::HND),
+            Self::Unknown => Some(InnerShape::Unknown),
+            Self::UniversalTP | Self::UniversalPP | Self::Custom(_) => None,
+        }
+    }
+}
+
+impl std::fmt::Display for KvBlockLayout {
+    /// Writes a human-readable name followed by the dimension order; `Custom`
+    /// prints its dimension array in `Debug` form.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let label = match self {
+            // The only variant whose text depends on its payload.
+            Self::Custom(order) => return write!(f, "Custom {:?}", order),
+            Self::UniversalTP => "Universal TP [nh, nl, no, nt, hd]",
+            Self::UniversalPP => "Universal PP [nl, nh, no, nt, hd]",
+            Self::OperationalHND => "Operational HND [nl, no, nh, nt, hd]",
+            Self::OperationalNHD => "Operational NHD [nl, no, nt, nh, hd]",
+            Self::Unknown => "Unknown",
+        };
+        f.write_str(label)
+    }
+}
+
+impl std::fmt::Display for BlockDim {
+    /// Writes the two-letter short name of the dimension.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let short_name = match self {
+            Self::Layer => "nl",
+            Self::Outer => "no",
+            Self::Page => "nt",
+            Self::Head => "nh",
+        };
+        f.write_str(short_name)
+    }
+}
+
+// ============================================================================
+// KvBlocks - Collection wrapper for blocks with shared layout
+// ============================================================================
+
+use crate::BlockId;
+use crate::layout::PhysicalLayout;
+use std::sync::Arc;
+
+/// A collection of blocks with a shared layout configuration and block layout type.
+///
+/// `KvBlocks` provides a convenient way to group blocks that should be treated
+/// uniformly in transfer operations. All blocks in the collection share:
+/// - The same [`PhysicalLayout`] (memory organization)
+/// - The same [`KvBlockLayout`] interpretation (dimension ordering)
+///
+/// This enables efficient batch transfers with optional layout override.
+///
+/// # Example
+///
+/// ```ignore
+/// // Create blocks with universal layout override
+/// let blocks = KvBlocks::new(
+///     physical_layout.clone(),
+///     vec![0, 1, 2, 3],  // block IDs
+///     Some(KvBlockLayout::UniversalTP),
+/// )?;
+///
+/// // Use in transfers - the override tells the transfer system
+/// // to interpret these blocks as universal format
+/// ```
+#[derive(Debug, Clone)]
+pub struct KvBlocks {
+    /// The physical layout containing these blocks
+    layout: Arc<PhysicalLayout>,
+    /// Block IDs within the layout; each is checked against the layout's
+    /// block count in `KvBlocks::new`
+    block_ids: Vec<BlockId>,
+    /// Optional layout override (None = use layout's native block_layout).
+    /// `KvBlocks::new` stores `None` when the requested override equals the
+    /// native layout.
+    kv_layout_override: Option<KvBlockLayout>,
+}
+
+impl KvBlocks {
+    /// Create a new KvBlocks collection.
+    ///
+    /// # Arguments
+    /// * `layout` - The physical layout containing the blocks
+    /// * `block_ids` - Block IDs to include in this collection
+    /// * `kv_layout_override` - Optional override for the block layout interpretation.
+    ///   If `None`, uses the layout's native `block_layout()`.
+    ///   If `Some`, overrides the interpretation for transfers.
+    ///
+    /// # Validation
+    /// - For layer-separate layouts, only operational layouts (NHD/HND) are valid overrides
+    /// - For fully contiguous layouts, any layout is valid
+    /// - If the override matches the native layout, it is normalized to None
+    pub fn new(
+        layout: Arc<PhysicalLayout>,
+        block_ids: Vec<BlockId>,
+        kv_layout_override: Option<KvBlockLayout>,
+    ) -> anyhow::Result<Self> {
+        // Validate block IDs are in range
+        let num_blocks = layout.layout().num_blocks();
+        for &id in &block_ids {
+            if id >= num_blocks {
+                return Err(anyhow::anyhow!(
+                    "Block ID {} out of range (layout has {} blocks)",
+                    id,
+                    num_blocks
+                ));
+            }
+        }
+
+        // Validate layout override compatibility
+        if let Some(ref override_layout) = kv_layout_override {
+            // Layer-separate layouts can only use operational formats
+            if !layout.layout().is_fully_contiguous() && !override_layout.is_operational() {
+                return Err(anyhow::anyhow!(
+                    "Layer-separate layouts only support operational block layouts (NHD/HND), got {:?}",
+                    override_layout
+                ));
+            }
+        }
+
+        // Normalize: if override matches native layout, set to None
+        let normalized_override = kv_layout_override.and_then(|override_layout| {
+            if override_layout == layout.layout().block_layout() {
+                None
+            } else {
+                Some(override_layout)
+            }
+        });
+
+        Ok(Self {
+            layout,
+            block_ids,
+            kv_layout_override: normalized_override,
+        })
+    }
+
+    /// Create a KvBlocks collection without layout override.
+    #[expect(dead_code)]
+    pub fn from_layout(
+        layout: Arc<PhysicalLayout>,
+        block_ids: Vec<BlockId>,
+    ) -> anyhow::Result<Self> {
+        Self::new(layout, block_ids, None)
+    }
+
+    /// Get the physical layout.
+    #[expect(dead_code)]
+    pub fn layout(&self) -> &Arc<PhysicalLayout> {
+        &self.layout
+    }
+
+    /// Get the block IDs.
+    #[expect(dead_code)]
+    pub fn block_ids(&self) -> &[BlockId] {
+        &self.block_ids
+    }
+
+    /// Get the effective block layout (override or native).
+    pub fn effective_block_layout(&self) -> KvBlockLayout {
+        self.kv_layout_override
+            .unwrap_or_else(|| self.layout.layout().block_layout())
+    }
+
+    /// Get the layout override if set.
+    ///
+    /// Returns `None` both when no override was requested and when the requested
+    /// override equals the layout's native block layout (the constructor
+    /// normalizes that case to `None`).
+    #[expect(dead_code)]
+    pub fn layout_override(&self) -> Option<KvBlockLayout> {
+        self.kv_layout_override
+    }
+
+    /// Check if this collection has a layout override.
+    ///
+    /// An override that matches the native block layout is normalized away by
+    /// the constructor, so `true` means the effective layout differs from the
+    /// layout's native one.
+    #[expect(dead_code)]
+    pub fn has_override(&self) -> bool {
+        self.kv_layout_override.is_some()
+    }
+
+    /// Get the number of blocks in this collection.
+    ///
+    /// This is the count of stored block IDs, not the number of blocks in the
+    /// underlying layout.
+    #[expect(dead_code)]
+    pub fn len(&self) -> usize {
+        self.block_ids.len()
+    }
+
+    /// Check if the collection is empty.
+    ///
+    /// Returns `true` when no block IDs are stored.
+    #[expect(dead_code)]
+    pub fn is_empty(&self) -> bool {
+        self.block_ids.is_empty()
+    }
+
+    /// Decide whether moving data from `self` into `dst` needs a layout transformation.
+    ///
+    /// Both sides are compared by their *effective* block layout (override if
+    /// present, native otherwise). A `true` result means the layouts are not
+    /// directly compatible, so a transformation kernel must be used instead of
+    /// a plain copy.
+    #[expect(dead_code)]
+    pub fn requires_transform_to(&self, dst: &KvBlocks) -> bool {
+        let source_layout = self.effective_block_layout();
+        let target_layout = dst.effective_block_layout();
+        source_layout.requires_transform(&target_layout)
+    }
+}
+
+// Unit tests for `KvBlockLayout`. Only compiled for test builds that also
+// enable the `testing-kvbm` feature.
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::*;
+
+    /// `dim_order` exposes the dimension permutation for known layouts and
+    /// `None` for `Unknown`.
+    #[test]
+    fn test_dim_order() {
+        use BlockDim::*;
+
+        assert_eq!(
+            KvBlockLayout::UniversalTP.dim_order(),
+            Some([Head, Layer, Outer, Page])
+        );
+        assert_eq!(
+            KvBlockLayout::OperationalNHD.dim_order(),
+            Some([Layer, Outer, Page, Head])
+        );
+        assert_eq!(KvBlockLayout::Unknown.dim_order(), None);
+    }
+
+    /// Pins the transform matrix: identical layouts copy directly, everything
+    /// else (including any pairing with exactly one `Unknown`) needs a transform.
+    #[test]
+    fn test_requires_transform() {
+        // Same layout - no transform
+        assert!(!KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::OperationalNHD));
+
+        // Different layouts - transform required
+        assert!(KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::UniversalTP));
+        assert!(KvBlockLayout::OperationalHND.requires_transform(&KvBlockLayout::OperationalNHD));
+
+        // Unknown→Known requires transform (conservative)
+        assert!(KvBlockLayout::Unknown.requires_transform(&KvBlockLayout::OperationalNHD));
+        assert!(KvBlockLayout::OperationalNHD.requires_transform(&KvBlockLayout::Unknown));
+
+        // Unknown→Unknown is compatible (but emits warning)
+        assert!(!KvBlockLayout::Unknown.requires_transform(&KvBlockLayout::Unknown));
+    }
+
+    /// Only the two `Operational*` variants count as operational.
+    #[test]
+    fn test_is_operational() {
+        assert!(KvBlockLayout::OperationalNHD.is_operational());
+        assert!(KvBlockLayout::OperationalHND.is_operational());
+        assert!(!KvBlockLayout::UniversalTP.is_operational());
+        assert!(!KvBlockLayout::Unknown.is_operational());
+    }
+
+    /// The `Universal*` variants are universal; an operational layout is not.
+    #[test]
+    fn test_is_universal() {
+        assert!(KvBlockLayout::UniversalTP.is_universal());
+        assert!(KvBlockLayout::UniversalPP.is_universal());
+        assert!(!KvBlockLayout::OperationalNHD.is_universal());
+    }
+
+    /// `Default` must stay `Unknown`; builders in this crate rely on that default.
+    #[test]
+    fn test_default() {
+        assert_eq!(KvBlockLayout::default(), KvBlockLayout::Unknown);
+    }
+
+    /// JSON round-trip for both a unit variant and the data-carrying `Custom` variant.
+    #[test]
+    fn test_serialization() {
+        let layout = KvBlockLayout::UniversalTP;
+        let json = serde_json::to_string(&layout).unwrap();
+        let deserialized: KvBlockLayout = serde_json::from_str(&json).unwrap();
+        assert_eq!(layout, deserialized);
+
+        // Test custom layout
+        let custom = KvBlockLayout::Custom([
+            BlockDim::Head,
+            BlockDim::Page,
+            BlockDim::Layer,
+            BlockDim::Outer,
+        ]);
+        let json = serde_json::to_string(&custom).unwrap();
+        let deserialized: KvBlockLayout = serde_json::from_str(&json).unwrap();
+        assert_eq!(custom, deserialized);
+    }
+
+    /// Conversions to and from `InnerShape`; non-operational layouts have no
+    /// inner-shape equivalent.
+    #[test]
+    fn test_inner_shape_conversion() {
+        use super::super::InnerShape;
+
+        assert_eq!(
+            KvBlockLayout::from_inner_shape(InnerShape::NHD),
+            KvBlockLayout::OperationalNHD
+        );
+        assert_eq!(
+            KvBlockLayout::from_inner_shape(InnerShape::HND),
+            KvBlockLayout::OperationalHND
+        );
+
+        assert_eq!(
+            KvBlockLayout::OperationalNHD.to_inner_shape(),
+            Some(InnerShape::NHD)
+        );
+        assert_eq!(KvBlockLayout::UniversalTP.to_inner_shape(), None);
+    }
+}
--- a/lib/kvbm-physical/src/layout/layer_separate.rs
+++ b/lib/kvbm-physical/src/layout/layer_separate.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Layer-separate layout implementation.
+//!
+//! This layout stores each layer in its own allocation, which is the typical
+//! vLLM layout. Each layer can be either block-contiguous or outer-contiguous:
+//! - Block-contiguous: [num_blocks, outer_dim, page_size, inner_dim]
+//! - Outer-contiguous: [outer_dim, num_blocks, page_size, inner_dim]
+
+use anyhow::{Result, anyhow};
+use validator::Validate;
+
+use super::serialize::{LayerSeparateDetails, LayoutTypeDetails};
+use super::{
+    BlockDimension, Buffer, InnerShape, KvBlockLayout, Layout, LayoutConfig, MemoryDescriptor,
+    MemoryRegion,
+};
+
+/// Layer-separate layout where each layer has its own allocation.
+///
+/// Addresses are resolved as
+/// `layer_base_addrs[layer] + block_id * block_stride + outer_id * outer_stride`,
+/// and every resolved region is `region_size` bytes long.
+#[derive(Debug)]
+pub struct LayerSeparateLayout {
+    /// Layout configuration (block/layer/outer counts, page size, inner dim, dtype width)
+    config: LayoutConfig,
+    /// Base address of each layer's allocation, indexed by layer ID
+    layer_base_addrs: Vec<usize>,
+    /// Which dimension comes first inside a layer's allocation (block-first or
+    /// outer-first); this determines how `block_stride` and `outer_stride` are derived
+    block_dim: BlockDimension,
+    /// Stride between blocks in bytes
+    block_stride: usize,
+    /// Stride between outer dimensions in bytes
+    outer_stride: usize,
+    /// Size in bytes of one addressable region
+    /// (`page_size * inner_dim * dtype_width_bytes`)
+    region_size: usize,
+    /// Owned memory regions backing this layout (one per layer)
+    memory_regions: Vec<Buffer>,
+    /// KV block layout for inner tensor format (must be operational: NHD or HND)
+    kv_block_layout: KvBlockLayout,
+}
+
+/// Builder for creating [`LayerSeparateLayout`] instances.
+///
+/// `config`, `memory` and `block_dim` are mandatory; `build` fails if any of
+/// them is missing. The KV block layout is optional and stays
+/// `KvBlockLayout::Unknown` unless `inner_shape` is called.
+///
+/// # Example
+///
+/// ```ignore
+/// let layout = LayerSeparateLayout::builder()
+///     .config(config)
+///     .memory(memory_regions)
+///     .block_dim(BlockDimension::BlockIsFirstDim)
+///     .inner_shape(InnerShape::NHD)
+///     .build()?;
+/// ```
+#[derive(Debug, Default)]
+pub struct LayerSeparateLayoutBuilder {
+    /// Layout configuration; required by `build`
+    config: Option<LayoutConfig>,
+    /// Backing buffers, one per layer; required by `build`
+    memory: Option<Vec<Buffer>>,
+    /// Dimension ordering inside each layer allocation; required by `build`
+    block_dim: Option<BlockDimension>,
+    /// Inner tensor format, set through `inner_shape`
+    kv_block_layout: KvBlockLayout,
+}
+
+impl LayerSeparateLayoutBuilder {
+    /// Start an empty builder: nothing required is set yet and the KV block
+    /// layout is `KvBlockLayout::Unknown`.
+    pub fn new() -> Self {
+        Self {
+            kv_block_layout: KvBlockLayout::Unknown,
+            block_dim: None,
+            memory: None,
+            config: None,
+        }
+    }
+
+    /// Provide the layout configuration (required).
+    pub fn config(&mut self, config: LayoutConfig) -> &mut Self {
+        self.config = Some(config);
+        self
+    }
+
+    /// Provide the backing buffers, one per layer (required).
+    pub fn memory(&mut self, memory: Vec<Buffer>) -> &mut Self {
+        self.memory = Some(memory);
+        self
+    }
+
+    /// Choose whether the block or the outer dimension comes first (required).
+    pub fn block_dim(&mut self, block_dim: BlockDimension) -> &mut Self {
+        self.block_dim = Some(block_dim);
+        self
+    }
+
+    /// Select the inner tensor shape; it is stored as the matching KV block layout.
+    ///
+    /// Layer-separate layouts are only meant to carry operational layouts
+    /// (NHD, HND). The mapping is:
+    ///
+    /// - `InnerShape::NHD` -> `KvBlockLayout::OperationalNHD`
+    /// - `InnerShape::HND` -> `KvBlockLayout::OperationalHND`
+    /// - `InnerShape::Unknown` -> `KvBlockLayout::Unknown`
+    ///
+    /// When this method is never called the layout stays `KvBlockLayout::Unknown`.
+    pub fn inner_shape(&mut self, shape: InnerShape) -> &mut Self {
+        let translated = KvBlockLayout::from_inner_shape(shape);
+        self.kv_block_layout = translated;
+        self
+    }
+
+    /// Assemble the [`LayerSeparateLayout`] from the values collected so far.
+    ///
+    /// The builder is borrowed, not consumed: the configuration and the buffer
+    /// list are cloned into the new layout.
+    ///
+    /// # Errors
+    ///
+    /// Fails when:
+    /// - `config`, `memory` or `block_dim` has not been provided (checked in
+    ///   that order)
+    /// - the number of buffers differs from `num_layers`
+    /// - a buffer is too small for the layout
+    /// - config validation fails
+    pub fn build(&self) -> Result<LayerSeparateLayout> {
+        let Some(layout_config) = self.config.clone() else {
+            return Err(anyhow!("config is required"));
+        };
+        let Some(layer_buffers) = self.memory.clone() else {
+            return Err(anyhow!("memory is required"));
+        };
+        let Some(dim_order) = self.block_dim else {
+            return Err(anyhow!("block_dim is required"));
+        };
+
+        LayerSeparateLayout::new_internal(
+            layout_config,
+            layer_buffers,
+            dim_order,
+            self.kv_block_layout,
+        )
+    }
+}
+
+impl LayerSeparateLayout {
+    /// Create a builder for `LayerSeparateLayout`.
+    pub fn builder() -> LayerSeparateLayoutBuilder {
+        LayerSeparateLayoutBuilder::new()
+    }
+
+    /// Create a new layer-separate layout with default KV block layout.
+    ///
+    /// # Arguments
+    /// - `config` - Layout configuration
+    /// - `memory` - Vector of owned memory regions (one per layer)
+    /// - `block_dim` - Whether block or outer dimension is first
+    ///
+    /// # Returns
+    /// A new LayerSeparateLayout instance with `KvBlockLayout::Unknown`
+    ///
+    /// # Errors
+    /// Same failure conditions as the internal constructor: invalid config,
+    /// wrong number of memory regions, a size computation that overflows
+    /// `usize`, or a memory region that is too small.
+    pub(crate) fn new(
+        config: LayoutConfig,
+        memory: Vec<Buffer>,
+        block_dim: BlockDimension,
+    ) -> Result<Self> {
+        Self::new_internal(config, memory, block_dim, KvBlockLayout::Unknown)
+    }
+
+    /// Multiply `factors` together, returning an error instead of wrapping
+    /// when the product does not fit in `usize`.
+    ///
+    /// `what` names the quantity being computed and is used in the error message.
+    fn checked_product(what: &str, factors: &[usize]) -> Result<usize> {
+        factors
+            .iter()
+            .try_fold(1usize, |acc, &factor| acc.checked_mul(factor))
+            .ok_or_else(|| anyhow!("{} overflows usize (factors: {:?})", what, factors))
+    }
+
+    /// Internal constructor with all parameters.
+    ///
+    /// Validates the config, checks that there is exactly one memory region per
+    /// layer, derives the region size and strides, and verifies that every
+    /// memory region is large enough to hold one layer.
+    ///
+    /// # Errors
+    /// Returns an error if config validation fails, if `memory.len()` differs
+    /// from `num_layers`, if any size/stride computation overflows `usize`, or
+    /// if a memory region is smaller than the per-layer requirement.
+    fn new_internal(
+        config: LayoutConfig,
+        memory: Vec<Buffer>,
+        block_dim: BlockDimension,
+        kv_block_layout: KvBlockLayout,
+    ) -> Result<Self> {
+        config.validate()?;
+
+        if memory.len() != config.num_layers {
+            return Err(anyhow!(
+                "Memory region count ({}) must match num_layers ({})",
+                memory.len(),
+                config.num_layers
+            ));
+        }
+
+        // All size arithmetic is checked: a silently wrapped product would make
+        // the size validation below accept buffers that are too small, and the
+        // layout would then hand out addresses outside the allocation.
+        let region_size = Self::checked_product(
+            "Region size",
+            &[config.page_size, config.inner_dim, config.dtype_width_bytes],
+        )?;
+
+        let (block_stride, outer_stride) = if block_dim == BlockDimension::BlockIsSecondDim {
+            // Layout: [outer_dim, num_blocks, page_size, inner_dim]
+            let block_stride = region_size;
+            let outer_stride =
+                Self::checked_product("Outer stride", &[block_stride, config.num_blocks])?;
+            (block_stride, outer_stride)
+        } else {
+            // Layout: [num_blocks, outer_dim, page_size, inner_dim]
+            let outer_stride = region_size;
+            let block_stride =
+                Self::checked_product("Block stride", &[outer_stride, config.outer_dim])?;
+            (block_stride, outer_stride)
+        };
+
+        // Extract base addresses and validate sizes
+        let required_size = Self::checked_product(
+            "Per-layer allocation size",
+            &[config.num_blocks, config.outer_dim, region_size],
+        )?;
+        let mut layer_base_addrs = Vec::with_capacity(config.num_layers);
+
+        for (i, mem) in memory.iter().enumerate() {
+            if mem.size() < required_size {
+                return Err(anyhow!(
+                    "Memory region {} too small for layout. Required: {} bytes, got: {} bytes",
+                    i,
+                    required_size,
+                    mem.size()
+                ));
+            }
+            layer_base_addrs.push(mem.addr());
+        }
+
+        Ok(Self {
+            config,
+            layer_base_addrs,
+            block_dim,
+            block_stride,
+            outer_stride,
+            region_size,
+            memory_regions: memory,
+            kv_block_layout,
+        })
+    }
+
+    /// Calculate the address of a specific memory region.
+    ///
+    /// The address is the layer's base address plus
+    /// `block_id * block_stride + outer_id * outer_stride`.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id`, `layer_id` or `outer_id` is not below
+    /// the corresponding count in the config.
+    fn calculate_address(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<usize> {
+        // The reported bound is exclusive, so it is printed as a half-open range
+        // rather than as a "max" value.
+        if block_id >= self.config.num_blocks {
+            return Err(anyhow!(
+                "Block ID {} out of range (valid range: 0..{})",
+                block_id,
+                self.config.num_blocks
+            ));
+        }
+        if layer_id >= self.config.num_layers {
+            return Err(anyhow!(
+                "Layer ID {} out of range (valid range: 0..{})",
+                layer_id,
+                self.config.num_layers
+            ));
+        }
+        if outer_id >= self.config.outer_dim {
+            return Err(anyhow!(
+                "Outer ID {} out of range (valid range: 0..{})",
+                outer_id,
+                self.config.outer_dim
+            ));
+        }
+
+        // `layer_id` was range-checked above and the constructor stores exactly
+        // `num_layers` base addresses, so this index cannot panic.
+        let base_addr = self.layer_base_addrs[layer_id];
+        // With the IDs in range, the offset stays below the per-layer size that
+        // the constructor computed with checked arithmetic.
+        let offset = block_id * self.block_stride + outer_id * self.outer_stride;
+
+        Ok(base_addr + offset)
+    }
+
+    /// Get the dimension ordering (block-first or outer-first) of this layout.
+    #[expect(dead_code)]
+    pub fn block_dim(&self) -> BlockDimension {
+        self.block_dim
+    }
+
+    /// Get mutable reference to the memory regions for NIXL registration.
+    #[expect(dead_code)]
+    pub fn memory_regions_mut(&mut self) -> &mut [Buffer] {
+        &mut self.memory_regions
+    }
+
+    /// Get the KV block layout.
+    #[expect(dead_code)]
+    pub fn kv_block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+
+    /// Set the KV block layout from an inner shape.
+    ///
+    /// Note: Only operational layouts (NHD, HND) are valid for layer-separate layouts.
+    #[expect(dead_code)]
+    pub fn set_kv_block_layout(&mut self, inner_shape: InnerShape) {
+        self.kv_block_layout = KvBlockLayout::from_inner_shape(inner_shape);
+    }
+}
+
+impl Layout for LayerSeparateLayout {
+    /// Configuration this layout was constructed from.
+    fn config(&self) -> &LayoutConfig {
+        &self.config
+    }
+
+    /// Buffers backing the layout, one per layer.
+    fn memory_regions(&self) -> &[Buffer] {
+        &self.memory_regions
+    }
+
+    /// Resolve the region for `(block_id, layer_id, outer_id)`.
+    ///
+    /// Address resolution (including range checks) is delegated to
+    /// `calculate_address`; the region always spans `region_size` bytes.
+    fn memory_region(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<MemoryRegion> {
+        self.calculate_address(block_id, layer_id, outer_id)
+            .map(|addr| MemoryRegion::new(addr, self.region_size))
+    }
+
+    /// One allocation per layer, each `num_blocks * outer_dim * region_size` bytes.
+    fn required_allocations(&self) -> Vec<usize> {
+        let regions_per_layer = self.config.num_blocks * self.config.outer_dim;
+        let bytes_per_layer = regions_per_layer * self.region_size;
+        let layer_count = self.config.num_layers;
+        vec![bytes_per_layer; layer_count]
+    }
+
+    /// Always `false`: every layer lives in its own allocation.
+    fn is_fully_contiguous(&self) -> bool {
+        false
+    }
+
+    /// Number of blocks, taken from the config.
+    fn num_blocks(&self) -> usize {
+        self.config.num_blocks
+    }
+
+    /// Number of layers, taken from the config.
+    fn num_layers(&self) -> usize {
+        self.config.num_layers
+    }
+
+    /// Outer dimension size, taken from the config.
+    fn outer_dim(&self) -> usize {
+        self.config.outer_dim
+    }
+
+    /// Page size, taken from the config.
+    fn page_size(&self) -> usize {
+        self.config.page_size
+    }
+
+    /// Inner dimension size, taken from the config.
+    fn inner_dim(&self) -> usize {
+        self.config.inner_dim
+    }
+
+    /// Data type width in bytes, taken from the config.
+    fn dtype_width_bytes(&self) -> usize {
+        self.config.dtype_width_bytes
+    }
+
+    /// Layer-separate specific details: dimension ordering plus KV block layout.
+    fn serialization_details(&self) -> LayoutTypeDetails {
+        let details = LayerSeparateDetails {
+            block_dim: self.block_dim,
+            kv_block_layout: self.kv_block_layout,
+        };
+        LayoutTypeDetails::LayerSeparate(details)
+    }
+
+    /// KV block layout describing the inner tensor format.
+    fn block_layout(&self) -> KvBlockLayout {
+        self.kv_block_layout
+    }
+}
+
+// Unit tests for `LayerSeparateLayout`. Only compiled for test builds that
+// also enable the `testing-kvbm` feature; `MockMemory` comes from the shared
+// layout test helpers.
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::super::tests::*;
+    use super::*;
+
+    /// Block-first ordering: construction succeeds with exactly sized buffers
+    /// and one allocation per layer is reported.
+    #[test]
+    fn test_layer_separate_block_contiguous() {
+        let config = LayoutConfig::builder()
+            .num_blocks(10)
+            .num_layers(4)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+
+        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
+        let per_layer_size = 10 * 2 * 16 * 128 * 2;
+        // Back-to-back mock buffers, one per layer, starting at 0x1000.
+        let memory: Vec<Buffer> = (0..4)
+            .map(|i| Buffer::from_arc(MockMemory::new(0x1000 + i * per_layer_size, per_layer_size)))
+            .collect();
+
+        let layout =
+            LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap();
+
+        assert_eq!(layout.num_blocks(), 10);
+        assert!(!layout.is_fully_contiguous());
+        assert_eq!(layout.required_allocations().len(), 4);
+    }
+
+    /// Outer-first ordering: construction succeeds with the same buffer sizes.
+    #[test]
+    fn test_layer_separate_outer_contiguous() {
+        let config = LayoutConfig::builder()
+            .num_blocks(10)
+            .num_layers(4)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+
+        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
+        let per_layer_size = 10 * 2 * 16 * 128 * 2;
+        let memory: Vec<Buffer> = (0..4)
+            .map(|i| Buffer::from_arc(MockMemory::new(0x1000 + i * per_layer_size, per_layer_size)))
+            .collect();
+
+        let layout =
+            LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsSecondDim).unwrap();
+        assert_eq!(layout.num_blocks(), 10);
+        assert!(!layout.is_fully_contiguous());
+    }
+
+    /// Address resolution for the block-first ordering: layers start at their
+    /// own base address and the outer index advances by one region.
+    #[test]
+    fn test_memory_region() {
+        let config = LayoutConfig::builder()
+            .num_blocks(2)
+            .num_layers(2)
+            .outer_dim(2)
+            .page_size(16)
+            .inner_dim(128)
+            .dtype_width_bytes(2)
+            .build()
+            .unwrap();
+
+        // num_blocks * outer_dim * page_size * inner_dim * dtype_width_bytes
+        let per_layer_size = 2 * 2 * 16 * 128 * 2;
+        let memory: Vec<Buffer> = (0..2)
+            .map(|i| Buffer::from_arc(MockMemory::new(0x1000 + i * per_layer_size, per_layer_size)))
+            .collect();
+
+        let layout =
+            LayerSeparateLayout::new(config, memory, BlockDimension::BlockIsFirstDim).unwrap();
+
+        // Test accessing specific memory regions
+        // page_size * inner_dim * dtype_width_bytes
+        let region_size = 16 * 128 * 2;
+
+        // Block 0, Layer 0, Outer 0 - should be at layer 0's base address
+        let region = layout.memory_region(0, 0, 0).unwrap();
+        assert_eq!(region.addr, 0x1000);
+        assert_eq!(region.size, region_size);
+
+        // Block 0, Layer 1, Outer 0 - should be at layer 1's base address
+        let region = layout.memory_region(0, 1, 0).unwrap();
+        assert_eq!(region.addr, 0x1000 + per_layer_size);
+        assert_eq!(region.size, region_size);
+
+        // Block 0, Layer 0, Outer 1 - should be offset within layer 0
+        let region = layout.memory_region(0, 0, 1).unwrap();
+        assert_eq!(region.addr, 0x1000 + region_size);
+        assert_eq!(region.size, region_size);
+    }
+}
--- a/lib/kvbm-physical/src/layout/mod.rs
+++ b/lib/kvbm-physical/src/layout/mod.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Decoupled layout system for block management.
+//!
+//! This module provides a simplified layout abstraction that:
+//! - Maps block IDs to physical memory regions (address + size)
+//! - Decouples memory regions from storage type information
+//! - Specifies allocation requirements without performing allocation
+//! - Uses trait objects for memory ownership
+
+pub(crate) mod builder;
+
+mod config;
+mod fully_contiguous;
+mod kv_block_layout;
+mod layer_separate;
+mod physical;
+mod serialize;
+mod validation;
+
+#[cfg(all(test, feature = "testing-kvbm"))]
+pub(super) mod tests;
+
+// #[cfg(test)]
+// mod integration_tests;
+
+pub use builder::PhysicalLayoutBuilder;
+pub use config::{BlockDimension, LayoutConfig};
+pub(crate) use fully_contiguous::FullyContiguousLayout;
+pub use kv_block_layout::{BlockDim, KvBlockLayout};
+pub(crate) use layer_separate::LayerSeparateLayout;
+pub use physical::NixlMetadata;
+pub use physical::PhysicalLayout;
+pub(crate) use serialize::LayoutDescriptor;
+pub use serialize::{BlockFormat, FullyContiguousDetails, LayerSeparateDetails, LayoutTypeDetails};
+
+// mod registration;
+// pub use registration::{RegisteredLayout, RegisteredStorageMetadata, RegistrationManager};
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+
+pub(crate) use dynamo_memory::MemoryDescriptor;
+pub use dynamo_memory::{Buffer, MemoryRegion};
+
+/// Core layout trait for mapping block IDs to memory regions.
+///
+/// Layouts specify how KV cache blocks are organized in memory without
+/// performing allocation themselves. They provide:
+/// - Memory region lookup for specific blocks
+/// - Allocation requirements for external allocators
+/// - Metadata about block organization
+pub trait Layout: Send + Sync + std::fmt::Debug {
+    /// Get the configuration for this layout.
+    fn config(&self) -> &LayoutConfig;
+
+    /// Get the root memory regions backing this layout.
+    ///
+    /// These regions correspond to the concrete allocations that store the layout's data.
+    /// Implementations that derive memory procedurally can return an empty slice.
+    fn memory_regions(&self) -> &[Buffer];
+
+    /// Get the memory region for a specific block_id, layer_id, outer_id.
+    ///
+    /// Returns a [MemoryRegion] for the contiguous region specified by the given block_id,
+    /// layer_id, outer_id.
+    ///
+    /// # Arguments
+    /// * `block_id` - The ID of the block to query (0..num_blocks)
+    /// * `layer_id` - The ID of the layer to query (0..num_layers)
+    /// * `outer_id` - The ID of the outer dimension to query (0..outer_dim)
+    ///
+    /// # Errors
+    /// Implementations are expected to return an error when any ID is outside
+    /// the ranges given above.
+    fn memory_region(
+        &self,
+        block_id: usize,
+        layer_id: usize,
+        outer_id: usize,
+    ) -> Result<MemoryRegion>;
+
+    /// Get the allocation requirements for this layout.
+    ///
+    /// Returns a vector of allocation sizes needed to back this layout.
+    /// For fully contiguous layouts, this will be a single size.
+    /// For layer-separate layouts, this will contain one size per layer.
+    ///
+    /// # Returns
+    /// Vector of allocation sizes in bytes.
+    fn required_allocations(&self) -> Vec<usize>;
+
+    /// Check if this layout uses fully contiguous memory.
+    ///
+    /// Fully contiguous layouts have all blocks in a single allocation,
+    /// which enables certain optimizations.
+    fn is_fully_contiguous(&self) -> bool;
+
+    /// Get the total number of blocks in this layout.
+    fn num_blocks(&self) -> usize;
+
+    /// Get the number of layers per block.
+    fn num_layers(&self) -> usize;
+
+    /// Get the outer dimension size.
+    ///
+    /// In typical KV cache layouts, this is often 2 (for K and V),
+    /// but can be 1 for architectures like MLA.
+    fn outer_dim(&self) -> usize;
+
+    /// Get the page size (often corresponds to block size in tokens).
+    fn page_size(&self) -> usize;
+
+    /// Get the inner dimension size.
+    ///
+    /// This is typically the hidden size divided by tensor parallel size.
+    fn inner_dim(&self) -> usize;
+
+    /// Get the data type width in bytes.
+    fn dtype_width_bytes(&self) -> usize;
+
+    /// Get serialization details for this layout type.
+    ///
+    /// This provides the layout-type-specific information needed to serialize
+    /// and reconstruct the layout on a remote node.
+    fn serialization_details(&self) -> serialize::LayoutTypeDetails;
+
+    /// Get the KV block layout describing how dimensions are permuted within blocks.
+    ///
+    /// Returns the internal tensor ordering for blocks in this layout.
+    /// For layer-separate layouts, this describes the inner tensor format.
+    /// For fully contiguous layouts, this describes the full block format.
+    fn block_layout(&self) -> KvBlockLayout;
+}
+
+/// Inner shape format for tensor layout
+///
+/// Each shape maps onto a `KvBlockLayout` through `KvBlockLayout::from_inner_shape`.
+// NOTE(review): the all-caps variant names presumably mirror the conventional
+// NHD/HND tensor-format names, hence the lint exemption — confirm.
+#[allow(clippy::upper_case_acronyms)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) enum InnerShape {
+    /// Unknown shape - fallback when we can't determine the format
+    Unknown,
+    /// NHD format: [block_size, num_heads, head_dim]
+    /// Common for attention layers where N=tokens, H=heads, D=dimension
+    NHD,
+    /// HND format: [num_heads, block_size, head_dim]
+    /// Alternative layout with heads first
+    HND,
+}
+
+/// Trait for layouts that provide contiguous per-block memory regions.
+///
+/// This trait enables direct access to entire blocks as contiguous memory,
+/// without requiring layer/outer indexing. It is implemented by
+/// [`FullyContiguousLayout`] but NOT by [`LayerSeparateLayout`] (which
+/// stores each layer separately).
+///
+/// Use this trait when you need to:
+/// - Access raw block memory for transformation kernels
+/// - Reinterpret block memory under different [`KvBlockLayout`] formats
+/// - Perform whole-block operations without layer decomposition
+pub trait ContiguousBlockLayout: Send + Sync + std::fmt::Debug {
+    /// Get the total number of blocks in this layout.
+    fn num_blocks(&self) -> usize;
+
+    /// Get the size of each block in bytes.
+    fn bytes_per_block(&self) -> usize;
+
+    /// Get the contiguous memory region for a specific block.
+    ///
+    /// # Arguments
+    /// * `block_id` - The ID of the block to query (0..num_blocks)
+    ///
+    /// # Returns
+    /// A [`MemoryRegion`] covering the entire block's memory.
+    /// NOTE(review): presumably `bytes_per_block()` bytes long — confirm
+    /// against the implementors.
+    ///
+    /// # Errors
+    /// Returns an error if `block_id` is out of range.
+    fn raw_block(&self, block_id: usize) -> Result<MemoryRegion>;
+
+    /// Get the KV block layout for this contiguous layout.
+    fn block_layout(&self) -> KvBlockLayout;
+}