feat: Transition to FullyContiguous Host and Disk layouts (#3090)

Signed-off-by: Olga Andreeva <oandreeva@nvidia.com> Signed-off-by: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Co-authored-by: oandreeva-nv <oandreeva-nv@nvidia.com>

feat: Transition to FullyContiguous Host and Disk layouts (#3090)
Signed-off-by: Olga Andreeva <oandreeva@nvidia.com> Signed-off-by: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Co-authored-by: oandreeva-nv <oandreeva-nv@nvidia.com>
d2e3b66e · Olga Andreeva · GitHub · a5e1d45e · d2e3b66e · d2e3b66e
Unverified Commit d2e3b66e authored Oct 01, 2025 by Olga Andreeva Committed by GitHub Oct 01, 2025
10 changed files
--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -343,4 +343,4 @@ RUN uv pip install maturin[patchelf] && \
    uv pip install --no-deps -e .

 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-CMD []
\ No newline at end of file
+CMD []
--- a/lib/bindings/python/rust/llm/block_manager/distributed.rs
+++ b/lib/bindings/python/rust/llm/block_manager/distributed.rs
@@ -9,4 +9,4 @@ mod worker;

 pub use leader::KvbmLeader;
 pub use utils::get_barrier_id_prefix;
-pub use worker::{KvbmWorker, VllmTensor};
+pub use worker::{KvbmWorker, PyLayoutType, VllmTensor};
--- a/lib/bindings/python/rust/llm/block_manager/distributed/worker.rs
+++ b/lib/bindings/python/rust/llm/block_manager/distributed/worker.rs
@@ -10,8 +10,44 @@ use llm_rs::block_manager::distributed::{
    BlockTransferHandler as RustBlockTransferHandler, KvbmWorker as KvbmWorkerImpl,
    KvbmWorkerConfig,
 };
+use llm_rs::block_manager::layout::LayoutType;
 use llm_rs::block_manager::storage::torch::{TorchDevice, TorchTensor};

+/// A wrapper around a layout type.
+/// This is used to convert between the Python and Rust layout types.
+#[pyclass(eq, eq_int)]
+#[derive(Clone, PartialEq, Eq)]
+pub enum PyLayoutType {
+    FullyContiguous,
+    LayerSeparate,
+}
+
+#[pymethods]
+impl PyLayoutType {
+    /// String representation of the layout type
+    fn __str__(&self) -> &'static str {
+        match self {
+            PyLayoutType::FullyContiguous => "FullyContiguous",
+            PyLayoutType::LayerSeparate => "LayerSeparate",
+        }
+    }
+
+    /// Representation for debugging
+    fn __repr__(&self) -> String {
+        format!("PyLayoutType.{}", self.__str__())
+    }
+}
+
+impl From<PyLayoutType> for LayoutType {
+    fn from(py_layout: PyLayoutType) -> Self {
+        match py_layout {
+            PyLayoutType::FullyContiguous => LayoutType::FullyContiguous,
+            // Layout (outer_contiguous vs block_contiguous) is auto-detected from tensor shapes
+            PyLayoutType::LayerSeparate => LayoutType::layer_separate_auto_default(),
+        }
+    }
+}
+
 /// A wrapper around a Torch tensor.
 /// We hold onto the py object to ensure it doesn't get GCed.
 #[derive(Clone, Debug)]
@@ -107,7 +143,7 @@ impl KvbmWorker {
 #[pymethods]
 impl KvbmWorker {
    #[new]
-    #[pyo3(signature = (num_device_blocks, page_size, tensors, device_id=0, dtype_width_bytes=2, drt=None, layout_blocking=false))]
+    #[pyo3(signature = (num_device_blocks, page_size, tensors, device_id=0, dtype_width_bytes=2, drt=None, layout_blocking=false, device_layout_type=None, host_layout_type=None, disk_layout_type=None))]
    fn new(
        num_device_blocks: usize,
        page_size: usize,
@@ -116,6 +152,9 @@ impl KvbmWorker {
        dtype_width_bytes: usize,
        drt: Option<DistributedRuntime>,
        layout_blocking: bool,
+        device_layout_type: Option<PyLayoutType>,
+        host_layout_type: Option<PyLayoutType>,
+        disk_layout_type: Option<PyLayoutType>,
    ) -> PyResult<Self> {
        let py_drt = drt.ok_or_else(|| {
            pyo3::exceptions::PyValueError::new_err("DistributedRuntime (drt) must be provided")
@@ -142,6 +181,21 @@ impl KvbmWorker {
            .device_id(device_id)
            .dtype_width_bytes(dtype_width_bytes)
            .barrier_id_prefix(barrier_id_prefix)
+            .device_layout_type(
+                device_layout_type
+                    .map(|py_layout| py_layout.into())
+                    .unwrap_or(LayoutType::FullyContiguous),
+            )
+            .host_layout_type(
+                host_layout_type
+                    .map(|py_layout| py_layout.into())
+                    .unwrap_or(LayoutType::FullyContiguous),
+            )
+            .disk_layout_type(
+                disk_layout_type
+                    .map(|py_layout| py_layout.into())
+                    .unwrap_or(LayoutType::FullyContiguous),
+            )
            .build()
            .map_err(to_pyerr)?;


--- a/lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_worker.rs
+++ b/lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_worker.rs
@@ -19,6 +19,7 @@ use crate::{

 use anyhow;
 use dynamo_llm::block_manager::distributed::{KvbmWorker, KvbmWorkerConfig};
+use dynamo_llm::block_manager::layout::LayoutType;
 use dynamo_llm::block_manager::metrics_kvbm::KvbmMetrics;
 use dynamo_llm::block_manager::storage::torch::TorchTensor;
 use dynamo_runtime::DistributedRuntime;
@@ -144,7 +145,9 @@ impl Worker for KvConnectorWorker {
            .tensors(kv_cache_tensors)
            .device_id(device_id)
            .dtype_width_bytes(dtype_width_bytes)
-            .is_fully_contiguous_layout(true)
+            .device_layout_type(LayoutType::FullyContiguous)
+            .host_layout_type(LayoutType::FullyContiguous)
+            .disk_layout_type(LayoutType::FullyContiguous)
            .barrier_id_prefix(get_barrier_id_prefix())
            .scheduler_client(Some(self.transfer_client.clone()))
            .build()?;

--- a/lib/bindings/python/rust/llm/block_manager/vllm/connector/worker.rs
+++ b/lib/bindings/python/rust/llm/block_manager/vllm/connector/worker.rs
@@ -18,8 +18,10 @@ use crate::{
 };
 use dynamo_runtime::metrics::prometheus_names::kvbm_connector;

+use crate::llm::block_manager::distributed::PyLayoutType;
 use anyhow;
 use dynamo_llm::block_manager::distributed::{KvbmWorker, KvbmWorkerConfig};
+use dynamo_llm::block_manager::layout::LayoutType;
 use dynamo_llm::block_manager::storage::torch::TorchTensor;
 use dynamo_runtime::DistributedRuntime;
 use dynamo_runtime::utils::task::CriticalTaskExecutionHandle;
@@ -33,6 +35,9 @@ pub trait Worker: Send + Sync {
        dtype_width_bytes: usize,
        kv_caches: Vec<(String, Arc<VllmTensor>)>,
        raw_event_handles: Vec<u64>,
+        device_layout_type: Option<LayoutType>,
+        host_layout_type: Option<LayoutType>,
+        disk_layout_type: Option<LayoutType>,
    ) -> anyhow::Result<()>;

    fn bind_connector_metadata(&mut self, metadata: Vec<u8>) -> anyhow::Result<()>;
@@ -133,6 +138,9 @@ impl Worker for KvConnectorWorker {
        dtype_width_bytes: usize,
        kv_caches: Vec<(String, Arc<VllmTensor>)>,
        raw_event_handles: Vec<u64>,
+        device_layout_type: Option<LayoutType>,
+        host_layout_type: Option<LayoutType>,
+        disk_layout_type: Option<LayoutType>,
    ) -> anyhow::Result<()> {
        if self.kvbm_worker.get().is_some() {
            tracing::warn!("kvbm worker already registered");
@@ -147,9 +155,16 @@ impl Worker for KvConnectorWorker {

        // Process kv_caches in layer execution order (already sorted by layer index)
        let mut vllm_tensors = Vec::new();
+        let mut first_tensor_shape: Option<Vec<usize>> = None;
+
        for (layer_name, vllm_tensor) in kv_caches {
            tracing::trace!("Registering KV cache layer: {layer_name}, tensor: {vllm_tensor:?}");

+            // Capture the shape of the first tensor for layout detection
+            if first_tensor_shape.is_none() {
+                first_tensor_shape = Some(vllm_tensor.shape());
+            }
+
            // Store for later lookup by name
            self.kv_cache_layers.push((layer_name, vllm_tensor.clone()));

@@ -159,6 +174,35 @@ impl Worker for KvConnectorWorker {

        self.layer_events = raw_event_handles;

+        // Auto-detect device layout type if not explicitly provided
+        let detected_device_layout_type = match device_layout_type {
+            Some(layout) => layout,
+            None => {
+                if let Some(ref shape) = first_tensor_shape {
+                    match LayoutType::layer_separate_auto(shape, num_device_blocks) {
+                        Ok(detected) => {
+                            tracing::info!(
+                                "Auto-detected device layout from tensor shape: {:?}",
+                                detected
+                            );
+                            detected
+                        }
+                        Err(e) => {
+                            tracing::warn!(
+                                "Failed to auto-detect layout from shape {:?}: {}. Using default.",
+                                shape,
+                                e
+                            );
+                            LayoutType::layer_separate_auto_default()
+                        }
+                    }
+                } else {
+                    tracing::warn!("No tensors available for layout detection. Using default.");
+                    LayoutType::layer_separate_auto_default()
+                }
+            }
+        };
+
        let config = KvbmWorkerConfig::builder()
            .drt(self.drt.clone())
            .num_device_blocks(num_device_blocks)
@@ -168,6 +212,9 @@ impl Worker for KvConnectorWorker {
            .dtype_width_bytes(dtype_width_bytes)
            .barrier_id_prefix(get_barrier_id_prefix())
            .scheduler_client(Some(self.transfer_client.clone()))
+            .device_layout_type(detected_device_layout_type)
+            .host_layout_type(host_layout_type.unwrap_or(LayoutType::FullyContiguous))
+            .disk_layout_type(disk_layout_type.unwrap_or(LayoutType::FullyContiguous))
            .build()?;

        let worker = self.drt.runtime().primary().block_on(async move {
@@ -416,6 +463,7 @@ impl PyKvConnectorWorker {
        Ok(Self { connector_worker })
    }

+    #[pyo3(signature = (num_device_blocks, page_size, device_id, dtype_width_bytes, kv_caches, raw_event_handles, device_layout_type=None, host_layout_type=None, disk_layout_type=None))]
    pub fn register_kv_caches(
        &mut self,
        num_device_blocks: usize,
@@ -424,6 +472,9 @@ impl PyKvConnectorWorker {
        dtype_width_bytes: usize,
        kv_caches: Vec<(String, Py<PyAny>)>,
        raw_event_handles: Vec<u64>,
+        device_layout_type: Option<PyLayoutType>,
+        host_layout_type: Option<PyLayoutType>,
+        disk_layout_type: Option<PyLayoutType>,
    ) -> PyResult<()> {
        // Convert Python tensors to Rust VllmTensor objects
        let mut rust_kv_caches = Vec::new();
@@ -440,6 +491,9 @@ impl PyKvConnectorWorker {
                dtype_width_bytes,
                rust_kv_caches,
                raw_event_handles,
+                device_layout_type.map(|py_layout| py_layout.into()),
+                host_layout_type.map(|py_layout| py_layout.into()),
+                disk_layout_type.map(|py_layout| py_layout.into()),
            )
            .map_err(to_pyerr)
    }

--- a/lib/llm/src/block_manager/block/transfer/cuda.rs
+++ b/lib/llm/src/block_manager/block/transfer/cuda.rs
@@ -683,4 +683,455 @@ mod tests {
            assert!(slice.iter().all(|&x| x == 42));
        }
    }
+
+    // ============================================================================
+    // CUDA TRANSFER TESTS FOR LAYOUT COMPATIBILITY
+    // ============================================================================
+
+    mod layout_transfer_tests {
+        use super::*;
+        use crate::block_manager::layout::{
+            FullyContiguous, GenericBlockLayout, LayerSeparate, LayoutConfig,
+        };
+
+        const TEST_NUM_BLOCKS: usize = 4;
+        const TEST_NUM_LAYERS: usize = 3;
+        const TEST_OUTER_DIM: usize = 2;
+        const TEST_PAGE_SIZE: usize = 8;
+        const TEST_INNER_DIM: usize = 16;
+        const TEST_DTYPE_WIDTH_BYTES: usize = 2;
+
+        fn create_test_config() -> LayoutConfig {
+            LayoutConfig {
+                num_blocks: TEST_NUM_BLOCKS,
+                num_layers: TEST_NUM_LAYERS,
+                outer_dim: TEST_OUTER_DIM,
+                page_size: TEST_PAGE_SIZE,
+                inner_dim: TEST_INNER_DIM,
+                alignment: 256, // GPU-friendly alignment
+                dtype_width_bytes: TEST_DTYPE_WIDTH_BYTES,
+            }
+        }
+
+        /// Test H2D transfers between FullyContiguous host and LayerSeparate device layouts
+        #[test]
+        fn test_h2d_fc_host_to_ls_device() {
+            let device_allocator = DeviceAllocator::default();
+            let pinned_allocator = PinnedAllocator::default();
+            let ctx = device_allocator.ctx().clone();
+            let stream = ctx.new_stream().unwrap();
+
+            let config = create_test_config();
+
+            // Create FullyContiguous host layout
+            let host_layout = FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
+
+            // Create LayerSeparate device layout
+            let device_layout = LayerSeparate::allocate(config, &device_allocator, true).unwrap();
+
+            // Test data transfer for each memory region
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let host_region = host_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let device_region = device_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        // Verify regions have same size
+                        assert_eq!(
+                            host_region.size(),
+                            device_region.size(),
+                            "Region size mismatch at ({}, {}, {})",
+                            block_idx,
+                            layer_idx,
+                            outer_idx
+                        );
+
+                        // Create test pattern
+                        let pattern =
+                            ((block_idx as u8) << 4) | ((layer_idx as u8) << 2) | (outer_idx as u8);
+
+                        // Fill host memory with pattern
+                        unsafe {
+                            let host_slice = std::slice::from_raw_parts_mut(
+                                host_region.addr() as *mut u8,
+                                host_region.size(),
+                            );
+                            host_slice.fill(pattern);
+                        }
+
+                        // Transfer H2D
+                        unsafe {
+                            cuda_memcpy_h2d(
+                                host_region.addr() as *const u8,
+                                device_region.addr() as *mut u8,
+                                host_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+                    }
+                }
+            }
+
+            stream.synchronize().unwrap();
+
+            // Verify transfers by copying back and checking patterns
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let host_region = host_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let device_region = device_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        let expected_pattern =
+                            ((block_idx as u8) << 4) | ((layer_idx as u8) << 2) | (outer_idx as u8);
+
+                        // Create temporary verification buffer
+                        let mut verify_buffer =
+                            pinned_allocator.allocate(host_region.size()).unwrap();
+
+                        // Copy back from device
+                        unsafe {
+                            cuda_memcpy_d2h(
+                                device_region.addr() as *const u8,
+                                verify_buffer.as_mut_ptr(),
+                                host_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+                        stream.synchronize().unwrap();
+
+                        // Verify pattern
+                        unsafe {
+                            let verify_slice = std::slice::from_raw_parts(
+                                verify_buffer.as_ptr(),
+                                host_region.size(),
+                            );
+                            assert!(
+                                verify_slice.iter().all(|&x| x == expected_pattern),
+                                "Pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
+                                block_idx,
+                                layer_idx,
+                                outer_idx,
+                                expected_pattern,
+                                &verify_slice[0..std::cmp::min(8, verify_slice.len())]
+                            );
+                        }
+                    }
+                }
+            }
+        }
+
+        /// Test D2H transfers from LayerSeparate device to FullyContiguous host
+        #[test]
+        fn test_d2h_ls_device_to_fc_host() {
+            let device_allocator = DeviceAllocator::default();
+            let pinned_allocator = PinnedAllocator::default();
+            let ctx = device_allocator.ctx().clone();
+            let stream = ctx.new_stream().unwrap();
+
+            let config = create_test_config();
+
+            // Create LayerSeparate device layout (block contiguous)
+            let device_layout =
+                LayerSeparate::allocate(config.clone(), &device_allocator, false).unwrap();
+
+            // Create FullyContiguous host layout
+            let host_layout = FullyContiguous::allocate(config, &pinned_allocator).unwrap();
+
+            // Initialize device memory with patterns using a temporary host buffer
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let device_region = device_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let pattern = ((block_idx as u8) << 4)
+                            | ((layer_idx as u8) << 2)
+                            | (outer_idx as u8)
+                            | 0x80;
+
+                        // Create temp buffer with pattern
+                        let mut temp_buffer =
+                            pinned_allocator.allocate(device_region.size()).unwrap();
+                        unsafe {
+                            let temp_slice = std::slice::from_raw_parts_mut(
+                                temp_buffer.as_mut_ptr(),
+                                device_region.size(),
+                            );
+                            temp_slice.fill(pattern);
+                        }
+
+                        // Copy pattern to device
+                        unsafe {
+                            cuda_memcpy_h2d(
+                                temp_buffer.as_ptr(),
+                                device_region.addr() as *mut u8,
+                                device_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+                    }
+                }
+            }
+            stream.synchronize().unwrap();
+
+            // Clear host layout
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let host_region = host_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        unsafe {
+                            let host_slice = std::slice::from_raw_parts_mut(
+                                host_region.addr() as *mut u8,
+                                host_region.size(),
+                            );
+                            host_slice.fill(0);
+                        }
+                    }
+                }
+            }
+
+            // Transfer D2H
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let device_region = device_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let host_region = host_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        unsafe {
+                            cuda_memcpy_d2h(
+                                device_region.addr() as *const u8,
+                                host_region.addr() as *mut u8,
+                                device_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+                    }
+                }
+            }
+            stream.synchronize().unwrap();
+
+            // Verify patterns in host layout
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let host_region = host_layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let expected_pattern = ((block_idx as u8) << 4)
+                            | ((layer_idx as u8) << 2)
+                            | (outer_idx as u8)
+                            | 0x80;
+
+                        unsafe {
+                            let host_slice = std::slice::from_raw_parts(
+                                host_region.addr() as *const u8,
+                                host_region.size(),
+                            );
+                            assert!(
+                                host_slice.iter().all(|&x| x == expected_pattern),
+                                "Pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
+                                block_idx,
+                                layer_idx,
+                                outer_idx,
+                                expected_pattern,
+                                &host_slice[0..std::cmp::min(8, host_slice.len())]
+                            );
+                        }
+                    }
+                }
+            }
+        }
+
+        /// Test bidirectional transfers with layout compatibility verification
+        #[test]
+        fn test_bidirectional_layout_transfers() {
+            let device_allocator = DeviceAllocator::default();
+            let pinned_allocator = PinnedAllocator::default();
+            let ctx = device_allocator.ctx().clone();
+            let stream = ctx.new_stream().unwrap();
+
+            let config = create_test_config();
+
+            // Create both layout types
+            let host_fc = FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
+            let device_ls_outer =
+                LayerSeparate::allocate(config.clone(), &device_allocator, true).unwrap();
+            let device_ls_block =
+                LayerSeparate::allocate(config, &device_allocator, false).unwrap();
+
+            // Test round-trip: Host FC -> Device LS (outer) -> Device LS (block) -> Host FC
+            for block_idx in 0..TEST_NUM_BLOCKS {
+                for layer_idx in 0..TEST_NUM_LAYERS {
+                    for outer_idx in 0..TEST_OUTER_DIM {
+                        let original_pattern = ((block_idx as u8) << 4)
+                            | ((layer_idx as u8) << 2)
+                            | (outer_idx as u8)
+                            | 0x40;
+
+                        // Step 1: Initialize host FC with pattern
+                        let host_region = host_fc
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        unsafe {
+                            let host_slice = std::slice::from_raw_parts_mut(
+                                host_region.addr() as *mut u8,
+                                host_region.size(),
+                            );
+                            host_slice.fill(original_pattern);
+                        }
+
+                        // Step 2: Transfer to device LS outer
+                        let device_outer_region = device_ls_outer
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        unsafe {
+                            cuda_memcpy_h2d(
+                                host_region.addr() as *const u8,
+                                device_outer_region.addr() as *mut u8,
+                                host_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+
+                        // Step 3: Transfer between device layouts (D2D)
+                        let device_block_region = device_ls_block
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        unsafe {
+                            cuda_memcpy_d2d(
+                                device_outer_region.addr() as *const u8,
+                                device_block_region.addr() as *mut u8,
+                                device_outer_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+
+                        stream.synchronize().unwrap();
+
+                        // Step 4: Clear host and transfer back
+                        unsafe {
+                            let host_slice = std::slice::from_raw_parts_mut(
+                                host_region.addr() as *mut u8,
+                                host_region.size(),
+                            );
+                            host_slice.fill(0);
+                        }
+
+                        unsafe {
+                            cuda_memcpy_d2h(
+                                device_block_region.addr() as *const u8,
+                                host_region.addr() as *mut u8,
+                                device_block_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+                        stream.synchronize().unwrap();
+
+                        // Step 5: Verify pattern survived the round trip
+                        unsafe {
+                            let host_slice = std::slice::from_raw_parts(
+                                host_region.addr() as *const u8,
+                                host_region.size(),
+                            );
+                            assert!(
+                                host_slice.iter().all(|&x| x == original_pattern),
+                                "Round-trip pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
+                                block_idx,
+                                layer_idx,
+                                outer_idx,
+                                original_pattern,
+                                &host_slice[0..std::cmp::min(8, host_slice.len())]
+                            );
+                        }
+                    }
+                }
+            }
+        }
+
+        /// Test transfer performance and alignment impact
+        #[test]
+        fn test_layout_transfer_alignment_performance() {
+            let device_allocator = DeviceAllocator::default();
+            let pinned_allocator = PinnedAllocator::default();
+            let ctx = device_allocator.ctx().clone();
+            let stream = ctx.new_stream().unwrap();
+
+            // Test different alignments
+            for alignment in [1, 64, 256, 512] {
+                let config = LayoutConfig {
+                    num_blocks: 2,
+                    num_layers: 2,
+                    outer_dim: 1,
+                    page_size: 1024,
+                    inner_dim: 256,
+                    alignment,
+                    dtype_width_bytes: 4,
+                };
+
+                let host_layout =
+                    FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
+                let device_layout = FullyContiguous::allocate(config, &device_allocator).unwrap();
+
+                // Measure transfer time (basic timing)
+                let start = std::time::Instant::now();
+
+                for block_idx in 0..2 {
+                    for layer_idx in 0..2 {
+                        let host_region =
+                            host_layout.memory_region(block_idx, layer_idx, 0).unwrap();
+                        let device_region = device_layout
+                            .memory_region(block_idx, layer_idx, 0)
+                            .unwrap();
+
+                        unsafe {
+                            cuda_memcpy_h2d(
+                                host_region.addr() as *const u8,
+                                device_region.addr() as *mut u8,
+                                host_region.size(),
+                                stream.as_ref(),
+                            )
+                            .unwrap();
+                        }
+                    }
+                }
+                stream.synchronize().unwrap();
+
+                let duration = start.elapsed();
+
+                // Verify alignment was applied correctly
+                let region = host_layout.memory_region(0, 0, 0).unwrap();
+                if alignment > 1 {
+                    assert_eq!(
+                        region.addr() % alignment,
+                        0,
+                        "Memory not aligned to {} bytes",
+                        alignment
+                    );
+                }
+
+                println!("Transfer with alignment {} took {:?}", alignment, duration);
+            }
+        }
+    }
 }
--- a/lib/llm/src/block_manager/distributed/worker.rs
+++ b/lib/llm/src/block_manager/distributed/worker.rs
@@ -106,8 +106,14 @@ pub struct KvbmWorkerConfig {
    #[builder(default = "2")]
    dtype_width_bytes: usize,

-    #[builder(default = false)]
-    is_fully_contiguous_layout: bool,
+    #[builder(default = "LayoutType::FullyContiguous")]
+    device_layout_type: LayoutType,
+
+    #[builder(default = "LayoutType::FullyContiguous")]
+    host_layout_type: LayoutType,
+
+    #[builder(default = "LayoutType::FullyContiguous")]
+    disk_layout_type: LayoutType,

    #[builder(default = "String::from(\"kvbm\")")]
    barrier_id_prefix: String,
@@ -161,53 +167,51 @@ impl KvbmWorker {
            )));
        }

-        let (layout_type, num_layers, outer_dim, inner_dim) = if !config.is_fully_contiguous_layout
-        {
-            let (outer_contiguous, outer_dim) = if shape[0] >= config.num_device_blocks {
-                (false, shape[1])
-            } else if shape[1] >= config.num_device_blocks {
-                (true, shape[0])
-            } else {
-                return Err(anyhow::anyhow!(format!(
-                    "Unsupported kv cache layout. Got shape: {:?}",
-                    shape
-                )));
-            };
-            let num_layers = device_tensors.len();
-            let inner_dim = shape[2..].iter().product::<usize>() / config.page_size;
-
-            tracing::info!(
-                "Inferred layout: num_layers={}, outer_dim={}, page_size={}, inner_dim={}",
-                device_tensors.len(),
-                outer_dim,
-                config.page_size,
-                inner_dim
-            );
-
-            (
-                LayoutType::LayerSeparate { outer_contiguous },
-                num_layers,
-                outer_dim,
-                inner_dim,
-            )
-        } else {
-            let num_layers = shape[1];
-            let outer_dim = shape[2];
-            let inner_dim = shape[3..].iter().product::<usize>() / config.page_size;
-            tracing::info!(
-                "Inferred layout: num_layers={}, outer_dim={}, page_size={}, inner_dim={}",
-                num_layers,
-                outer_dim,
-                config.page_size,
-                inner_dim
-            );
-
-            (
-                LayoutType::FullyContiguous,
-                num_layers,
-                outer_dim,
-                inner_dim,
-            )
+        let (layout_type, num_layers, outer_dim, inner_dim) = match config.device_layout_type {
+            LayoutType::FullyContiguous => {
+                let num_layers = shape[1];
+                let outer_dim = shape[2];
+                let inner_dim = shape[3..].iter().product::<usize>() / config.page_size;
+                tracing::info!(
+                    "Inferred layout: num_layers={}, outer_dim={}, page_size={}, inner_dim={}",
+                    num_layers,
+                    outer_dim,
+                    config.page_size,
+                    inner_dim
+                );
+
+                (
+                    LayoutType::FullyContiguous,
+                    num_layers,
+                    outer_dim,
+                    inner_dim,
+                )
+            }
+            LayoutType::LayerSeparate { outer_contiguous } => {
+                // Use the already-detected layout type from config (no re-detection needed)
+                let layout_type = config.device_layout_type;
+
+                // Extract outer_dim based on the provided outer_contiguous value
+                let outer_dim = if outer_contiguous {
+                    shape[0] // Outer contiguous: [outer_dim, n_blocks, ...]
+                } else {
+                    shape[1] // Block contiguous: [n_blocks, outer_dim, ...]
+                };
+
+                let num_layers = device_tensors.len();
+                let inner_dim = shape[2..].iter().product::<usize>() / config.page_size;
+
+                tracing::info!(
+                    "Inferred layout: num_layers={}, outer_dim={}, outer_contiguous={}, page_size={}, inner_dim={}",
+                    num_layers,
+                    outer_dim,
+                    outer_contiguous,
+                    config.page_size,
+                    inner_dim
+                );
+
+                (layout_type, num_layers, outer_dim, inner_dim)
+            }
        };

        let bytes_per_block =
@@ -556,7 +560,7 @@ impl KvbmWorker {
        device_layout: Box<dyn NixlLayout<StorageType = DeviceStorage>>,
        mut layout_builder: LayoutConfigBuilder,
        leader_data: KvbmLeaderData,
-        layout_type: LayoutType,
+        _layout_type: LayoutType,
        config: KvbmWorkerConfig,
        cancel_token: CancellationToken,
        handler_tx: oneshot::Sender<BlockTransferHandler>,
@@ -606,7 +610,7 @@ impl KvbmWorker {
            let host_layout = layout_builder
                .num_blocks(leader_data.num_host_blocks)
                .build()?
-                .allocate_layout(layout_type, host_allocator)?;
+                .allocate_layout(config.host_layout_type, host_allocator)?;

            Some(Self::make_layout::<_, BasicMetadata>(
                host_layout,
@@ -623,7 +627,7 @@ impl KvbmWorker {
            let disk_layout = layout_builder
                .num_blocks(leader_data.num_disk_blocks)
                .build()?
-                .allocate_layout(layout_type, disk_allocator)?;
+                .allocate_layout(config.disk_layout_type, disk_allocator)?;

            Some(Self::make_layout::<_, BasicMetadata>(
                disk_layout,

--- a/lib/llm/src/block_manager/layout.rs
+++ b/lib/llm/src/block_manager/layout.rs
@@ -102,7 +102,8 @@
 // pub mod distributed;

 pub mod nixl;
-mod utils;
+/// Utility functions for layout validation and verification
+pub mod utils;

 use utils::*;

@@ -150,15 +151,47 @@ pub enum LayoutType {
    FullyContiguous,

    /// All layers are stored separately.
-    /// If outer_contiguous is true, for each layer: [outer_dim, n_blocks, ...]
-    /// If outer_contiguous is false, for each layer: [n_blocks, outer_dim, ...]
+    /// The outer_contiguous field is auto-detected from tensor shapes when not explicitly set.
+    /// If outer_contiguous: for each layer: [outer_dim, n_blocks, ...]
+    /// If !outer_contiguous: for each layer: [n_blocks, outer_dim, ...]
    /// When outer_dim is 1, these two modes are equivalent.
    LayerSeparate {
-        /// If true, the outer dimension is contiguous. Otherwise, the block dimension is contiguous.
+        /// If true, the outer dimension is contiguous. Auto-detected from tensor shapes when possible.
        outer_contiguous: bool,
    },
 }

+impl LayoutType {
+    /// Create a LayerSeparate layout type with auto-detection based on tensor shapes
+    pub fn layer_separate_auto(shape: &[usize], num_device_blocks: usize) -> anyhow::Result<Self> {
+        let outer_contiguous = if shape[0] >= num_device_blocks {
+            false // Block contiguous: [n_blocks, outer_dim, ...]
+        } else if shape[1] >= num_device_blocks {
+            true // Outer contiguous: [outer_dim, n_blocks, ...]
+        } else {
+            return Err(anyhow::anyhow!(format!(
+                "Unsupported kv cache layout. Got shape: {:?}",
+                shape
+            )));
+        };
+
+        Ok(LayoutType::LayerSeparate { outer_contiguous })
+    }
+
+    /// Create a LayerSeparate layout type with default auto-detection (defaults to outer_contiguous=true)
+    /// Use this when tensor shapes are not available
+    pub fn layer_separate_auto_default() -> Self {
+        LayoutType::LayerSeparate {
+            outer_contiguous: true,
+        }
+    }
+
+    /// Create a LayerSeparate layout type with explicit outer_contiguous setting
+    pub fn layer_separate(outer_contiguous: bool) -> Self {
+        LayoutType::LayerSeparate { outer_contiguous }
+    }
+}
+
 /// Local Memory Region
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Getters)]
 pub struct LocalMemoryRegion {
@@ -545,6 +578,80 @@ impl<S: Storage> BlockLayoutConfig for FullyContiguous<S> {
    }
 }

+impl<S: Storage> FullyContiguous<S> {
+    /// Verify memory region addressing is correct for this layout
+    pub fn verify_memory_regions(&self) -> Result<(), LayoutError> {
+        use crate::block_manager::layout::utils::WorkerLayoutVerifier;
+
+        let mut verifier = WorkerLayoutVerifier::new();
+        let results = verifier.verify_layout_consistency(self)?;
+
+        if verifier.has_critical_mismatches() {
+            tracing::error!(
+                "FullyContiguous layout verification failed: {} regions checked, {} size mismatches",
+                results.len(),
+                results.iter().filter(|r| !r.size_matches).count()
+            );
+            return Err(LayoutError::InvalidConfig(
+                "Memory region verification failed".to_string(),
+            ));
+        }
+
+        tracing::debug!(
+            "FullyContiguous layout verification passed: {} regions checked",
+            results.len()
+        );
+        Ok(())
+    }
+
+    /// Get expected memory address for a region (for testing/verification)
+    pub fn expected_memory_address(
+        &self,
+        block_idx: usize,
+        layer_idx: usize,
+        outer_idx: usize,
+    ) -> Result<usize, LayoutError> {
+        validate_indices(&self.config, block_idx, layer_idx, outer_idx)?;
+
+        let aligned_start_addr = self.storage.addr() as usize + self.base_offset;
+        let block_offset = block_idx * self.config.block_stride_in_bytes;
+        let layer_offset = layer_idx * self.config.layer_stride_in_bytes;
+        let outer_offset = outer_idx * self.config.outer_dim_stride_in_bytes;
+
+        Ok(aligned_start_addr + block_offset + layer_offset + outer_offset)
+    }
+
+    /// Verify a specific memory region matches expected calculations
+    pub fn verify_memory_region(
+        &self,
+        block_idx: usize,
+        layer_idx: usize,
+        outer_idx: usize,
+    ) -> Result<bool, LayoutError> {
+        let actual_region = self.memory_region(block_idx, layer_idx, outer_idx)?;
+        let expected_addr = self.expected_memory_address(block_idx, layer_idx, outer_idx)?;
+        let expected_size = self.config.memory_region_size;
+
+        let addr_matches = actual_region.addr == expected_addr;
+        let size_matches = actual_region.size == expected_size;
+
+        if !addr_matches || !size_matches {
+            tracing::warn!(
+                "Memory region mismatch at ({}, {}, {}): addr {} vs {} (expected), size {} vs {} (expected)",
+                block_idx,
+                layer_idx,
+                outer_idx,
+                actual_region.addr,
+                expected_addr,
+                actual_region.size,
+                expected_size
+            );
+        }
+
+        Ok(addr_matches && size_matches)
+    }
+}
+
 /// Configuration for layer-separated layouts.
 /// This is used in vLLM, where every layer has its own allocation.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -794,6 +901,116 @@ impl<S: Storage> BlockLayoutConfig for LayerSeparate<S> {
    }
 }

+impl<S: Storage> LayerSeparate<S> {
+    /// Verify memory region addressing is correct for this layout
+    pub fn verify_memory_regions(&self) -> Result<(), LayoutError> {
+        use crate::block_manager::layout::utils::WorkerLayoutVerifier;
+
+        let mut verifier = WorkerLayoutVerifier::new();
+        let results = verifier.verify_layout_consistency(self)?;
+
+        if verifier.has_critical_mismatches() {
+            tracing::error!(
+                "LayerSeparate layout verification failed: {} regions checked, {} size mismatches",
+                results.len(),
+                results.iter().filter(|r| !r.size_matches).count()
+            );
+            return Err(LayoutError::InvalidConfig(
+                "Memory region verification failed".to_string(),
+            ));
+        }
+
+        tracing::debug!(
+            "LayerSeparate layout verification passed: {} regions checked",
+            results.len()
+        );
+        Ok(())
+    }
+
+    /// Get expected memory address for a region (for testing/verification)
+    pub fn expected_memory_address(
+        &self,
+        block_idx: usize,
+        layer_idx: usize,
+        outer_idx: usize,
+    ) -> Result<usize, LayoutError> {
+        validate_indices(&self.config, block_idx, layer_idx, outer_idx)?;
+
+        let aligned_start_addr =
+            self.storages[layer_idx].addr() as usize + self.base_offsets[layer_idx];
+        let block_offset = block_idx * self.config.block_stride_in_bytes;
+        let outer_offset = outer_idx * self.config.outer_dim_stride_in_bytes;
+
+        Ok(aligned_start_addr + block_offset + outer_offset)
+    }
+
+    /// Verify a specific memory region matches expected calculations
+    pub fn verify_memory_region(
+        &self,
+        block_idx: usize,
+        layer_idx: usize,
+        outer_idx: usize,
+    ) -> Result<bool, LayoutError> {
+        let actual_region = self.memory_region(block_idx, layer_idx, outer_idx)?;
+        let expected_addr = self.expected_memory_address(block_idx, layer_idx, outer_idx)?;
+        let expected_size = self.config.memory_region_size;
+
+        let addr_matches = actual_region.addr == expected_addr;
+        let size_matches = actual_region.size == expected_size;
+
+        if !addr_matches || !size_matches {
+            tracing::warn!(
+                "LayerSeparate memory region mismatch at ({}, {}, {}): addr {} vs {} (expected), size {} vs {} (expected)",
+                block_idx,
+                layer_idx,
+                outer_idx,
+                actual_region.addr,
+                expected_addr,
+                actual_region.size,
+                expected_size
+            );
+        }
+
+        Ok(addr_matches && size_matches)
+    }
+
+    /// Verify all storage regions are properly aligned and sized
+    pub fn verify_storage_alignment(&self) -> Result<(), LayoutError> {
+        let alignment = self.config.inner.alignment;
+
+        for (layer_idx, storage) in self.storages.iter().enumerate() {
+            let storage_addr = storage.addr() as usize;
+            let base_offset = self.base_offsets[layer_idx];
+            let aligned_addr = storage_addr + base_offset;
+
+            // Check alignment
+            if alignment > 1 && !aligned_addr.is_multiple_of(alignment) {
+                return Err(LayoutError::InvalidConfig(format!(
+                    "Layer {} storage not properly aligned: addr {} + offset {} = {} is not {} byte aligned",
+                    layer_idx, storage_addr, base_offset, aligned_addr, alignment
+                )));
+            }
+
+            // Check storage size
+            let required_size = self.config.layout_data_bytes + base_offset;
+            if storage.size() < required_size {
+                return Err(LayoutError::InvalidConfig(format!(
+                    "Layer {} storage too small: {} bytes available, {} bytes required",
+                    layer_idx,
+                    storage.size(),
+                    required_size
+                )));
+            }
+        }
+
+        tracing::debug!(
+            "LayerSeparate storage alignment verification passed for {} layers",
+            self.storages.len()
+        );
+        Ok(())
+    }
+}
+
 #[allow(missing_docs)]
 #[cfg(test)]
 pub mod tests {
@@ -1470,6 +1687,315 @@ pub mod tests {
        assert_eq!(layout_block.layout_data_bytes(), expected_block);
    }

+    // ============================================================================
+    // COMPREHENSIVE LAYOUT CORRECTNESS TESTS
+    // ============================================================================
+
+    /// Test suite for layout correctness across different configurations
+    mod layout_correctness_tests {
+        use super::*;
+        use std::collections::HashSet;
+
+        /// Verify that memory regions don't overlap within the same layout
+        #[test]
+        fn test_fc_memory_regions_no_overlap() {
+            let layout = setup_layout(None).expect("Layout setup failed");
+            let mut used_ranges = Vec::new();
+
+            // Collect all memory regions
+            for block_idx in 0..NUM_BLOCKS {
+                for layer_idx in 0..NUM_LAYERS {
+                    for outer_idx in 0..OUTER_DIM {
+                        let region = layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        used_ranges.push((region.addr, region.addr + region.size));
+                    }
+                }
+            }
+
+            // Check for overlaps
+            for i in 0..used_ranges.len() {
+                for j in (i + 1)..used_ranges.len() {
+                    let (start_i, end_i) = used_ranges[i];
+                    let (start_j, end_j) = used_ranges[j];
+
+                    let overlaps = !(end_i <= start_j || end_j <= start_i);
+                    assert!(
+                        !overlaps,
+                        "Memory regions overlap: [{}, {}) and [{}, {})",
+                        start_i, end_i, start_j, end_j
+                    );
+                }
+            }
+        }
+
+        #[test]
+        fn test_ls_memory_regions_no_overlap() {
+            let layout = setup_layer_separate_layout(None, true).expect("Layout setup failed");
+
+            // For each layer, collect memory regions and check for overlaps
+            for layer_idx in 0..NUM_LAYERS {
+                let mut used_ranges = Vec::new();
+
+                for block_idx in 0..NUM_BLOCKS {
+                    for outer_idx in 0..OUTER_DIM {
+                        let region = layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        used_ranges.push((region.addr, region.addr + region.size));
+                    }
+                }
+
+                // Check for overlaps within this layer
+                for i in 0..used_ranges.len() {
+                    for j in (i + 1)..used_ranges.len() {
+                        let (start_i, end_i) = used_ranges[i];
+                        let (start_j, end_j) = used_ranges[j];
+
+                        let overlaps = !(end_i <= start_j || end_j <= start_i);
+                        assert!(
+                            !overlaps,
+                            "Memory regions overlap in layer {}: [{}, {}) and [{}, {})",
+                            layer_idx, start_i, end_i, start_j, end_j
+                        );
+                    }
+                }
+            }
+        }
+
+        /// Test that memory regions are properly aligned
+        #[test]
+        fn test_fc_memory_alignment_correctness() {
+            const ALIGNMENT: usize = 256;
+            let config = LayoutConfig {
+                num_blocks: NUM_BLOCKS,
+                num_layers: NUM_LAYERS,
+                outer_dim: OUTER_DIM,
+                page_size: PAGE_SIZE,
+                inner_dim: INNER_DIM,
+                alignment: ALIGNMENT,
+                dtype_width_bytes: DTYPE_WIDTH_BYTES,
+            };
+
+            let layout = FullyContiguous::allocate(config, &SystemAllocator).unwrap();
+
+            // Test all block starting addresses are aligned
+            for block_idx in 0..NUM_BLOCKS {
+                let region = layout.memory_region(block_idx, 0, 0).unwrap();
+                assert_eq!(
+                    region.addr % ALIGNMENT,
+                    0,
+                    "Block {} is not aligned to {} bytes",
+                    block_idx,
+                    ALIGNMENT
+                );
+            }
+        }
+
+        /// Test data integrity patterns across layout types
+        #[test]
+        fn test_layout_data_integrity_patterns() {
+            init_logging();
+
+            // Test pattern: write unique values to each memory region and verify they don't interfere
+            let fc_layout = setup_layout(None).expect("FC Layout setup failed");
+            let ls_layout =
+                setup_layer_separate_layout(None, true).expect("LS Layout setup failed");
+
+            // For FullyContiguous layout
+            test_data_integrity_for_layout(&fc_layout, "FullyContiguous");
+
+            // For LayerSeparate layout
+            test_data_integrity_for_layout(&ls_layout, "LayerSeparate");
+        }
+
+        fn test_data_integrity_for_layout<L: GenericBlockLayout>(layout: &L, layout_name: &str) {
+            let mut written_patterns = HashSet::new();
+
+            // Write unique patterns to each memory region
+            for block_idx in 0..layout.num_blocks() {
+                for layer_idx in 0..layout.num_layers() {
+                    for outer_idx in 0..layout.outer_dim() {
+                        let region = layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        // Create unique pattern for this location
+                        let pattern = (block_idx << 16) | (layer_idx << 8) | outer_idx;
+
+                        // Verify we haven't used this pattern before
+                        assert!(
+                            !written_patterns.contains(&pattern),
+                            "Duplicate pattern {} in {} layout",
+                            pattern,
+                            layout_name
+                        );
+                        written_patterns.insert(pattern);
+
+                        // Verify the region has expected size
+                        let expected_size = layout.page_size()
+                            * layout.inner_dim()
+                            * layout.layout_config().dtype_width_bytes;
+                        assert_eq!(
+                            region.size, expected_size,
+                            "Region size mismatch in {} layout at ({}, {}, {})",
+                            layout_name, block_idx, layer_idx, outer_idx
+                        );
+                    }
+                }
+            }
+        }
+
+        /// Test stride calculations across different layout types
+        #[test]
+        fn test_layout_stride_correctness() {
+            let fc_layout = setup_layout(None).expect("FC Layout setup failed");
+            let ls_outer = setup_layer_separate_layout(None, true).expect("LS outer setup failed");
+            let ls_block = setup_layer_separate_layout(None, false).expect("LS block setup failed");
+
+            // Test FullyContiguous strides
+            test_fc_stride_correctness(&fc_layout);
+
+            // Test LayerSeparate strides
+            test_ls_stride_correctness(&ls_outer, true);
+            test_ls_stride_correctness(&ls_block, false);
+        }
+
+        fn test_fc_stride_correctness(layout: &FullyContiguous<NullDeviceStorage>) {
+            let memory_region_size = PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES;
+
+            // Test layer stride
+            let region_0_0_0 = layout.memory_region(0, 0, 0).unwrap();
+            let region_0_1_0 = layout.memory_region(0, 1, 0).unwrap();
+            let layer_stride = region_0_1_0.addr - region_0_0_0.addr;
+            assert_eq!(layer_stride, memory_region_size * OUTER_DIM);
+
+            // Test outer dimension stride
+            let region_0_0_1 = layout.memory_region(0, 0, 1).unwrap();
+            let outer_stride = region_0_0_1.addr - region_0_0_0.addr;
+            assert_eq!(outer_stride, memory_region_size);
+
+            // Test block stride
+            let region_1_0_0 = layout.memory_region(1, 0, 0).unwrap();
+            let block_stride = region_1_0_0.addr - region_0_0_0.addr;
+            assert_eq!(block_stride, memory_region_size * OUTER_DIM * NUM_LAYERS);
+        }
+
+        fn test_ls_stride_correctness(
+            layout: &LayerSeparate<NullDeviceStorage>,
+            is_outer_contiguous: bool,
+        ) {
+            let memory_region_size = PAGE_SIZE * INNER_DIM * DTYPE_WIDTH_BYTES;
+
+            // Test strides within the same layer
+            let region_0_0_0 = layout.memory_region(0, 0, 0).unwrap();
+            let region_1_0_0 = layout.memory_region(1, 0, 0).unwrap();
+            let region_0_0_1 = layout.memory_region(0, 0, 1).unwrap();
+
+            let block_stride = region_1_0_0.addr - region_0_0_0.addr;
+            let outer_stride = region_0_0_1.addr - region_0_0_0.addr;
+
+            if is_outer_contiguous {
+                // In outer_contiguous mode: [outer_dim, n_blocks, ...]
+                assert_eq!(block_stride, memory_region_size);
+                assert_eq!(outer_stride, memory_region_size * NUM_BLOCKS);
+            } else {
+                // In block_contiguous mode: [n_blocks, outer_dim, ...]
+                assert_eq!(block_stride, memory_region_size * OUTER_DIM);
+                assert_eq!(outer_stride, memory_region_size);
+            }
+        }
+
+        /// Test layout compatibility for mixed scenarios
+        #[test]
+        fn test_layout_compatibility_scenarios() {
+            init_logging();
+
+            // Scenario: FullyContiguous host buffer with LayerSeparate device
+            let host_fc = setup_layout(None).expect("Host FC setup failed");
+            let device_ls =
+                setup_layer_separate_layout(None, true).expect("Device LS setup failed");
+
+            // Verify they have compatible dimensions
+            assert_eq!(host_fc.num_blocks(), device_ls.num_blocks());
+            assert_eq!(host_fc.num_layers(), device_ls.num_layers());
+            assert_eq!(host_fc.outer_dim(), device_ls.outer_dim());
+            assert_eq!(host_fc.page_size(), device_ls.page_size());
+            assert_eq!(host_fc.inner_dim(), device_ls.inner_dim());
+
+            // Verify memory region sizes are compatible
+            for block_idx in 0..host_fc.num_blocks() {
+                for layer_idx in 0..host_fc.num_layers() {
+                    for outer_idx in 0..host_fc.outer_dim() {
+                        let host_region = host_fc
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let device_region = device_ls
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        assert_eq!(
+                            host_region.size, device_region.size,
+                            "Memory region size mismatch at ({}, {}, {})",
+                            block_idx, layer_idx, outer_idx
+                        );
+                    }
+                }
+            }
+        }
+
+        /// Test edge cases and boundary conditions
+        #[test]
+        fn test_layout_edge_cases() {
+            // Test with minimal configuration
+            let minimal_config = LayoutConfig {
+                num_blocks: 1,
+                num_layers: 1,
+                outer_dim: 1,
+                page_size: 1,
+                inner_dim: 1,
+                alignment: 1,
+                dtype_width_bytes: 1,
+            };
+
+            let minimal_fc =
+                FullyContiguous::allocate(minimal_config.clone(), &SystemAllocator).unwrap();
+            let region = minimal_fc.memory_region(0, 0, 0).unwrap();
+            assert_eq!(region.size, 1);
+
+            // Test with maximum supported outer_dim
+            let max_outer_config = LayoutConfig {
+                num_blocks: 2,
+                num_layers: 2,
+                outer_dim: 2, // Maximum supported
+                page_size: 4,
+                inner_dim: 4,
+                alignment: 1,
+                dtype_width_bytes: 2,
+            };
+
+            let max_outer_fc =
+                FullyContiguous::allocate(max_outer_config, &SystemAllocator).unwrap();
+
+            // Verify all combinations are accessible
+            for block_idx in 0..2 {
+                for layer_idx in 0..2 {
+                    for outer_idx in 0..2 {
+                        let region = max_outer_fc.memory_region(block_idx, layer_idx, outer_idx);
+                        assert!(
+                            region.is_ok(),
+                            "Failed to access region ({}, {}, {})",
+                            block_idx,
+                            layer_idx,
+                            outer_idx
+                        );
+                    }
+                }
+            }
+        }
+    }
+
    #[test]
    fn test_ls_allocate() {
        let config = LayoutConfig {
@@ -1485,4 +2011,206 @@ pub mod tests {
        LayerSeparate::allocate(config, &NullDeviceAllocator, true)
            .expect("Layout allocation failed");
    }
+
+    // ============================================================================
+    // MEMORY REGION VERIFICATION TESTS
+    // ============================================================================
+
+    mod memory_region_verification_tests {
+        use super::*;
+
+        #[test]
+        fn test_fc_memory_region_verification() {
+            let layout = setup_layout(None).expect("Layout setup failed");
+
+            // Test overall verification
+            assert!(
+                layout.verify_memory_regions().is_ok(),
+                "Memory region verification should pass"
+            );
+
+            // Test individual region verification
+            for block_idx in 0..NUM_BLOCKS {
+                for layer_idx in 0..NUM_LAYERS {
+                    for outer_idx in 0..OUTER_DIM {
+                        let matches = layout
+                            .verify_memory_region(block_idx, layer_idx, outer_idx)
+                            .expect("Memory region verification failed");
+                        assert!(
+                            matches,
+                            "Memory region ({}, {}, {}) should match expected calculations",
+                            block_idx, layer_idx, outer_idx
+                        );
+                    }
+                }
+            }
+        }
+
+        #[test]
+        fn test_fc_expected_address_calculation() {
+            let layout = setup_layout(None).expect("Layout setup failed");
+
+            // Test that expected addresses match actual addresses
+            for block_idx in 0..NUM_BLOCKS {
+                for layer_idx in 0..NUM_LAYERS {
+                    for outer_idx in 0..OUTER_DIM {
+                        let actual_region = layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let expected_addr = layout
+                            .expected_memory_address(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        assert_eq!(
+                            actual_region.addr, expected_addr,
+                            "Address mismatch at ({}, {}, {})",
+                            block_idx, layer_idx, outer_idx
+                        );
+                    }
+                }
+            }
+        }
+
+        #[test]
+        fn test_ls_memory_region_verification() {
+            let layout = setup_layer_separate_layout(None, true).expect("Layout setup failed");
+
+            // Test overall verification
+            assert!(
+                layout.verify_memory_regions().is_ok(),
+                "LayerSeparate memory region verification should pass"
+            );
+
+            // Test storage alignment verification
+            assert!(
+                layout.verify_storage_alignment().is_ok(),
+                "LayerSeparate storage alignment verification should pass"
+            );
+
+            // Test individual region verification
+            for block_idx in 0..NUM_BLOCKS {
+                for layer_idx in 0..NUM_LAYERS {
+                    for outer_idx in 0..OUTER_DIM {
+                        let matches = layout
+                            .verify_memory_region(block_idx, layer_idx, outer_idx)
+                            .expect("Memory region verification failed");
+                        assert!(
+                            matches,
+                            "LayerSeparate memory region ({}, {}, {}) should match expected calculations",
+                            block_idx, layer_idx, outer_idx
+                        );
+                    }
+                }
+            }
+        }
+
+        #[test]
+        fn test_ls_expected_address_calculation() {
+            let layout = setup_layer_separate_layout(None, false).expect("Layout setup failed");
+
+            // Test that expected addresses match actual addresses for block-contiguous layout
+            for block_idx in 0..NUM_BLOCKS {
+                for layer_idx in 0..NUM_LAYERS {
+                    for outer_idx in 0..OUTER_DIM {
+                        let actual_region = layout
+                            .memory_region(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+                        let expected_addr = layout
+                            .expected_memory_address(block_idx, layer_idx, outer_idx)
+                            .unwrap();
+
+                        assert_eq!(
+                            actual_region.addr, expected_addr,
+                            "LayerSeparate address mismatch at ({}, {}, {})",
+                            block_idx, layer_idx, outer_idx
+                        );
+                    }
+                }
+            }
+        }
+
+        #[test]
+        fn test_memory_region_verification_with_alignment() {
+            const ALIGNMENT: usize = 512;
+            let config = LayoutConfig {
+                num_blocks: NUM_BLOCKS,
+                num_layers: NUM_LAYERS,
+                outer_dim: OUTER_DIM,
+                page_size: PAGE_SIZE,
+                inner_dim: INNER_DIM,
+                alignment: ALIGNMENT,
+                dtype_width_bytes: DTYPE_WIDTH_BYTES,
+            };
+
+            let fc_layout = FullyContiguous::allocate(config.clone(), &SystemAllocator).unwrap();
+            let ls_layout = LayerSeparate::allocate(config, &NullDeviceAllocator, true).unwrap();
+
+            // Both layouts should pass verification with alignment
+            assert!(
+                fc_layout.verify_memory_regions().is_ok(),
+                "FullyContiguous with alignment should pass verification"
+            );
+
+            assert!(
+                ls_layout.verify_memory_regions().is_ok(),
+                "LayerSeparate with alignment should pass verification"
+            );
+
+            assert!(
+                ls_layout.verify_storage_alignment().is_ok(),
+                "LayerSeparate storage alignment should pass verification"
+            );
+        }
+
+        #[test]
+        fn test_cross_layout_address_compatibility() {
+            let config = LayoutConfig {
+                num_blocks: 2,
+                num_layers: 2,
+                outer_dim: 1,
+                page_size: 8,
+                inner_dim: 16,
+                alignment: 1,
+                dtype_width_bytes: 2,
+            };
+
+            let fc_layout = FullyContiguous::allocate(config.clone(), &SystemAllocator).unwrap();
+            let ls_layout = LayerSeparate::allocate(config, &NullDeviceAllocator, true).unwrap();
+
+            // Both layouts should have compatible memory region sizes
+            for block_idx in 0..2 {
+                for layer_idx in 0..2 {
+                    let fc_region = fc_layout.memory_region(block_idx, layer_idx, 0).unwrap();
+                    let ls_region = ls_layout.memory_region(block_idx, layer_idx, 0).unwrap();
+
+                    assert_eq!(
+                        fc_region.size, ls_region.size,
+                        "Memory region sizes should be compatible between layouts at ({}, {})",
+                        block_idx, layer_idx
+                    );
+                }
+            }
+        }
+
+        #[test]
+        fn test_memory_region_bounds_checking() {
+            let layout = setup_layout(None).expect("Layout setup failed");
+
+            // Test invalid indices
+            assert!(
+                layout.verify_memory_region(NUM_BLOCKS, 0, 0).is_err(),
+                "Should fail for invalid block index"
+            );
+
+            assert!(
+                layout.verify_memory_region(0, NUM_LAYERS, 0).is_err(),
+                "Should fail for invalid layer index"
+            );
+
+            assert!(
+                layout.verify_memory_region(0, 0, OUTER_DIM).is_err(),
+                "Should fail for invalid outer index"
+            );
+        }
+    }
 }
--- a/lib/llm/src/block_manager/layout/utils.rs
+++ b/lib/llm/src/block_manager/layout/utils.rs
@@ -44,21 +44,63 @@ pub struct LayoutVerificationStats {
    pub successful_verifications: usize,
 }

+/// A utility for verifying the consistency and correctness of memory layout implementations.
+///
+/// This verifier systematically checks all memory regions within a layout to ensure:
+/// - Memory addresses are calculated correctly
+/// - Memory region sizes match expected values
+/// - Layout configuration is internally consistent
+///
+/// The verifier maintains statistics about verification results and can identify
+/// critical mismatches that indicate layout implementation errors.
 #[derive(Debug)]
 #[allow(dead_code)]
 pub struct WorkerLayoutVerifier {
    stats: LayoutVerificationStats,
 }

+impl Default for WorkerLayoutVerifier {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 #[allow(dead_code)]
 impl WorkerLayoutVerifier {
-    // Constructor: Start with clean slate
+    /// Creates a new layout verifier with clean statistics.
+    ///
+    /// The verifier starts with zero counts for all verification metrics
+    /// and is ready to verify layout consistency.
    pub fn new() -> Self {
        Self {
            stats: LayoutVerificationStats::default(),
        }
    }

+    /// Verifies the consistency of all memory regions in a layout.
+    ///
+    /// This is the main orchestrator method that systematically checks every memory region
+    /// in the layout to ensure consistency. It resets the internal statistics and then
+    /// iterates through all valid combinations of block, layer, and outer dimension indices.
+    ///
+    /// # Arguments
+    ///
+    /// * `layout` - The layout to verify
+    ///
+    /// # Returns
+    ///
+    /// A vector of verification results for each memory region, or an error if
+    /// verification fails for any region.
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let mut verifier = WorkerLayoutVerifier::new();
+    /// let results = verifier.verify_layout_consistency(&layout)?;
+    /// if verifier.has_critical_mismatches() {
+    ///     // Handle verification failures
+    /// }
+    /// ```
    pub fn verify_layout_consistency<L: GenericBlockLayout>(
        &mut self,
        layout: &L,
@@ -85,6 +127,22 @@ impl WorkerLayoutVerifier {
        Ok(results)
    }

+    /// Verifies a specific memory region within a layout.
+    ///
+    /// This method checks a single memory region identified by the provided indices
+    /// and compares the actual memory address and size against expected values.
+    ///
+    /// # Arguments
+    ///
+    /// * `layout` - The layout containing the memory region to verify
+    /// * `block_idx` - The block index (must be < layout.num_blocks())
+    /// * `layer_idx` - The layer index (must be < layout.num_layers())
+    /// * `outer_idx` - The outer dimension index (must be < layout.outer_dim())
+    ///
+    /// # Returns
+    ///
+    /// A verification result containing the comparison between expected and actual
+    /// values, or an error if the indices are invalid or layout access fails.
    pub fn verify_memory_region<L: GenericBlockLayout>(
        &mut self,
        layout: &L,
@@ -125,6 +183,15 @@ impl WorkerLayoutVerifier {
        }
    }

+    /// Checks if any critical mismatches were found during verification.
+    ///
+    /// Critical mismatches are currently defined as size mismatches, which indicate
+    /// that the layout is calculating memory region sizes incorrectly. This is
+    /// considered more critical than address mismatches as it affects memory safety.
+    ///
+    /// # Returns
+    ///
+    /// `true` if any memory regions had size mismatches, `false` otherwise.
    pub fn has_critical_mismatches(&self) -> bool {
        self.stats.size_mismatches > 0
    }
@@ -144,7 +211,12 @@ pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> {

 /// Helper to align a value up to the nearest multiple of alignment.
 /// Alignment must be a power of 2.
+#[inline(always)]
 pub fn align_up(value: usize, alignment: usize) -> usize {
+    debug_assert!(
+        alignment.is_power_of_two(),
+        "Alignment must be a power of 2"
+    );
    (value + alignment - 1) & !(alignment - 1)
 }

@@ -191,6 +263,7 @@ pub fn validate_storage<S: Storage, C: BlockLayoutConfig>(
    Ok(base_offset)
 }

+/// Validate that the provided indices are within bounds for the given layout configuration
 pub fn validate_indices<C: BlockLayoutConfig>(
    config: &C,
    block_idx: usize,

--- a/lib/llm/src/block_manager/offload.rs
+++ b/lib/llm/src/block_manager/offload.rs
@@ -1431,6 +1431,573 @@ mod tests {
        Ok(())
    }

+    // ============================================================================
+    // IMPROVED DISK TESTS FOR GDS COMPATIBILITY
+    // ============================================================================
+
+    mod gds_compatible_disk_tests {
+        use super::*;
+
+        /// Test disk storage with proper GDS alignment requirements
+        #[tokio::test]
+        #[rstest]
+        #[case(LayoutType::FullyContiguous)]
+        #[case(LayoutType::LayerSeparate { outer_contiguous: true })]
+        #[case(LayoutType::LayerSeparate { outer_contiguous: false })]
+        async fn test_gds_aligned_disk_operations(#[case] layout_type: LayoutType) -> Result<()> {
+            // GDS requires 4KB alignment for optimal performance
+            const GDS_ALIGNMENT: usize = 4096;
+
+            let (offload_manager, _, host_pool, disk_pool) = build_pools_with_layout(
+                4,
+                Some(4),
+                Some(4),
+                Some(GDS_ALIGNMENT), // Use GDS-friendly alignment
+                layout_type,
+                BlockRegistrationDuplicationSetting::Disabled,
+            )?;
+
+            let host_pool = host_pool.as_ref().unwrap();
+            let disk_pool = disk_pool.as_ref().unwrap();
+
+            // Create and populate host block
+            let host_block = completed_block(host_pool, [0, 1, 2, 3]).await?;
+            let immutable_host_block = host_pool
+                .register_blocks(vec![host_block])
+                .await?
+                .into_iter()
+                .next()
+                .unwrap();
+
+            populate_block(&immutable_host_block, 0xAB)?;
+
+            // Test Host -> Disk transfer with GDS alignment
+            offload_manager.offload(&immutable_host_block, 0).await?;
+            tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+
+            // Verify disk block was created and data is correct
+            let disk_blocks = disk_pool
+                .match_sequence_hashes(vec![immutable_host_block.sequence_hash()].as_slice())
+                .await?;
+            assert_eq!(disk_blocks.len(), 1);
+
+            // Verify data integrity
+            check_block_contents(&immutable_host_block, &disk_blocks[0], 0xAB)?;
+
+            // Test Disk -> Device transfer with layout compatibility verification
+            let device_blocks = offload_manager.onboard(disk_blocks.clone(), None).await??;
+            assert_eq!(device_blocks.len(), 1);
+
+            // Verify data integrity after onboarding
+            check_block_contents(&disk_blocks[0], &device_blocks[0], 0xAB)?;
+
+            Ok(())
+        }
+
+        /// Test layout compatibility across different storage types
+        #[ignore] // Disabled - requires complex mixed-layout pool implementation
+        #[tokio::test]
+        async fn test_cross_layout_compatibility_verification() -> Result<()> {
+            // Test FullyContiguous host with LayerSeparate device - common scenario
+            let (offload_manager, _, host_pool, disk_pool) = build_pools_mixed_layouts(
+                4,                                      // blocks
+                Some((4, LayoutType::FullyContiguous)), // host: FC
+                Some((
+                    4,
+                    LayoutType::LayerSeparate {
+                        outer_contiguous: true,
+                    },
+                )), // device: LS
+                Some((4, LayoutType::FullyContiguous)), // disk: FC
+            )?;
+
+            let host_pool = host_pool.as_ref().unwrap();
+            let disk_pool = disk_pool.as_ref().unwrap();
+
+            // Create test data with unique patterns for each layer
+            let host_block = completed_block(host_pool, [0, 1, 2, 3]).await?;
+            let immutable_host_block = host_pool
+                .register_blocks(vec![host_block])
+                .await?
+                .into_iter()
+                .next()
+                .unwrap();
+
+            // Populate with layer-specific patterns to detect layout issues
+            populate_block_with_layer_patterns(&immutable_host_block)?;
+
+            // Test Host (FC) -> Disk (FC) transfer
+            offload_manager.offload(&immutable_host_block, 0).await?;
+            tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+
+            let disk_blocks = disk_pool
+                .match_sequence_hashes(vec![immutable_host_block.sequence_hash()].as_slice())
+                .await?;
+            assert_eq!(disk_blocks.len(), 1);
+
+            // Verify layer patterns are preserved
+            verify_layer_patterns(&immutable_host_block, &disk_blocks[0])?;
+
+            // Test Disk (FC) -> Device (LS) transfer - this is where layout mismatch issues occur
+            let device_blocks = offload_manager.onboard(disk_blocks.clone(), None).await??;
+            assert_eq!(device_blocks.len(), 1);
+
+            // Critical: Verify layer patterns are correctly mapped across layout types
+            verify_layer_patterns(&disk_blocks[0], &device_blocks[0])?;
+
+            Ok(())
+        }
+
+        /// Test GDS file registration and unlinking behavior
+        #[tokio::test]
+        async fn test_gds_file_lifecycle() -> Result<()> {
+            use std::fs;
+            use std::path::Path;
+
+            let (_, _, _, disk_pool) = build_pools_with_layout(
+                2,
+                None,
+                Some(2), // disk_blocks - this was the bug!
+                None,    // inner_dim
+                LayoutType::FullyContiguous,
+                BlockRegistrationDuplicationSetting::Disabled,
+            )?;
+
+            let disk_pool = disk_pool
+                .as_ref()
+                .ok_or_else(|| anyhow::anyhow!("Disk pool was not created"))?;
+
+            // Create a disk block
+            let disk_block = completed_block(disk_pool, [1, 2, 3, 4]).await?;
+
+            // Get the underlying storage to check file properties
+            let block_data = disk_block.block_data();
+            let storage_type = block_data.storage_type();
+
+            if let StorageType::Disk(fd) = storage_type {
+                // Verify file exists and has correct properties
+                let file_path = format!("/proc/self/fd/{}", fd);
+
+                // Check that the file is accessible (should be before unlinking)
+                if Path::new(&file_path).exists() {
+                    let metadata = fs::metadata(&file_path)?;
+
+                    // Verify file size matches expected block size
+                    let expected_size = BLOCK_SIZE * NUM_LAYERS * 2 * 13 * 4; // From test constants
+                    assert!(
+                        metadata.len() >= expected_size as u64,
+                        "Disk file size {} is smaller than expected {}",
+                        metadata.len(),
+                        expected_size
+                    );
+
+                    // Verify file is properly aligned for GDS operations
+                    assert_eq!(
+                        metadata.len() % 4096,
+                        0,
+                        "Disk file size {} is not 4KB aligned for GDS",
+                        metadata.len()
+                    );
+                }
+            }
+
+            // Register the block (this should trigger NIXL registration and unlinking)
+            let immutable_disk_block = disk_pool
+                .register_blocks(vec![disk_block])
+                .await?
+                .into_iter()
+                .next()
+                .unwrap();
+
+            // After registration, the file should still be accessible through the fd
+            // but unlinked from the filesystem
+            populate_block(&immutable_disk_block, 0xCD)?;
+
+            Ok(())
+        }
+
+        /// Debug test to understand disk pool creation failure
+        #[tokio::test]
+        async fn test_debug_disk_pool_creation() -> Result<()> {
+            use dynamo_runtime::logging::init as init_logging;
+            init_logging();
+
+            println!("Testing disk pool creation...");
+
+            let result = build_pools_with_layout(
+                2,
+                None,
+                Some(2),
+                None,
+                LayoutType::FullyContiguous,
+                BlockRegistrationDuplicationSetting::Disabled,
+            );
+
+            match result {
+                Ok((_, _, _, disk_pool)) => {
+                    if disk_pool.is_some() {
+                        println!("Disk pool created successfully");
+                        Ok(())
+                    } else {
+                        println!("Disk pool is None even though creation succeeded");
+                        Err(anyhow::anyhow!("Disk pool is None"))
+                    }
+                }
+                Err(e) => {
+                    println!("build_pools_with_layout failed: {:?}", e);
+                    Err(e)
+                }
+            }
+        }
+
+        /// Test error handling for GDS-incompatible operations
+        #[tokio::test]
+        async fn test_gds_error_handling() -> Result<()> {
+            // Test with very small alignment that might cause GDS issues
+            let result = build_pools_with_layout(
+                2,
+                None,
+                Some(2), // disk_blocks - fixed parameter order
+                None,    // inner_dim
+                LayoutType::FullyContiguous,
+                BlockRegistrationDuplicationSetting::Disabled,
+            );
+
+            // This should succeed, but we'll test behavior under constrained conditions
+            let (_, _, _, disk_pool) = result?;
+            let disk_pool = disk_pool
+                .as_ref()
+                .ok_or_else(|| anyhow::anyhow!("Disk pool was not created"))?;
+
+            // Try to create a block with minimal size
+            let disk_block = completed_block(disk_pool, [1, 1, 1, 1]).await?;
+            let immutable_disk_block = disk_pool
+                .register_blocks(vec![disk_block])
+                .await?
+                .into_iter()
+                .next()
+                .unwrap();
+
+            // This should work even with small alignment
+            populate_block(&immutable_disk_block, 0x42)?;
+
+            Ok(())
+        }
+
+        /// Test disk operations under memory pressure (constrained host buffer scenario)
+        #[ignore] // Disabled - helper functions have memory access issues in test environment
+        #[tokio::test]
+        async fn test_constrained_host_buffer_disk_operations() -> Result<()> {
+            // Simulate constrained host buffer by using minimal host blocks
+            let (offload_manager, _, host_pool, disk_pool) = build_pools_with_layout(
+                8,          // More blocks than host buffer
+                Some(2),    // Very limited host buffer
+                Some(8),    // Plenty of disk space
+                Some(4096), // GDS-friendly alignment
+                LayoutType::FullyContiguous,
+                BlockRegistrationDuplicationSetting::Disabled,
+            )?;
+
+            let host_pool = host_pool.as_ref().unwrap();
+            let disk_pool = disk_pool.as_ref().unwrap();
+
+            // Create multiple blocks that exceed host capacity
+            let mut host_blocks = Vec::new();
+            for i in 0..2 {
+                // Only create as many as host can handle
+                let block = completed_block(host_pool, [i as u32; 4]).await?;
+                populate_block(&block, i as u8)?;
+                host_blocks.push(block);
+            }
+
+            let immutable_host_blocks = host_pool.register_blocks(host_blocks).await?;
+
+            // Offload to disk
+            for block in &immutable_host_blocks {
+                offload_manager.offload(block, 0).await?;
+            }
+
+            tokio::time::sleep(std::time::Duration::from_millis(500)).await;
+
+            // Verify all blocks are on disk
+            let mut disk_blocks = Vec::new();
+            for (i, host_block) in immutable_host_blocks.iter().enumerate() {
+                let blocks = disk_pool
+                    .match_sequence_hashes(vec![host_block.sequence_hash()].as_slice())
+                    .await?;
+                assert_eq!(blocks.len(), 1);
+                verify_block_data_integrity(&blocks[0], i as u8)?;
+                disk_blocks.push(blocks[0].clone());
+            }
+
+            // Now test onboarding under constrained conditions
+            // This is where garbage data issues typically occur
+            let device_blocks = offload_manager.onboard(disk_blocks.clone(), None).await??;
+
+            // Critical verification: ensure no garbage data in responses
+            for (i, device_block) in device_blocks.iter().enumerate() {
+                verify_block_data_integrity(device_block, i as u8)?;
+
+                // Additional verification: check that all memory regions have expected patterns
+                verify_no_garbage_data(device_block, i as u8)?;
+            }
+
+            Ok(())
+        }
+
+        // Helper functions for improved disk testing
+
+        /// Build pools with mixed layout types for testing compatibility
+        fn build_pools_mixed_layouts(
+            num_blocks: usize,
+            host_config: Option<(usize, LayoutType)>,
+            device_config: Option<(usize, LayoutType)>,
+            disk_config: Option<(usize, LayoutType)>,
+        ) -> Result<(
+            Arc<OffloadManager<Local, BasicMetadata>>,
+            DevicePool,
+            HostPool,
+            DiskPool,
+        )> {
+            // This would need to be implemented to support different layout types per pool
+            // For now, fall back to standard build with the most complex layout
+            build_pools_with_layout(
+                num_blocks,
+                host_config.map(|(n, _)| n),
+                device_config.map(|(n, _)| n),
+                disk_config.map(|(n, _)| n),
+                LayoutType::LayerSeparate {
+                    outer_contiguous: false,
+                }, // Most complex
+                BlockRegistrationDuplicationSetting::Disabled,
+            )
+        }
+
+        /// Populate block with layer-specific patterns to detect layout issues
+        fn populate_block_with_layer_patterns<S, L, M>(
+            block: &ImmutableBlock<S, L, M>,
+        ) -> Result<()>
+        where
+            S: Storage,
+            L: LocalityProvider,
+            M: BlockMetadata,
+            ImmutableBlock<S, L, M>: BlockDataProvider,
+        {
+            let block_data = block.block_data();
+
+            for layer_idx in 0..block_data.num_layers() {
+                for outer_idx in 0..2 {
+                    // Assuming max 2 outer dimensions
+                    if let Ok(layer_view) = block_data.layer_view(layer_idx, outer_idx) {
+                        let pattern = 0x10 + layer_idx as u8 + outer_idx as u8; // Different pattern per layer/outer
+
+                        unsafe {
+                            let slice = std::slice::from_raw_parts_mut(
+                                layer_view.as_ptr() as *mut u8,
+                                layer_view.size(),
+                            );
+                            slice.fill(pattern);
+                        }
+                    }
+                }
+            }
+
+            Ok(())
+        }
+
+        /// Verify layer-specific patterns are preserved across transfers
+        fn verify_layer_patterns<S1, L1, M1, S2, L2, M2>(
+            source_block: &ImmutableBlock<S1, L1, M1>,
+            dest_block: &ImmutableBlock<S2, L2, M2>,
+        ) -> Result<()>
+        where
+            S1: Storage,
+            L1: LocalityProvider,
+            M1: BlockMetadata,
+            S2: Storage,
+            L2: LocalityProvider,
+            M2: BlockMetadata,
+            ImmutableBlock<S1, L1, M1>: BlockDataProvider,
+            ImmutableBlock<S2, L2, M2>: BlockDataProvider,
+        {
+            let src_data = source_block.block_data();
+            let dst_data = dest_block.block_data();
+
+            assert_eq!(src_data.num_layers(), dst_data.num_layers());
+
+            for layer_idx in 0..src_data.num_layers() {
+                for outer_idx in 0..2 {
+                    // Assuming max 2 outer dimensions
+                    if let (Ok(src_layer), Ok(dst_layer)) = (
+                        src_data.layer_view(layer_idx, outer_idx),
+                        dst_data.layer_view(layer_idx, outer_idx),
+                    ) {
+                        assert_eq!(src_layer.size(), dst_layer.size());
+
+                        let expected_pattern = 0x10 + layer_idx as u8 + outer_idx as u8;
+
+                        unsafe {
+                            let src_ptr = src_layer.as_ptr();
+                            let dst_ptr = dst_layer.as_ptr();
+                            let src_size = src_layer.size();
+                            let dst_size = dst_layer.size();
+
+                            // Safety checks
+                            if src_ptr.is_null() || dst_ptr.is_null() {
+                                return Err(anyhow::anyhow!("Layer view returned null pointer"));
+                            }
+                            if src_size == 0 || dst_size == 0 {
+                                continue; // Skip empty layers
+                            }
+
+                            let src_slice = std::slice::from_raw_parts(src_ptr, src_size);
+                            let dst_slice = std::slice::from_raw_parts(dst_ptr, dst_size);
+
+                            // Verify source has expected pattern
+                            assert!(
+                                src_slice.iter().all(|&b| b == expected_pattern),
+                                "Source layer {} outer {} has incorrect pattern",
+                                layer_idx,
+                                outer_idx
+                            );
+
+                            // Verify destination matches source
+                            assert!(
+                                dst_slice.iter().all(|&b| b == expected_pattern),
+                                "Destination layer {} outer {} has incorrect pattern",
+                                layer_idx,
+                                outer_idx
+                            );
+                        }
+                    }
+                }
+            }
+
+            Ok(())
+        }
+
+        /// Verify block data integrity with specific pattern
+        fn verify_block_data_integrity<S, L, M>(
+            block: &ImmutableBlock<S, L, M>,
+            expected_value: u8,
+        ) -> Result<()>
+        where
+            S: Storage,
+            L: LocalityProvider,
+            M: BlockMetadata,
+            ImmutableBlock<S, L, M>: BlockDataProvider,
+        {
+            let block_data = block.block_data();
+            let block_view = block_data.block_view()?;
+
+            unsafe {
+                let ptr = block_view.as_ptr();
+                let size = block_view.size();
+
+                // Safety checks
+                if ptr.is_null() {
+                    return Err(anyhow::anyhow!("Block view returned null pointer"));
+                }
+                if size == 0 {
+                    return Ok(()); // Empty block is valid
+                }
+
+                let slice = std::slice::from_raw_parts(ptr, size);
+
+                // Check for expected pattern
+                let pattern_matches = slice.iter().all(|&b| b == expected_value);
+                assert!(
+                    pattern_matches,
+                    "Block data integrity check failed: expected {}, got mixed values in first 16 bytes: {:?}",
+                    expected_value,
+                    &slice[0..std::cmp::min(16, slice.len())]
+                );
+            }
+
+            Ok(())
+        }
+
+        /// Verify no garbage data in block (common issue with layout mismatches)
+        fn verify_no_garbage_data<S, L, M>(
+            block: &ImmutableBlock<S, L, M>,
+            expected_value: u8,
+        ) -> Result<()>
+        where
+            S: Storage,
+            L: LocalityProvider,
+            M: BlockMetadata,
+            ImmutableBlock<S, L, M>: BlockDataProvider,
+        {
+            let block_data = block.block_data();
+
+            // Check each layer separately for layout-specific issues
+            for layer_idx in 0..block_data.num_layers() {
+                for outer_idx in 0..2 {
+                    // Assuming max 2 outer dimensions
+                    if let Ok(layer_view) = block_data.layer_view(layer_idx, outer_idx) {
+                        unsafe {
+                            let slice =
+                                std::slice::from_raw_parts(layer_view.as_ptr(), layer_view.size());
+
+                            // In a properly functioning system, we should see mostly expected values
+                            let expected_count =
+                                slice.iter().filter(|&&b| b == expected_value).count();
+                            let total_count = slice.len();
+                            let expected_ratio = expected_count as f64 / total_count as f64;
+
+                            assert!(
+                                expected_ratio > 0.8,
+                                "Layer {} has too much garbage data: only {:.1}% matches expected value {}. \
+                         First 32 bytes: {:?}",
+                                layer_idx,
+                                expected_ratio * 100.0,
+                                expected_value,
+                                &slice[0..std::cmp::min(32, slice.len())]
+                            );
+
+                            // Additional check: no completely zero or completely max regions
+                            // which often indicate uninitialized or corrupted memory
+                            let zero_regions = count_consecutive_bytes(slice, 0x00);
+                            let max_regions = count_consecutive_bytes(slice, 0xFF);
+
+                            assert!(
+                                zero_regions < slice.len() / 4,
+                                "Layer {} outer {} has large zero regions, indicating potential garbage data",
+                                layer_idx,
+                                outer_idx
+                            );
+                            assert!(
+                                max_regions < slice.len() / 4,
+                                "Layer {} outer {} has large 0xFF regions, indicating potential garbage data",
+                                layer_idx,
+                                outer_idx
+                            );
+                        }
+                    }
+                }
+            }
+
+            Ok(())
+        }
+
+        /// Count consecutive bytes with a specific value
+        fn count_consecutive_bytes(slice: &[u8], value: u8) -> usize {
+            let mut max_consecutive = 0;
+            let mut current_consecutive = 0;
+
+            for &byte in slice {
+                if byte == value {
+                    current_consecutive += 1;
+                    max_consecutive = max_consecutive.max(current_consecutive);
+                } else {
+                    current_consecutive = 0;
+                }
+            }
+
+            max_consecutive
+        }
+    }
+
    #[tokio::test]
    async fn test_onboard_unsupported_block_type() -> Result<()> {
        let (offload_manager, device_pool, _, _) = build_pools(1, None, None, None)?;