feat: kvbm-physical (#6490)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: kvbm-physical (#6490)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
9ab148dc · Ryan Olson · GitHub · 7546c193 · 9ab148dc · 9ab148dc
Unverified Commit 9ab148dc authored Mar 01, 2026 by Ryan Olson Committed by GitHub Mar 01, 2026
14 changed files
--- a/lib/kvbm-physical/src/transfer/notifications/mod.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/mod.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Transfer completion notification system.
+//!
+//! This module provides abstractions for waiting on transfer completions using different
+//! mechanisms: polling-based (NIXL status, CUDA events) and event-based (NIXL notifications).
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use anyhow::Result;
+use tokio::sync::mpsc;
+use tokio::time::interval;
+use tracing::{error, warn};
+use uuid::Uuid;
+use velo_events::{EventHandle, EventManager};
+
+pub mod cuda_event;
+pub mod nixl_events;
+pub mod nixl_status;
+pub mod notification;
+
+pub use cuda_event::CudaEventChecker;
+pub use nixl_events::{RegisterNixlNotification, process_nixl_notification_events};
+pub use nixl_status::NixlStatusChecker;
+pub use notification::TransferCompleteNotification;
+
+/// Trait for checking if a transfer operation has completed.
+/// Supports polling-based completion checks (NIXL status, CUDA events).
+pub trait CompletionChecker: Send {
+    /// Returns true if the transfer is complete, false if still pending.
+    fn is_complete(&self) -> Result<bool>;
+}
+
+/// Registration message for polling-based transfer completion.
+pub struct RegisterPollingNotification<C: CompletionChecker> {
+    pub uuid: Uuid,
+    pub checker: C,
+    pub event_handle: EventHandle,
+}
+
+/// Tracking struct for outstanding polling-based transfers.
+struct OutstandingPollingTransfer<C: CompletionChecker> {
+    checker: C,
+    event_handle: EventHandle,
+    arrived_at: Instant,
+    last_warned_at: Option<Instant>,
+}
+
+/// Helper function to check if a transfer should be warned about and log the warning.
+/// Returns the new last_warned_at time if a warning was issued.
+fn check_and_warn_slow_transfer(
+    uuid: &Uuid,
+    arrived_at: Instant,
+    last_warned_at: Option<Instant>,
+) -> Option<Instant> {
+    let elapsed = arrived_at.elapsed();
+    if elapsed > Duration::from_secs(60) {
+        let should_warn = last_warned_at
+            .map(|last| last.elapsed() > Duration::from_secs(30))
+            .unwrap_or(true);
+
+        if should_warn {
+            warn!(
+                uuid = %uuid,
+                elapsed_secs = elapsed.as_secs(),
+                "Transfer has been pending for over 1 minute"
+            );
+            return Some(Instant::now());
+        }
+    }
+    last_warned_at
+}
+
+/// Generic polling-based transfer completion handler.
+/// Works with any CompletionChecker implementation (NIXL status, CUDA events, etc.)
+pub async fn process_polling_notifications<C: CompletionChecker>(
+    mut rx: mpsc::Receiver<RegisterPollingNotification<C>>,
+    system: Arc<EventManager>,
+) {
+    let mut outstanding: HashMap<Uuid, OutstandingPollingTransfer<C>> = HashMap::new();
+    let mut check_interval = interval(Duration::from_millis(1));
+
+    loop {
+        tokio::select! {
+            // Handle new transfer requests
+            notification = rx.recv() => {
+                match notification {
+                    Some(notif) => {
+                        outstanding.insert(notif.uuid, OutstandingPollingTransfer {
+                            checker: notif.checker,
+                            event_handle: notif.event_handle,
+                            arrived_at: Instant::now(),
+                            last_warned_at: None,
+                        });
+                    }
+                    None => {
+                        // Channel closed, finish processing outstanding transfers then exit
+                        break;
+                    }
+                }
+            }
+
+            // Periodically check status of outstanding transfers
+            _ = check_interval.tick(), if !outstanding.is_empty() => {
+                let mut completed = Vec::new();
+
+                for (uuid, transfer) in outstanding.iter_mut() {
+                    // Check transfer status
+                    match transfer.checker.is_complete() {
+                        Ok(true) => {
+                            // Transfer complete - mark for removal
+                            completed.push((*uuid, Ok(())));
+                        }
+                        Ok(false) => {
+                            // Transfer still in progress - check if we should warn
+                            transfer.last_warned_at = check_and_warn_slow_transfer(
+                                uuid,
+                                transfer.arrived_at,
+                                transfer.last_warned_at,
+                            );
+                        }
+                        Err(e) => {
+                            warn!(
+                                uuid = %uuid,
+                                error = %e,
+                                "Transfer status check failed"
+                            );
+                            completed.push((*uuid, Err(e)));
+                        }
+                    }
+                }
+
+                // Remove completed transfers and signal completion
+                for (uuid, result) in completed {
+                    if let Some(transfer) = outstanding.remove(&uuid) {
+                        // Signal completion via Nova event system
+                        match result {
+                            Ok(()) => {
+                                if let Err(e) = system.trigger(transfer.event_handle) {
+                                    error!(
+                                        uuid = %uuid,
+                                        error = %e,
+                                        "Failed to trigger completion event"
+                                    );
+                                }
+                            }
+                            Err(e) => {
+                                if let Err(err) = system.poison(transfer.event_handle, e.to_string()) {
+                                    error!(
+                                        uuid = %uuid,
+                                        error = %err,
+                                        "Failed to poison completion event"
+                                    );
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Channel closed, but we may still have outstanding transfers
+    // Continue processing them until all are complete
+    while !outstanding.is_empty() {
+        check_interval.tick().await;
+        let mut completed = Vec::new();
+
+        for (uuid, transfer) in outstanding.iter_mut() {
+            match transfer.checker.is_complete() {
+                Ok(true) => completed.push((*uuid, Ok(()))),
+                Ok(false) => {}
+                Err(e) => completed.push((*uuid, Err(e))),
+            }
+        }
+
+        for (uuid, result) in completed {
+            if let Some(transfer) = outstanding.remove(&uuid) {
+                match result {
+                    Ok(()) => {
+                        let _ = system.trigger(transfer.event_handle);
+                    }
+                    Err(e) => {
+                        let _ = system.poison(transfer.event_handle, e.to_string());
+                    }
+                }
+            }
+        }
+    }
+}
--- a/lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! NIXL notification-based completion handler.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use dynamo_memory::nixl::{Agent as NixlAgent, NotificationMap, XferRequest};
+use tokio::sync::mpsc;
+use tokio::time::interval;
+use tracing::{error, warn};
+use uuid::Uuid;
+use velo_events::{EventHandle, EventManager};
+
+/// Registration message for NIXL notification-based transfer completion.
+pub struct RegisterNixlNotification {
+    pub uuid: Uuid,
+    pub xfer_req: XferRequest,
+    pub event_handle: EventHandle,
+}
+
+/// Tracking struct for outstanding NIXL notification transfers.
+struct OutstandingTransfer {
+    #[allow(dead_code)] // Kept for potential future cleanup or debugging
+    xfer_req: XferRequest,
+    event_handle: EventHandle,
+    arrived_at: Instant,
+    last_warned_at: Option<Instant>,
+}
+
+/// Helper function to check if a transfer should be warned about and log the warning.
+/// Returns the new last_warned_at time if a warning was issued.
+fn check_and_warn_slow_transfer(
+    uuid: &Uuid,
+    arrived_at: Instant,
+    last_warned_at: Option<Instant>,
+) -> Option<Instant> {
+    let elapsed = arrived_at.elapsed();
+    if elapsed > Duration::from_secs(60) {
+        let should_warn = last_warned_at
+            .map(|last| last.elapsed() > Duration::from_secs(30))
+            .unwrap_or(true);
+
+        if should_warn {
+            warn!(
+                uuid = %uuid,
+                elapsed_secs = elapsed.as_secs(),
+                "Transfer has been pending for over 1 minute"
+            );
+            return Some(Instant::now());
+        }
+    }
+    last_warned_at
+}
+
+/// NIXL notification-based transfer completion handler.
+/// Fetches notifications in batches and matches them against outstanding transfers.
+pub async fn process_nixl_notification_events(
+    agent: NixlAgent,
+    mut rx: mpsc::Receiver<RegisterNixlNotification>,
+    system: Arc<EventManager>,
+) {
+    let mut outstanding: HashMap<Uuid, OutstandingTransfer> = HashMap::new();
+    let mut check_interval = interval(Duration::from_millis(1));
+
+    loop {
+        tokio::select! {
+            // Handle new transfer requests
+            notification = rx.recv() => {
+                match notification {
+                    Some(notif) => {
+                        outstanding.insert(notif.uuid, OutstandingTransfer {
+                            xfer_req: notif.xfer_req,
+                            event_handle: notif.event_handle,
+                            arrived_at: Instant::now(),
+                            last_warned_at: None,
+                        });
+                    }
+                    None => {
+                        // Channel closed, finish processing outstanding transfers then exit
+                        break;
+                    }
+                }
+            }
+
+            // Periodically fetch and process notifications
+            _ = check_interval.tick(), if !outstanding.is_empty() => {
+                // Create notification map inside this branch to avoid Send issues
+                let mut notif_map = match NotificationMap::new() {
+                    Ok(map) => map,
+                    Err(e) => {
+                        warn!(error = %e, "Failed to create notification map");
+                        continue;
+                    }
+                };
+
+                // Fetch all pending notifications
+                if let Err(e) = agent.get_notifications(&mut notif_map, None) {
+                    warn!(error = %e, "Failed to fetch NIXL notifications");
+                    continue;
+                }
+
+                // Process notifications and match against outstanding transfers
+                let notifications = match notif_map.take_notifs() {
+                    Ok(notifs) => notifs,
+                    Err(e) => {
+                        warn!(error = %e, "Failed to extract notifications from map");
+                        continue;
+                    }
+                };
+
+                let mut completed = Vec::new();
+
+                // Iterate through all notifications
+                for (_agent_name, notif_strings) in notifications {
+                    for notif_str in notif_strings {
+                        // Try to parse notification as UUID
+                        // NOTE: This assumes notifications contain UUIDs.
+                        // The actual format may be different and may need adjustment.
+                        if let Ok(notif_uuid) = Uuid::parse_str(&notif_str) {
+                            if outstanding.contains_key(&notif_uuid) {
+                                completed.push(notif_uuid);
+                            } else {
+                                // Notification arrived before we started waiting for it
+                                // This is the race condition we need to handle
+                                warn!(
+                                    uuid = %notif_uuid,
+                                    "Received notification for transfer not in outstanding map (early arrival)"
+                                );
+                            }
+                        }
+                    }
+                }
+
+                // Check for slow transfers and update warnings
+                for (uuid, transfer) in outstanding.iter_mut() {
+                    if !completed.contains(uuid) {
+                        transfer.last_warned_at = check_and_warn_slow_transfer(
+                            uuid,
+                            transfer.arrived_at,
+                            transfer.last_warned_at,
+                        );
+                    }
+                }
+
+                // Remove completed transfers and signal completion
+                for uuid in completed {
+                    if let Some(transfer) = outstanding.remove(&uuid)
+                        && let Err(e) = system.trigger(transfer.event_handle) {
+                            error!(
+                                uuid = %uuid,
+                                error = %e,
+                                "Failed to trigger completion event"
+                            );
+                        }
+                }
+            }
+        }
+    }
+
+    // Channel closed, but we may still have outstanding transfers
+    // Continue processing them until all are complete
+    while !outstanding.is_empty() {
+        check_interval.tick().await;
+
+        let mut notif_map = match NotificationMap::new() {
+            Ok(map) => map,
+            Err(_) => continue,
+        };
+
+        if let Ok(()) = agent.get_notifications(&mut notif_map, None)
+            && let Ok(notifications) = notif_map.take_notifs()
+        {
+            let mut completed = Vec::new();
+
+            for (_agent_name, notif_strings) in notifications {
+                for notif_str in notif_strings {
+                    if let Ok(notif_uuid) = Uuid::parse_str(&notif_str)
+                        && outstanding.contains_key(&notif_uuid)
+                    {
+                        completed.push(notif_uuid);
+                    }
+                }
+            }
+
+            for uuid in completed {
+                if let Some(transfer) = outstanding.remove(&uuid) {
+                    let _ = system.trigger(transfer.event_handle);
+                }
+            }
+        }
+    }
+}
--- a/lib/kvbm-physical/src/transfer/notifications/nixl_status.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/nixl_status.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! NIXL status polling-based completion checker.
+
+use anyhow::{Result, anyhow};
+use dynamo_memory::nixl::{Agent as NixlAgent, XferRequest};
+
+use super::CompletionChecker;
+
+/// Completion checker that polls NIXL transfer status.
+pub struct NixlStatusChecker {
+    agent: NixlAgent,
+    xfer_req: XferRequest,
+}
+
+impl NixlStatusChecker {
+    pub fn new(agent: NixlAgent, xfer_req: XferRequest) -> Self {
+        Self { agent, xfer_req }
+    }
+}
+
+impl CompletionChecker for NixlStatusChecker {
+    fn is_complete(&self) -> Result<bool> {
+        // get_xfer_status returns XferStatus enum:
+        // - XferStatus::Success means transfer is complete
+        // - XferStatus::InProgress means still pending
+        match self.agent.get_xfer_status(&self.xfer_req) {
+            Ok(status) => Ok(status.is_success()),
+            Err(e) => Err(anyhow!("NIXL transfer status check failed: {}", e)),
+        }
+    }
+}
--- a/lib/kvbm-physical/src/transfer/notifications/notification.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/notification.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Transfer completion notification handle.
+
+use anyhow::Result;
+use futures::future::{Either, Ready, ready};
+use std::{
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};
+use velo_events::{Event, EventAwaiter, EventManager};
+
+pub enum TransferAwaiter {
+    Local(EventAwaiter),
+    // Sync(SyncResult),
+}
+
+impl std::future::Future for TransferAwaiter {
+    type Output = Result<()>;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        match self.get_mut() {
+            Self::Local(waiter) => Pin::new(waiter).poll(cx),
+            // Self::Sync(sync) => Pin::new(sync).poll(cx),
+        }
+    }
+}
+
+/// Notification handle for an in-progress transfer.
+///
+/// This object can be awaited to block until the transfer completes.
+/// The transfer is tracked by a background handler that polls for completion
+/// or processes notification events.
+///
+/// Uses `futures::Either` to avoid event system overhead for synchronous completions.
+/// Pending transfers use `LocalEventWaiter` which avoids heap allocation and repeated
+/// DashMap lookups when awaiting.
+pub struct TransferCompleteNotification {
+    awaiter: Either<Ready<Result<()>>, TransferAwaiter>,
+}
+
+impl TransferCompleteNotification {
+    /// Create a notification that is already completed (for synchronous transfers).
+    ///
+    /// This is useful for transfers that complete immediately without needing
+    /// background polling, such as memcpy operations.
+    ///
+    /// This is extremely efficient - no allocations, locks, or event system overhead.
+    pub fn completed() -> Self {
+        Self {
+            awaiter: Either::Left(ready(Ok(()))),
+        }
+    }
+
+    /// Create a notification from a `LocalEventWaiter`.
+    ///
+    /// This is the primary way to construct a notification when you already
+    /// have an event waiter from the event system.
+    pub fn from_awaiter(awaiter: EventAwaiter) -> Self {
+        Self {
+            awaiter: Either::Right(TransferAwaiter::Local(awaiter)),
+        }
+    }
+
+    // /// Create a notification from a synchronous active message result.
+    // pub fn from_sync_result(sync: SyncResult) -> Self {
+    //     Self {
+    //         awaiter: Either::Right(TransferAwaiter::Sync(sync)),
+    //     }
+    // }
+
+    /// Check if the notification can yield the current task.
+    ///
+    /// The internal ::Left arm is guaranteed to be ready, while the ::Right arm is not.
+    pub fn could_yield(&self) -> bool {
+        matches!(self.awaiter, Either::Right(_))
+    }
+
+    /// Aggregate multiple notifications into one that completes when all are done.
+    ///
+    /// This is useful when a transfer is split across multiple workers and you want
+    /// to wait for all of them to complete.
+    ///
+    /// # Arguments
+    /// * `notifications` - The notifications to aggregate
+    /// * `events` - The event system to create the aggregate event
+    /// * `runtime` - The tokio runtime handle to spawn the aggregation task
+    ///
+    /// # Behavior
+    /// - If the list is empty, returns an already-completed notification
+    /// - If there's only one, returns it directly
+    /// - Otherwise, creates a new event and spawns a task to await all notifications
+    pub fn aggregate(
+        notifications: Vec<Self>,
+        events: &Arc<EventManager>,
+        runtime: &tokio::runtime::Handle,
+    ) -> Result<Self> {
+        if notifications.is_empty() {
+            return Ok(Self::completed());
+        }
+        if notifications.len() == 1 {
+            return Ok(notifications.into_iter().next().unwrap());
+        }
+
+        // Check if all notifications are already complete (no yielding needed)
+        if notifications.iter().all(|n| !n.could_yield()) {
+            return Ok(Self::completed());
+        }
+
+        // Create a new event for the aggregate completion
+        let event = events.new_event()?;
+        let awaiter = events.awaiter(event.handle())?;
+
+        // Spawn task that awaits all notifications and triggers/poisons the event
+        runtime.spawn(await_all_notifications(notifications, event));
+
+        Ok(Self::from_awaiter(awaiter))
+    }
+}
+
+/// Awaits all transfer notifications and signals completion via the event.
+///
+/// This function awaits ALL notifications regardless of individual failures,
+/// then triggers the event on success or poisons it with error details on failure.
+async fn await_all_notifications(
+    notifications: Vec<TransferCompleteNotification>,
+    local_event: Event,
+) {
+    // Await all notifications, collecting results
+    let results: Vec<Result<()>> =
+        futures::future::join_all(notifications.into_iter().map(|n| n.into_future())).await;
+
+    // Check for any failures
+    let errors: Vec<_> = results.into_iter().filter_map(|r| r.err()).collect();
+
+    if errors.is_empty() {
+        // Ignore trigger error - if event system is shutdown, nothing to do
+        let _ = local_event.trigger();
+    } else {
+        let error_msg = errors
+            .iter()
+            .map(|e| e.to_string())
+            .collect::<Vec<_>>()
+            .join("; ");
+        // Ignore poison error - if event system is shutdown, nothing to do
+        let _ = local_event.poison(error_msg);
+    }
+}
+
+impl std::future::IntoFuture for TransferCompleteNotification {
+    type Output = Result<()>;
+    type IntoFuture = Either<Ready<Result<()>>, TransferAwaiter>;
+
+    fn into_future(self) -> Self::IntoFuture {
+        self.awaiter
+    }
+}
--- a/lib/kvbm-physical/src/transfer/options.rs
+++ b/lib/kvbm-physical/src/transfer/options.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Transfer options for configuring block and layer transfers.
+
+use super::BounceBuffer;
+use crate::layout::KvBlockLayout;
+use cudarc::driver::CudaStream;
+use derive_builder::Builder;
+use derive_getters::Dissolve;
+use std::ops::Range;
+use std::sync::Arc;
+
+/// Options for configuring transfer operations.
+///
+/// This structure provides configuration for block and layer transfers,
+/// including layer ranges, NIXL write notifications, and bounce buffers.
+///
+/// # Examples
+///
+/// ```rust,ignore
+/// let options = TransferOptions::builder()
+///     .nixl_write_notification(42)
+///     .layer_range(0..10)
+///     .build();
+/// ```
+#[derive(Clone, Default, Builder, Dissolve)]
+#[builder(pattern = "owned", default)]
+pub struct TransferOptions {
+    /// Range of layers to transfer (None = all layers).
+    ///
+    /// When specified, only the layers in this range will be transferred.
+    /// This is useful for partial block transfers or layer-specific operations.
+    #[builder(default, setter(strip_option))]
+    pub layer_range: Option<Range<usize>>,
+
+    /// NIXL write notification value delivered after RDMA write completes.
+    ///
+    /// When specified, NIXL will deliver this notification value to the remote
+    /// node after the RDMA write operation completes. This enables efficient
+    /// notification of transfer completion without requiring polling.
+    #[builder(default, setter(strip_option))]
+    pub nixl_write_notification: Option<u64>,
+
+    /// Bounce buffer specification for multi-hop transfers.
+    ///
+    /// When direct transfers are not allowed or efficient, this specifies
+    /// an intermediate staging area. The transfer will be split into two hops:
+    /// source → bounce buffer → destination.
+    #[builder(default, setter(strip_option, into))]
+    pub bounce_buffer: Option<BounceBuffer>,
+
+    /// Optional caller-provided CUDA stream for the transfer.
+    ///
+    /// When provided, the transfer executor will use this stream instead of
+    /// acquiring one from the pool. The caller is responsible for synchronization -
+    /// no event is recorded by the executor.
+    ///
+    /// This is useful for layer-wise transfers where all layers must execute
+    /// on the same stream to allow proper event sequencing.
+    #[builder(default, setter(strip_option))]
+    pub cuda_stream: Option<Arc<CudaStream>>,
+
+    /// Override source block layout interpretation.
+    ///
+    /// When set, the transfer executor will treat source blocks as having
+    /// this layout instead of the layout's default block_layout().
+    /// This enables transferring blocks that are stored in one format
+    /// but should be interpreted as another (e.g., operational → universal).
+    #[builder(default, setter(strip_option))]
+    pub src_kv_layout: Option<KvBlockLayout>,
+
+    /// Override destination block layout interpretation.
+    ///
+    /// When set, the transfer executor will treat destination blocks as having
+    /// this layout instead of the layout's default block_layout().
+    /// This enables writing blocks in a different format than the destination
+    /// layout's native format.
+    #[builder(default, setter(strip_option))]
+    pub dst_kv_layout: Option<KvBlockLayout>,
+}
+
+impl TransferOptions {
+    /// Create a new builder for transfer options.
+    pub fn builder() -> TransferOptionsBuilder {
+        TransferOptionsBuilder::default()
+    }
+
+    /// Create transfer options from an optional layer range.
+    pub fn from_layer_range(layer_range: Option<Range<usize>>) -> Self {
+        Self {
+            layer_range,
+            ..Self::default()
+        }
+    }
+
+    /// Create default transfer options.
+    ///
+    /// This transfers all layers with no special configuration.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_options() {
+        let options = TransferOptions::default();
+        assert!(options.layer_range.is_none());
+        assert!(options.nixl_write_notification.is_none());
+        assert!(options.bounce_buffer.is_none());
+    }
+
+    #[test]
+    fn test_builder_with_notification() {
+        let options = TransferOptions::builder()
+            .nixl_write_notification(42)
+            .build()
+            .unwrap();
+
+        assert_eq!(options.nixl_write_notification, Some(42));
+        assert!(options.layer_range.is_none());
+    }
+
+    #[test]
+    fn test_builder_with_layer_range() {
+        let options = TransferOptions::builder()
+            .layer_range(0..10)
+            .build()
+            .unwrap();
+
+        assert_eq!(options.layer_range, Some(0..10));
+        assert!(options.nixl_write_notification.is_none());
+    }
+
+    #[test]
+    fn test_builder_with_all_options() {
+        let options = TransferOptions::builder()
+            .nixl_write_notification(100)
+            .layer_range(5..15)
+            .build()
+            .unwrap();
+
+        assert_eq!(options.nixl_write_notification, Some(100));
+        assert_eq!(options.layer_range, Some(5..15));
+    }
+}
--- a/lib/kvbm-physical/src/transfer/preferences.rs
+++ b/lib/kvbm-physical/src/transfer/preferences.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#![allow(dead_code)]
+
+//! Transfer preferences for resolving redundant strategy choices.
+//!
+//! Some source/destination combinations can use multiple transfer strategies.
+//! For example:
+//! - System ↔ Pinned: memcpy or NIXL
+//! - Pinned ↔ Device: CUDA or NIXL
+//!
+//! This module provides preferences to control which strategy to prefer.
+
+use serde::{Deserialize, Serialize};
+
+/// Policy for choosing between native transports (memcpy/CUDA) and NIXL.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+pub enum NativeVsNixlPolicy {
+    /// Always prefer native transports (memcpy/CUDA) when available
+    PreferNative,
+
+    /// Always prefer NIXL when available
+    PreferNixl,
+
+    /// Use native for local-to-local, NIXL for remote/disk
+    #[default]
+    Automatic,
+}
+
+/// Transfer preferences for strategy selection.
+///
+/// These preferences allow fine-grained control over transfer strategy selection
+/// when multiple valid strategies exist for a source/destination pair.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TransferPreferences {
+    /// Policy for native vs NIXL transport selection
+    pub native_vs_nixl: NativeVsNixlPolicy,
+
+    /// Whether to prefer async CUDA operations over blocking ones
+    pub prefer_async_cuda: bool,
+}
+
+impl Default for TransferPreferences {
+    fn default() -> Self {
+        Self {
+            native_vs_nixl: NativeVsNixlPolicy::default(),
+            prefer_async_cuda: true,
+        }
+    }
+}
+
+impl TransferPreferences {
+    /// Create preferences with all defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Create preferences that always prefer native transports.
+    pub fn prefer_native() -> Self {
+        Self {
+            native_vs_nixl: NativeVsNixlPolicy::PreferNative,
+            prefer_async_cuda: true,
+        }
+    }
+
+    /// Create preferences that always prefer NIXL.
+    pub fn prefer_nixl() -> Self {
+        Self {
+            native_vs_nixl: NativeVsNixlPolicy::PreferNixl,
+            prefer_async_cuda: true,
+        }
+    }
+
+    /// Set the native vs NIXL policy.
+    pub fn with_native_vs_nixl(mut self, policy: NativeVsNixlPolicy) -> Self {
+        self.native_vs_nixl = policy;
+        self
+    }
+
+    /// Set whether to prefer async CUDA operations.
+    pub fn with_async_cuda(mut self, prefer_async: bool) -> Self {
+        self.prefer_async_cuda = prefer_async;
+        self
+    }
+}
+
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_preferences() {
+        let prefs = TransferPreferences::default();
+        assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::Automatic);
+        assert!(prefs.prefer_async_cuda);
+    }
+
+    #[test]
+    fn test_prefer_native() {
+        let prefs = TransferPreferences::prefer_native();
+        assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNative);
+        assert!(prefs.prefer_async_cuda);
+    }
+
+    #[test]
+    fn test_prefer_nixl() {
+        let prefs = TransferPreferences::prefer_nixl();
+        assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNixl);
+        assert!(prefs.prefer_async_cuda);
+    }
+
+    #[test]
+    fn test_builder_pattern() {
+        let prefs = TransferPreferences::new()
+            .with_native_vs_nixl(NativeVsNixlPolicy::PreferNixl)
+            .with_async_cuda(false);
+
+        assert_eq!(prefs.native_vs_nixl, NativeVsNixlPolicy::PreferNixl);
+        assert!(!prefs.prefer_async_cuda);
+    }
+}
--- a/lib/kvbm-physical/src/transfer/strategy.rs
+++ b/lib/kvbm-physical/src/transfer/strategy.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Transfer strategy selection based on source and destination storage locations.
+
+use dynamo_memory::StorageKind;
+
+use crate::{layout::PhysicalLayout, transfer::TransferContext};
+
+use super::TransferCapabilities;
+
+/// Transfer strategy to use for copying memory between locations.
+///
+/// The strategy is determined by the source and destination storage locations.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TransferStrategy {
+    /// CPU memcpy (for host-to-host transfers)
+    Memcpy,
+
+    /// CUDA async host-to-device transfer
+    CudaAsyncH2D,
+
+    /// CUDA async device-to-host transfer
+    CudaAsyncD2H,
+
+    /// CUDA async device-to-device transfer
+    CudaAsyncD2D,
+
+    /// NIXL read operation (pull from remote)
+    NixlRead,
+
+    /// NIXL write operation (push to remote)
+    NixlWrite,
+
+    /// NIXL write (flipped local and remote order)
+    /// This is needed for some NIXL backends.
+    /// For example, the POSIX backend requires that host memory
+    /// always be the "local" descriptor list, regardless of whether
+    /// it's a read or write.
+    #[expect(dead_code)]
+    NixlWriteFlipped,
+
+    /// NIXL read (flipped local and remote order)
+    NixlReadFlipped,
+
+    /// Invalid/unsupported transfer
+    #[expect(dead_code)]
+    Invalid,
+}
+
+/// Plan for executing a transfer, either direct or via bounce buffer.
+///
+/// Some transfers require staging through host memory when direct paths
+/// are not enabled via capabilities.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum TransferPlan {
+    /// Direct single-hop transfer using the specified strategy.
+    Direct(TransferStrategy),
+
+    /// Two-hop transfer requiring a bounce buffer in host memory.
+    ///
+    /// This is used when:
+    /// - Device → Remote (without GPU RDMA)
+    /// - Disk → Remote
+    /// - Device ↔ Disk (without GDS)
+    TwoHop {
+        /// First hop strategy (src → bounce)
+        first: TransferStrategy,
+
+        /// Bounce buffer location (always Pinned for best performance)
+        bounce_location: StorageKind,
+
+        /// Second hop strategy (bounce → dst)
+        second: TransferStrategy,
+    },
+}
+
+pub(crate) fn select_strategy(
+    src: &PhysicalLayout,
+    dst: &PhysicalLayout,
+    ctx: &TransferContext,
+) -> anyhow::Result<TransferPlan> {
+    let is_src_local = src.nixl_metadata().agent_name() == ctx.nixl_agent().name();
+    let is_dst_local = dst.nixl_metadata().agent_name() == ctx.nixl_agent().name();
+
+    if !is_src_local && !is_dst_local {
+        return Err(anyhow::anyhow!(
+            "Both src and dst are remote - this is not supported."
+        ));
+    }
+
+    if is_src_local && is_dst_local {
+        return Ok(select_direct_strategy(
+            src.location(),
+            dst.location(),
+            false,
+            ctx.capabilities(),
+        ));
+    }
+
+    select_remote_strategy_v2(
+        src.location(),
+        is_src_local,
+        dst.location(),
+        is_dst_local,
+        ctx.capabilities(),
+    )
+}
+
+/// Select the appropriate transfer plan based on source and destination locations.
+///
+/// # Arguments
+/// * `src` - Source storage location (always local)
+/// * `dst` - Destination storage location (can be local or remote)
+/// * `dst_is_remote` - Whether destination is on a remote node
+/// * `capabilities` - Transfer capability flags
+///
+/// # Returns
+/// A transfer plan (direct or two-hop)
+///
+/// # Conservative Default Policy
+///
+/// With default capabilities (all disabled):
+/// - Device can only transfer to/from Host
+/// - Disk can only transfer to/from Host
+/// - Host can transfer to Device, Disk, or Remote
+/// - Device ↔ Device is allowed (native CUDA)
+///
+/// Transfers that would violate this policy are staged through host:
+/// - Device → Remote: Device → Host → Remote (2 hops)
+/// - Disk → Remote: Disk → Host → Remote (2 hops)
+/// - Device ↔ Disk: Device → Host → Disk (2 hops)
+///
+/// # Optional Direct Paths
+///
+/// - `allow_gds`: Enables Disk ↔ Device direct transfers
+/// - `allow_gpu_rdma`: Enables Device → Remote direct transfers
+fn select_direct_strategy(
+    src: StorageKind,
+    dst: StorageKind,
+    dst_is_remote: bool,
+    capabilities: &TransferCapabilities,
+) -> TransferPlan {
+    use StorageKind::*;
+    use TransferStrategy::*;
+
+    // Handle remote destination
+    if dst_is_remote {
+        return select_remote_strategy(src, capabilities);
+    }
+
+    // Local-to-local transfers
+    match (src, dst) {
+        // Host ↔ Host - direct memcpy
+        (System, System) | (System, Pinned) | (Pinned, System) | (Pinned, Pinned) => {
+            TransferPlan::Direct(Memcpy)
+        }
+
+        // Host → Device - direct CUDA
+        (System, Device(_)) => panic!("System to Device transfers are not supported"),
+        (Pinned, Device(_)) => TransferPlan::Direct(CudaAsyncH2D),
+
+        // Device → Host - direct CUDA
+        (Device(_), System) => panic!("Device to System transfers are not supported"),
+        (Device(_), Pinned) => TransferPlan::Direct(CudaAsyncD2H),
+
+        // Device ↔ Device - direct CUDA
+        (Device(_), Device(_)) => TransferPlan::Direct(CudaAsyncD2D),
+
+        // Host ↔ Disk - direct NIXL
+        (System, Disk(_)) | (Pinned, Disk(_)) => TransferPlan::Direct(NixlWrite),
+        (Disk(_), System) | (Disk(_), Pinned) => TransferPlan::Direct(NixlReadFlipped),
+
+        // Disk ↔ Disk - NIXL doesn't seem to support direct transfers here.
+        // Leaving this as two-hop for now.
+        (Disk(_), Disk(_)) => TransferPlan::TwoHop {
+            first: NixlReadFlipped,
+            bounce_location: Pinned,
+            second: NixlWrite,
+        },
+
+        // Device ↔ Disk - check GDS capability
+        (Device(_), Disk(_)) => {
+            if capabilities.allows_device_disk_direct() {
+                // Direct GDS transfer
+                TransferPlan::Direct(NixlWrite)
+            } else {
+                // Stage through host: Device → Pinned → Disk
+                TransferPlan::TwoHop {
+                    first: CudaAsyncD2H,
+                    bounce_location: Pinned,
+                    second: NixlWrite,
+                }
+            }
+        }
+        (Disk(_), Device(_)) => {
+            if capabilities.allows_device_disk_direct() {
+                // Direct GDS transfer
+                TransferPlan::Direct(NixlRead)
+            } else {
+                // Stage through host: Disk → Pinned → Device
+                TransferPlan::TwoHop {
+                    first: NixlReadFlipped,
+                    bounce_location: Pinned,
+                    second: CudaAsyncH2D,
+                }
+            }
+        }
+    }
+}
+
+/// Select transfer strategy for remote destination.
+fn select_remote_strategy(src: StorageKind, capabilities: &TransferCapabilities) -> TransferPlan {
+    use StorageKind::*;
+    use TransferStrategy::*;
+
+    match src {
+        // Host → Remote - direct NIXL
+        System | Pinned => TransferPlan::Direct(NixlWrite),
+
+        // Device → Remote - check GPU RDMA capability
+        Device(_) => {
+            if capabilities.allows_device_remote_direct() {
+                // Direct GPU RDMA transfer
+                TransferPlan::Direct(NixlWrite)
+            } else {
+                // Stage through host: Device → Pinned → Remote
+                TransferPlan::TwoHop {
+                    first: CudaAsyncD2H,
+                    bounce_location: Pinned,
+                    second: NixlWrite,
+                }
+            }
+        }
+
+        // Disk → Remote - always stage through host
+        Disk(_) => TransferPlan::TwoHop {
+            first: NixlWrite,
+            bounce_location: Pinned,
+            second: NixlWrite,
+        },
+    }
+}
+
+fn select_remote_strategy_v2(
+    src: StorageKind,
+    is_src_local: bool,
+    dst: StorageKind,
+    is_dst_local: bool,
+    capabilities: &TransferCapabilities,
+) -> anyhow::Result<TransferPlan> {
+    // We only support System, Pinned and Device for remote transfers.
+    // Later we might support staged/bounce buffer transfers.
+
+    if matches!(src, StorageKind::Disk(_)) || matches!(dst, StorageKind::Disk(_)) {
+        return Err(anyhow::anyhow!(
+            "Neither local nor remote disk transfers are supported over NIXL at this time."
+        ));
+    }
+
+    if !capabilities.allow_gpu_rdma
+        && (matches!(src, StorageKind::Device(_)) || matches!(dst, StorageKind::Device(_)))
+    {
+        return Err(anyhow::anyhow!(
+            "GPU RDMA is disabled - this transfer requires GPU RDMA."
+        ));
+    }
+
+    if is_src_local && !is_dst_local {
+        return Ok(TransferPlan::Direct(TransferStrategy::NixlWrite));
+    }
+
+    if is_dst_local && !is_src_local {
+        return Ok(TransferPlan::Direct(TransferStrategy::NixlReadFlipped));
+    }
+
+    unreachable!("Both src and dst are remote - this is not supported.");
+}
+
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::*;
+
+    fn default_caps() -> TransferCapabilities {
+        TransferCapabilities::default()
+    }
+
+    #[test]
+    fn test_host_to_host_transfers() {
+        let caps = default_caps();
+        assert_eq!(
+            select_direct_strategy(StorageKind::System, StorageKind::System, false, &caps),
+            TransferPlan::Direct(TransferStrategy::Memcpy)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::System, StorageKind::Pinned, false, &caps),
+            TransferPlan::Direct(TransferStrategy::Memcpy)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::Pinned, StorageKind::System, false, &caps),
+            TransferPlan::Direct(TransferStrategy::Memcpy)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::Pinned, StorageKind::Pinned, false, &caps),
+            TransferPlan::Direct(TransferStrategy::Memcpy)
+        );
+    }
+
+    #[test]
+    fn test_host_to_device_transfers() {
+        let caps = default_caps();
+        // // System (unpinned) to device should be blocking
+        // assert_eq!(
+        //     select_direct_strategy(StorageKind::System, StorageKind::Device(0), false, &caps),
+        //     TransferPlan::Direct(TransferStrategy::CudaBlockingH2D)
+        // );
+
+        // Pinned to device should be async
+        assert_eq!(
+            select_direct_strategy(StorageKind::Pinned, StorageKind::Device(0), false, &caps),
+            TransferPlan::Direct(TransferStrategy::CudaAsyncH2D)
+        );
+    }
+
+    #[test]
+    fn test_device_to_host_transfers() {
+        let caps = default_caps();
+        //    // Device to system should be blocking
+        //     assert_eq!(
+        //         select_direct_strategy(StorageKind::Device(0), StorageKind::System, false, &caps),
+        //         TransferPlan::Direct(TransferStrategy::CudaBlockingD2H)
+        //     );
+
+        // Device to pinned should be async
+        assert_eq!(
+            select_direct_strategy(StorageKind::Device(0), StorageKind::Pinned, false, &caps),
+            TransferPlan::Direct(TransferStrategy::CudaAsyncD2H)
+        );
+    }
+
+    #[test]
+    fn test_device_to_device_transfers() {
+        let caps = default_caps();
+        assert_eq!(
+            select_direct_strategy(StorageKind::Device(0), StorageKind::Device(1), false, &caps),
+            TransferPlan::Direct(TransferStrategy::CudaAsyncD2D)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::Device(3), StorageKind::Device(3), false, &caps),
+            TransferPlan::Direct(TransferStrategy::CudaAsyncD2D)
+        );
+    }
+
+    #[test]
+    fn test_disk_to_host_transfers() {
+        let caps = default_caps();
+        // Disk to host - direct NIXL
+        assert_eq!(
+            select_direct_strategy(StorageKind::Disk(42), StorageKind::System, false, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlReadFlipped)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::Disk(42), StorageKind::Pinned, false, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlReadFlipped)
+        );
+    }
+
+    #[test]
+    fn test_host_to_disk_transfers() {
+        let caps = default_caps();
+        // Host to disk - direct NIXL
+        assert_eq!(
+            select_direct_strategy(StorageKind::System, StorageKind::Disk(42), false, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlWrite)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::Pinned, StorageKind::Disk(42), false, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlWrite)
+        );
+    }
+
+    #[test]
+    fn test_device_to_disk_without_gds() {
+        let caps = default_caps(); // GDS disabled
+        // Device → Disk should use bounce buffer
+        let plan =
+            select_direct_strategy(StorageKind::Device(0), StorageKind::Disk(42), false, &caps);
+        match plan {
+            TransferPlan::TwoHop {
+                first,
+                bounce_location,
+                second,
+            } => {
+                assert_eq!(first, TransferStrategy::CudaAsyncD2H);
+                assert_eq!(bounce_location, StorageKind::Pinned);
+                assert_eq!(second, TransferStrategy::NixlWrite);
+            }
+            _ => panic!("Expected TwoHop plan"),
+        }
+    }
+
+    #[test]
+    fn test_disk_to_device_without_gds() {
+        let caps = default_caps(); // GDS disabled
+        // Disk → Device should use bounce buffer
+        let plan =
+            select_direct_strategy(StorageKind::Disk(42), StorageKind::Device(0), false, &caps);
+        match plan {
+            TransferPlan::TwoHop {
+                first,
+                bounce_location,
+                second,
+            } => {
+                assert_eq!(first, TransferStrategy::NixlReadFlipped);
+                assert_eq!(bounce_location, StorageKind::Pinned);
+                assert_eq!(second, TransferStrategy::CudaAsyncH2D);
+            }
+            _ => panic!("Expected TwoHop plan"),
+        }
+    }
+
+    #[test]
+    fn test_device_to_disk_with_gds() {
+        let caps = TransferCapabilities::default().with_gds(true);
+        // Device → Disk should be direct with GDS
+        assert_eq!(
+            select_direct_strategy(StorageKind::Device(0), StorageKind::Disk(42), false, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlWrite)
+        );
+    }
+
+    #[test]
+    fn test_disk_to_device_with_gds() {
+        let caps = TransferCapabilities::default().with_gds(true);
+        // Disk → Device should be direct with GDS
+        assert_eq!(
+            select_direct_strategy(StorageKind::Disk(42), StorageKind::Device(0), false, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlRead)
+        );
+    }
+
+    #[test]
+    fn test_host_to_remote() {
+        let caps = default_caps();
+        // Host → Remote - always direct
+        assert_eq!(
+            select_direct_strategy(StorageKind::System, StorageKind::System, true, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlWrite)
+        );
+        assert_eq!(
+            select_direct_strategy(StorageKind::Pinned, StorageKind::Pinned, true, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlWrite)
+        );
+    }
+
+    #[test]
+    fn test_device_to_remote_without_rdma() {
+        let caps = default_caps(); // GPU RDMA disabled
+        // Device → Remote should use bounce buffer
+        let plan = select_direct_strategy(StorageKind::Device(0), StorageKind::System, true, &caps);
+        match plan {
+            TransferPlan::TwoHop {
+                first,
+                bounce_location,
+                second,
+            } => {
+                assert_eq!(first, TransferStrategy::CudaAsyncD2H);
+                assert_eq!(bounce_location, StorageKind::Pinned);
+                assert_eq!(second, TransferStrategy::NixlWrite);
+            }
+            _ => panic!("Expected TwoHop plan"),
+        }
+    }
+
+    #[test]
+    fn test_device_to_remote_with_rdma() {
+        let caps = TransferCapabilities::default().with_gpu_rdma(true);
+        // Device → Remote should be direct with GPU RDMA
+        assert_eq!(
+            select_direct_strategy(StorageKind::Device(0), StorageKind::Device(0), true, &caps),
+            TransferPlan::Direct(TransferStrategy::NixlWrite)
+        );
+    }
+
+    #[test]
+    fn test_disk_to_remote() {
+        let caps = default_caps();
+        // Disk → Remote always uses bounce buffer
+        let plan = select_direct_strategy(StorageKind::Disk(42), StorageKind::System, true, &caps);
+        match plan {
+            TransferPlan::TwoHop {
+                first,
+                bounce_location,
+                second,
+            } => {
+                assert_eq!(first, TransferStrategy::NixlWrite);
+                assert_eq!(bounce_location, StorageKind::Pinned);
+                assert_eq!(second, TransferStrategy::NixlWrite);
+            }
+            _ => panic!("Expected TwoHop plan"),
+        }
+    }
+}
--- a/lib/kvbm-physical/src/transfer/testing.rs
+++ b/lib/kvbm-physical/src/transfer/testing.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Round-trip testing infrastructure for transfer verification.
+//!
+//! This module provides utilities for testing data integrity across transfers
+//! by comparing checksums after round-trip operations:
+//! 1. Source blocks (host) → Intermediate (device/disk/remote)
+//! 2. Intermediate → Destination blocks (host, different IDs)
+//! 3. Verify checksums match between source and destination
+
+use super::context::TransferContext;
+use super::{
+    BlockChecksum, FillPattern, PhysicalLayout, StorageKind, compute_block_checksums, fill_blocks,
+    transfer_blocks,
+};
+use anyhow::{Result, anyhow};
+use std::collections::HashMap;
+
+/// Result of a round-trip test.
+#[derive(Debug)]
+pub struct RoundTripTestResult {
+    /// Source block checksums (keyed by source block ID)
+    pub source_checksums: HashMap<usize, BlockChecksum>,
+
+    /// Destination block checksums (keyed by destination block ID)
+    pub dest_checksums: HashMap<usize, BlockChecksum>,
+
+    /// Block ID mapping used (src_id, dst_id)
+    pub block_mapping: Vec<(usize, usize)>,
+
+    /// Whether all checksums matched
+    pub success: bool,
+
+    /// Mismatched blocks (if any)
+    pub mismatches: Vec<(usize, usize)>, // (src_id, dst_id) pairs that didn't match
+}
+
+impl RoundTripTestResult {
+    /// Check if the round-trip test passed.
+    pub fn is_success(&self) -> bool {
+        self.success
+    }
+
+    /// Get the number of blocks tested.
+    pub fn num_blocks(&self) -> usize {
+        self.block_mapping.len()
+    }
+
+    /// Get a detailed report of the test results.
+    pub fn report(&self) -> String {
+        if self.success {
+            format!(
+                "Round-trip test PASSED: {}/{} blocks verified successfully",
+                self.num_blocks(),
+                self.num_blocks()
+            )
+        } else {
+            format!(
+                "Round-trip test FAILED: {}/{} blocks mismatched\nMismatches: {:?}",
+                self.mismatches.len(),
+                self.num_blocks(),
+                self.mismatches
+            )
+        }
+    }
+}
+
+/// Builder for round-trip tests.
+///
+/// This allows configuring a test that transfers data from source blocks
+/// to intermediate storage and back to different destination blocks,
+/// verifying data integrity via checksums.
+pub struct RoundTripTest {
+    /// Source physical layout (must be local)
+    source: PhysicalLayout,
+
+    /// Intermediate physical layout (can be remote/device/disk)
+    intermediate: PhysicalLayout,
+
+    /// Destination physical layout (must be local)
+    destination: PhysicalLayout,
+
+    /// Block mapping: (src_id, intermediate_id, dst_id)
+    block_mapping: Vec<(usize, usize, usize)>,
+
+    /// Fill pattern for source blocks
+    fill_pattern: FillPattern,
+}
+
+impl RoundTripTest {
+    /// Create a new round-trip test.
+    ///
+    /// # Arguments
+    /// * `source` - Source physical layout (must be local)
+    /// * `intermediate` - Intermediate physical layout
+    /// * `destination` - Destination physical layout (must be local)
+    pub fn new(
+        source: PhysicalLayout,
+        intermediate: PhysicalLayout,
+        destination: PhysicalLayout,
+    ) -> Result<Self> {
+        if source.is_remote() {
+            return Err(anyhow!("Source layout must be local"));
+        }
+        if destination.is_remote() {
+            return Err(anyhow!("Destination layout must be local"));
+        }
+
+        Ok(Self {
+            source,
+            intermediate,
+            destination,
+            block_mapping: Vec::new(),
+            fill_pattern: FillPattern::Sequential,
+        })
+    }
+
+    /// Set the fill pattern for source blocks.
+    pub fn with_fill_pattern(mut self, pattern: FillPattern) -> Self {
+        self.fill_pattern = pattern;
+        self
+    }
+
+    /// Add a block mapping for the round-trip test.
+    ///
+    /// # Arguments
+    /// * `src_id` - Source block ID
+    /// * `intermediate_id` - Intermediate block ID
+    /// * `dst_id` - Destination block ID
+    pub fn add_block_mapping(
+        mut self,
+        src_id: usize,
+        intermediate_id: usize,
+        dst_id: usize,
+    ) -> Self {
+        self.block_mapping.push((src_id, intermediate_id, dst_id));
+        self
+    }
+
+    /// Add multiple block mappings at once.
+    ///
+    /// This is a convenience method for adding several mappings.
+    pub fn with_block_mappings(mut self, mappings: &[(usize, usize, usize)]) -> Self {
+        self.block_mapping.extend_from_slice(mappings);
+        self
+    }
+
+    /// Run the round-trip test.
+    ///
+    /// # Workflow
+    /// 1. Fill source blocks with the specified pattern
+    /// 2. Compute source checksums
+    /// 3. Transfer source → intermediate
+    /// 4. Transfer intermediate → destination
+    /// 5. Compute destination checksums
+    /// 6. Compare checksums
+    ///
+    /// # Arguments
+    /// * `ctx` - Transfer context with CUDA stream and NIXL agent
+    pub async fn run(self, ctx: &TransferContext) -> Result<RoundTripTestResult> {
+        if self.block_mapping.is_empty() {
+            return Err(anyhow!("No block mappings specified"));
+        }
+
+        // Step 1: Fill source blocks
+        let src_ids: Vec<usize> = self.block_mapping.iter().map(|(src, _, _)| *src).collect();
+        fill_blocks(&self.source, &src_ids, self.fill_pattern)?;
+
+        // Step 2: Compute source checksums
+        let source_checksums = compute_block_checksums(&self.source, &src_ids)?;
+
+        // Step 3: Transfer source → intermediate
+        let src_ids_intermediate: Vec<usize> =
+            self.block_mapping.iter().map(|(src, _, _)| *src).collect();
+        let inter_ids_from_src: Vec<usize> = self
+            .block_mapping
+            .iter()
+            .map(|(_, inter, _)| *inter)
+            .collect();
+        let notification = transfer_blocks(
+            &self.source,
+            &self.intermediate,
+            &src_ids_intermediate,
+            &inter_ids_from_src,
+            ctx,
+        )?;
+        notification.await?;
+
+        // Step 4: Transfer intermediate → destination
+        let inter_ids_to_dst: Vec<usize> = self
+            .block_mapping
+            .iter()
+            .map(|(_, inter, _)| *inter)
+            .collect();
+        let dst_ids_from_inter: Vec<usize> =
+            self.block_mapping.iter().map(|(_, _, dst)| *dst).collect();
+        let notification = transfer_blocks(
+            &self.intermediate,
+            &self.destination,
+            &inter_ids_to_dst,
+            &dst_ids_from_inter,
+            ctx,
+        )?;
+        notification.await?;
+
+        // Step 5: Compute destination checksums
+        let dst_ids: Vec<usize> = self.block_mapping.iter().map(|(_, _, dst)| *dst).collect();
+        let dest_checksums = compute_block_checksums(&self.destination, &dst_ids)?;
+
+        // Step 6: Compare checksums
+        let mut mismatches = Vec::new();
+        for (src_id, _, dst_id) in &self.block_mapping {
+            let src_checksum = &source_checksums[src_id];
+            let dst_checksum = &dest_checksums[dst_id];
+
+            if src_checksum != dst_checksum {
+                mismatches.push((*src_id, *dst_id));
+            }
+        }
+
+        let success = mismatches.is_empty();
+        let block_mapping: Vec<(usize, usize)> = self
+            .block_mapping
+            .iter()
+            .map(|(src, _, dst)| (*src, *dst))
+            .collect();
+
+        Ok(RoundTripTestResult {
+            source_checksums,
+            dest_checksums,
+            block_mapping,
+            success,
+            mismatches,
+        })
+    }
+}
+
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::*;
+    use crate::v2::layout::{
+        FullyContiguousLayout, Layout, LayoutConfig, MemoryRegion, OwnedMemoryRegion,
+    };
+    use std::sync::Arc;
+
+    // Helper to create a minimal transfer context for testing
+    // In real tests with CUDA/NIXL, this would be properly constructed
+    fn create_test_context() -> TransferContext {
+        // For now, we'll skip these tests if CUDA is not available
+        // In the future, we can mock TransferContext or use conditional compilation
+        todo!("Create test context - requires CUDA/NIXL setup")
+    }
+
+    #[tokio::test]
+    #[ignore = "Requires CUDA/NIXL setup"]
+    async fn test_round_trip_host_to_host() {
+        // Create three layouts: source, intermediate, destination
+        let (src_layout, _src_mem) = create_test_layout(4);
+        let (inter_layout, _inter_mem) = create_test_layout(4);
+        let (dst_layout, _dst_mem) = create_test_layout(4);
+
+        let source = PhysicalLayout::new_local(src_layout, StorageKind::System);
+        let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned);
+        let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System);
+
+        // Build round-trip test with different block IDs
+        // Source: blocks [0, 1, 2, 3]
+        // Intermediate: blocks [0, 1, 2, 3]
+        // Destination: blocks [0, 1, 2, 3] (different memory than source)
+        let test = RoundTripTest::new(source, intermediate, destination)
+            .unwrap()
+            .with_fill_pattern(FillPattern::Sequential)
+            .add_block_mapping(0, 0, 0)
+            .add_block_mapping(1, 1, 1)
+            .add_block_mapping(2, 2, 2)
+            .add_block_mapping(3, 3, 3);
+
+        // Create a transfer context (requires actual CUDA/NIXL setup)
+        let ctx = create_test_context();
+
+        // Run the test
+        let result = test.run(&ctx).await.unwrap();
+
+        assert!(result.is_success(), "{}", result.report());
+        assert_eq!(result.num_blocks(), 4);
+    }
+
+    #[tokio::test]
+    #[ignore = "Requires CUDA/NIXL setup"]
+    async fn test_round_trip_different_block_ids() {
+        // Create layouts with enough blocks
+        let (src_layout, _src_mem) = create_test_layout(8);
+        let (inter_layout, _inter_mem) = create_test_layout(8);
+        let (dst_layout, _dst_mem) = create_test_layout(8);
+
+        let source = PhysicalLayout::new_local(src_layout, StorageKind::System);
+        let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned);
+        let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System);
+
+        // Test with non-overlapping block IDs
+        // Source: blocks [0, 1, 2, 3]
+        // Intermediate: blocks [2, 3, 4, 5]
+        // Destination: blocks [4, 5, 6, 7]
+        let test = RoundTripTest::new(source, intermediate, destination)
+            .unwrap()
+            .with_fill_pattern(FillPattern::BlockBased)
+            .with_block_mappings(&[(0, 2, 4), (1, 3, 5), (2, 4, 6), (3, 5, 7)]);
+
+        let ctx = create_test_context();
+        let result = test.run(&ctx).await.unwrap();
+
+        assert!(result.is_success(), "{}", result.report());
+        assert_eq!(result.num_blocks(), 4);
+    }
+
+    #[test]
+    fn test_round_trip_builder() {
+        let (src_layout, _) = create_test_layout(4);
+        let (inter_layout, _) = create_test_layout(4);
+        let (dst_layout, _) = create_test_layout(4);
+
+        let source = PhysicalLayout::new_local(src_layout, StorageKind::System);
+        let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned);
+        let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System);
+
+        let test = RoundTripTest::new(source, intermediate, destination)
+            .unwrap()
+            .with_fill_pattern(FillPattern::Constant(42))
+            .add_block_mapping(0, 0, 1)
+            .add_block_mapping(1, 1, 2);
+
+        assert_eq!(test.block_mapping.len(), 2);
+    }
+
+    #[test]
+    fn test_round_trip_requires_local_source() {
+        let (src_layout, _) = create_test_layout(1);
+        let (inter_layout, _) = create_test_layout(1);
+        let (dst_layout, _) = create_test_layout(1);
+
+        let source =
+            PhysicalLayout::new_remote(src_layout, StorageKind::System, "remote".to_string());
+        let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned);
+        let destination = PhysicalLayout::new_local(dst_layout, StorageKind::System);
+
+        let result = RoundTripTest::new(source, intermediate, destination);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_round_trip_requires_local_destination() {
+        let (src_layout, _) = create_test_layout(1);
+        let (inter_layout, _) = create_test_layout(1);
+        let (dst_layout, _) = create_test_layout(1);
+
+        let source = PhysicalLayout::new_local(src_layout, StorageKind::System);
+        let intermediate = PhysicalLayout::new_local(inter_layout, StorageKind::Pinned);
+        let destination =
+            PhysicalLayout::new_remote(dst_layout, StorageKind::System, "remote".to_string());
+
+        let result = RoundTripTest::new(source, intermediate, destination);
+        assert!(result.is_err());
+    }
+}
--- a/lib/kvbm-physical/src/transfer/tests/local_transfers.rs
+++ b/lib/kvbm-physical/src/transfer/tests/local_transfers.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Local transfer tests where source and destination use the same NIXL agent.
+//!
+//! These tests verify data integrity across:
+//! - Different storage types (System, Pinned, Device)
+//! - Different layout types (Fully Contiguous, Layer-wise)
+//! - Different transfer strategies (Memcpy, CUDA H2D/D2H)
+
+use super::*;
+use crate::transfer::TransferCapabilities;
+use crate::transfer::executor::TransferOptionsInternal;
+use crate::transfer::executor::execute_transfer;
+use crate::transfer::{can_use_whole_block_transfer, validate_layout_compatibility};
+use anyhow::Result;
+use rstest::rstest;
+
+// ============================================================================
+// System <=> System Tests (Memcpy)
+// ============================================================================
+
+#[derive(Clone)]
+enum LayoutType {
+    FC,
+    LW,
+}
+
+fn build_layout(
+    agent: NixlAgent,
+    layout_type: LayoutType,
+    storage_kind: StorageKind,
+    num_blocks: usize,
+) -> PhysicalLayout {
+    match layout_type {
+        LayoutType::FC => create_fc_layout(agent, storage_kind, num_blocks),
+        LayoutType::LW => create_lw_layout(agent, storage_kind, num_blocks),
+    }
+}
+
+/// Check if a transfer between two storage kinds requires GDS_MT (Device ↔ Disk direct).
+fn requires_gds(src_kind: StorageKind, dst_kind: StorageKind) -> bool {
+    matches!(
+        (src_kind, dst_kind),
+        (StorageKind::Device(_), StorageKind::Disk(_))
+            | (StorageKind::Disk(_), StorageKind::Device(_))
+    )
+}
+
+/// Check if a transfer between two storage kinds is unsupported.
+///
+/// Device ↔ System transfers are not supported - must use Pinned memory for CUDA transfers.
+fn is_unsupported_transfer(src_kind: StorageKind, dst_kind: StorageKind) -> bool {
+    matches!(
+        (src_kind, dst_kind),
+        (StorageKind::Device(_), StorageKind::System)
+            | (StorageKind::System, StorageKind::Device(_))
+    )
+}
+
+/// Probe whether a NIXL backend is available by attempting to add it to a temporary agent.
+fn is_nixl_backend_available(backend: &str) -> bool {
+    let mut agent = match NixlAgent::new("__backend_probe__") {
+        Ok(a) => a,
+        Err(_) => return false,
+    };
+    agent.add_backend(backend).is_ok()
+}
+
+fn build_agent_for_kinds(kinds: &[StorageKind]) -> Result<NixlAgent> {
+    let mut agent = NixlAgent::new("agent")?;
+    let mut added_backends = Vec::new();
+
+    // Determine required backends for all storage kinds
+    for &kind in kinds {
+        match kind {
+            StorageKind::System | StorageKind::Pinned => {
+                if !added_backends.contains(&"POSIX") {
+                    let _ = agent.add_backend("POSIX"); // Optional for DRAM
+                    added_backends.push("POSIX");
+                }
+            }
+            StorageKind::Device(_) => {
+                if !added_backends.contains(&"UCX") {
+                    agent.add_backend("UCX")?; // Required for VRAM
+                    added_backends.push("UCX");
+                }
+            }
+            StorageKind::Disk(_) => {
+                if !added_backends.contains(&"POSIX") {
+                    let _ = agent.add_backend("POSIX"); // Optional for disk I/O
+                    added_backends.push("POSIX");
+                }
+            }
+        }
+    }
+
+    // GDS_MT is optional for Device <-> Disk (will be checked separately)
+    for window in kinds.windows(2) {
+        if requires_gds(window[0], window[1]) && !added_backends.contains(&"GDS_MT") {
+            let _ = agent.add_backend("GDS_MT");
+            break;
+        }
+    }
+
+    Ok(agent)
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_p2p(
+    #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType,
+    #[values(
+        StorageKind::System,
+        StorageKind::Pinned,
+        StorageKind::Device(0),
+        StorageKind::Disk(0)
+    )]
+    src_kind: StorageKind,
+    #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType,
+    #[values(
+        StorageKind::System,
+        StorageKind::Pinned,
+        StorageKind::Device(0),
+        StorageKind::Disk(0)
+    )]
+    dst_kind: StorageKind,
+) -> Result<()> {
+    // Skip unsupported Device ↔ System transfers (must use Pinned for CUDA)
+    if is_unsupported_transfer(src_kind, dst_kind) {
+        eprintln!(
+            "Skipping unsupported Device ↔ System transfer: src={:?}, dst={:?}",
+            src_kind, dst_kind
+        );
+        return Ok(());
+    }
+
+    // Device ↔ Disk direct transfers require GDS_MT
+    if requires_gds(src_kind, dst_kind) && !is_nixl_backend_available("GDS_MT") {
+        eprintln!("Skipping Device ↔ Disk test - GDS_MT backend unavailable");
+        return Ok(());
+    }
+
+    use crate::transfer::{BounceBufferInternal, executor::TransferOptionsInternal};
+
+    let agent = build_agent_for_kinds(&[src_kind, dst_kind])?;
+
+    let src = build_layout(agent.clone(), src_layout, src_kind, 4);
+    let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4);
+
+    let bounce_layout = build_layout(agent.clone(), LayoutType::FC, StorageKind::Pinned, 4);
+
+    let bounce_buffer_spec = BounceBufferInternal::from_layout(bounce_layout, vec![0, 1]);
+
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?;
+    let ctx = create_transfer_context(agent, None).unwrap();
+
+    let options = TransferOptionsInternal::builder()
+        .bounce_buffer(bounce_buffer_spec)
+        .build()?;
+
+    let notification =
+        execute_transfer(&src, &dst, &src_blocks, &dst_blocks, options, ctx.context())?;
+    notification.await?;
+
+    verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?;
+
+    Ok(())
+}
+
+#[rstest]
+#[tokio::test]
+async fn test_roundtrip(
+    #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType,
+    #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))]
+    src_kind: StorageKind,
+    #[values(LayoutType::FC, LayoutType::LW)] inter_layout: LayoutType,
+    #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))]
+    inter_kind: StorageKind,
+    #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType,
+    #[values(StorageKind::System, StorageKind::Pinned, StorageKind::Device(0))]
+    dst_kind: StorageKind,
+) -> Result<()> {
+    // Skip unsupported Device ↔ System transfers (must use Pinned for CUDA)
+    if is_unsupported_transfer(src_kind, inter_kind)
+        || is_unsupported_transfer(inter_kind, dst_kind)
+    {
+        eprintln!(
+            "Skipping unsupported Device ↔ System transfer: src={:?}, inter={:?}, dst={:?}",
+            src_kind, inter_kind, dst_kind
+        );
+        return Ok(());
+    }
+
+    use crate::transfer::executor::TransferOptionsInternal;
+
+    let agent = build_agent_for_kinds(&[src_kind, inter_kind, dst_kind])?;
+
+    // Create layouts: source pinned, device intermediate, destination pinned
+    let src = build_layout(agent.clone(), src_layout, src_kind, 4);
+    let device = build_layout(agent.clone(), inter_layout, inter_kind, 4);
+    let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4);
+
+    let src_blocks = vec![0, 1];
+    let device_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill source and compute checksums
+    let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?;
+    let ctx = create_transfer_context(agent, None).unwrap();
+
+    // Transfer: Pinned[0,1] -> Device[0,1]
+    let notification = execute_transfer(
+        &src,
+        &device,
+        &src_blocks,
+        &device_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Transfer: Device[0,1] -> Pinned[2,3]
+    let notification = execute_transfer(
+        &device,
+        &dst,
+        &device_blocks,
+        &dst_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Verify checksums match
+    verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?;
+
+    Ok(())
+}
+
+#[cfg(feature = "testing-nixl-gds")]
+#[rstest]
+#[case(StorageKind::Device(0), StorageKind::Disk(0))]
+#[case(StorageKind::Disk(0), StorageKind::Device(0))]
+#[tokio::test]
+async fn test_gds(
+    #[case] src_kind: StorageKind,
+    #[values(LayoutType::FC, LayoutType::LW)] src_layout: LayoutType,
+    #[case] dst_kind: StorageKind,
+    #[values(LayoutType::FC, LayoutType::LW)] dst_layout: LayoutType,
+) -> Result<()> {
+    let capabilities = TransferCapabilities::default().with_gds_if_supported();
+
+    if !capabilities.allow_gds {
+        println!("System does not support GDS. Skipping test.");
+        return Ok(());
+    }
+
+    let agent = build_agent_for_kinds(&[src_kind, dst_kind])?;
+
+    let src = build_layout(agent.clone(), src_layout, src_kind, 4);
+    let dst = build_layout(agent.clone(), dst_layout, dst_kind, 4);
+
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?;
+    let ctx = create_transfer_context(agent, Some(capabilities)).unwrap();
+
+    let notification = execute_transfer(
+        &src,
+        &dst,
+        &src_blocks,
+        &dst_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?;
+
+    Ok(())
+}
+
+#[rstest]
+#[case(1024)]
+#[case(2048)]
+#[case(4096)]
+#[case(8192)]
+#[case(16384)]
+#[tokio::test]
+async fn test_large_block_counts(#[case] block_count: usize) {
+    let agent = create_test_agent(&format!("test_large_block_counts_{}", block_count));
+
+    let src = create_fc_layout(agent.clone(), StorageKind::Pinned, block_count);
+    let device = create_fc_layout(agent.clone(), StorageKind::Device(0), block_count);
+
+    let src_blocks = (0..block_count).collect::<Vec<_>>();
+    let device_blocks = (0..block_count).collect::<Vec<_>>();
+
+    let ctx = create_transfer_context(agent, None).unwrap();
+    let notification = execute_transfer(
+        &src,
+        &device,
+        &src_blocks,
+        &device_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )
+    .unwrap();
+    notification.await.unwrap();
+}
+
+// ============================================================================
+// Parameterized Bounce Tests with Guard Block Validation
+// ============================================================================
+
+/// Test bounce transfers with guard block validation.
+///
+/// This test validates that:
+/// 1. Data can be transferred: host[src_blocks] → bounce[src_blocks] → host[dst_blocks]
+/// 2. Guard blocks adjacent to dst_blocks remain unchanged (no memory corruption)
+/// 3. Works correctly with different storage types, layouts, and transfer modes
+///
+/// Test pattern (6 blocks total):
+/// - Source blocks: [0, 1]
+/// - Destination blocks: [3, 4]
+/// - Guard blocks: [2, 5] (adjacent to destination, should remain unchanged)
+#[rstest]
+// Storage combinations (host, bounce)
+#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")]
+#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")]
+#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")]
+#[tokio::test]
+async fn test_bounce_with_guards_fc_fc_full(
+    #[case] host_storage: StorageKind,
+    #[case] bounce_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_bounce_with_guards_impl(
+        host_storage,
+        bounce_storage,
+        LayoutKind::FC,
+        LayoutKind::FC,
+        TransferMode::FullBlocks,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")]
+#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")]
+#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")]
+#[tokio::test]
+async fn test_bounce_with_guards_fc_lw_full(
+    #[case] host_storage: StorageKind,
+    #[case] bounce_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_bounce_with_guards_impl(
+        host_storage,
+        bounce_storage,
+        LayoutKind::FC,
+        LayoutKind::LW,
+        TransferMode::FullBlocks,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")]
+#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")]
+#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")]
+#[tokio::test]
+async fn test_bounce_with_guards_lw_fc_full(
+    #[case] host_storage: StorageKind,
+    #[case] bounce_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_bounce_with_guards_impl(
+        host_storage,
+        bounce_storage,
+        LayoutKind::LW,
+        LayoutKind::FC,
+        TransferMode::FullBlocks,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")]
+#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")]
+#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")]
+#[tokio::test]
+async fn test_bounce_with_guards_lw_lw_full(
+    #[case] host_storage: StorageKind,
+    #[case] bounce_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_bounce_with_guards_impl(
+        host_storage,
+        bounce_storage,
+        LayoutKind::LW,
+        LayoutKind::LW,
+        TransferMode::FullBlocks,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")]
+#[tokio::test]
+async fn test_bounce_with_guards_fc_fc_layer0(
+    #[case] host_storage: StorageKind,
+    #[case] bounce_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_bounce_with_guards_impl(
+        host_storage,
+        bounce_storage,
+        LayoutKind::FC,
+        LayoutKind::FC,
+        TransferMode::FirstLayerOnly,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::Pinned, StorageKind::Device(0), "pin_dev")]
+#[tokio::test]
+async fn test_bounce_with_guards_lw_lw_layer0(
+    #[case] host_storage: StorageKind,
+    #[case] bounce_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_bounce_with_guards_impl(
+        host_storage,
+        bounce_storage,
+        LayoutKind::LW,
+        LayoutKind::LW,
+        TransferMode::FirstLayerOnly,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+/// Implementation helper for bounce tests with guard blocks.
+async fn test_bounce_with_guards_impl(
+    host_storage: StorageKind,
+    bounce_storage: StorageKind,
+    host_layout: LayoutKind,
+    bounce_layout: LayoutKind,
+    mode: TransferMode,
+    name_suffix: &str,
+) -> Result<()> {
+    let num_blocks = 6;
+    let test_name = format!(
+        "bounce_{}_{:?}_{:?}_{}_{}",
+        name_suffix,
+        host_layout,
+        bounce_layout,
+        mode.suffix(),
+        std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_millis()
+    );
+    let agent = create_test_agent(&test_name);
+
+    // Create layouts
+    let host = create_layout(
+        agent.clone(),
+        LayoutSpec::new(host_layout, host_storage),
+        num_blocks,
+    );
+    let bounce = create_layout(
+        agent.clone(),
+        LayoutSpec::new(bounce_layout, bounce_storage),
+        num_blocks,
+    );
+
+    // Block assignments:
+    // - Transfer: host[0,1] → bounce[0,1] → host[3,4]
+    // - Guards: host[2,5] (should remain unchanged)
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![3, 4];
+    let guard_blocks = vec![2, 5];
+
+    // Setup: Fill source blocks and guard blocks
+    let src_checksums =
+        fill_and_checksum_with_mode(&host, &src_blocks, FillPattern::Sequential, mode)?;
+    let guard_checksums = create_guard_blocks(&host, &guard_blocks, FillPattern::Constant(0xFF))?;
+
+    let ctx = create_transfer_context(agent, None)?;
+
+    // Execute bounce: host[0,1] → bounce[0,1]
+    let mut options_builder = TransferOptionsInternal::builder();
+    if let Some(range) = mode.layer_range() {
+        options_builder = options_builder.layer_range(range);
+    }
+    let notification = execute_transfer(
+        &host,
+        &bounce,
+        &src_blocks,
+        &src_blocks,
+        options_builder.build()?,
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Execute bounce: bounce[0,1] → host[3,4]
+    let mut options_builder = TransferOptionsInternal::builder();
+    if let Some(range) = mode.layer_range() {
+        options_builder = options_builder.layer_range(range);
+    }
+    let notification = execute_transfer(
+        &bounce,
+        &host,
+        &src_blocks,
+        &dst_blocks,
+        options_builder.build()?,
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Verify: Data integrity + guards unchanged
+    verify_checksums_by_position_with_mode(&src_checksums, &src_blocks, &host, &dst_blocks, mode)?;
+    verify_guard_blocks_unchanged(&host, &guard_blocks, &guard_checksums)?;
+
+    Ok(())
+}
+
+// ============================================================================
+// Parameterized Direct Transfer Tests
+// ============================================================================
+
+/// Test direct transfers with parameterization over storage, layout, and transfer mode.
+///
+/// This demonstrates the DRY parameterized approach that can replace the 18 individual
+/// tests above (System<=>System, Pinned<=>Pinned, cross-type, etc).
+///
+/// Note: Only tests System<=>System, Pinned<=>Pinned, and System<=>Pinned since we can only
+/// fill/checksum System and Pinned storage. For Device tests, use bounce tests instead.
+#[rstest]
+// Storage combinations (only fillable storage types)
+#[case(StorageKind::System, StorageKind::System, "sys_sys")]
+#[case(StorageKind::Pinned, StorageKind::Pinned, "pin_pin")]
+#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")]
+#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")]
+#[tokio::test]
+async fn test_direct_transfer_fc_fc_full(
+    #[case] src_storage: StorageKind,
+    #[case] dst_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_direct_transfer_impl(
+        src_storage,
+        dst_storage,
+        LayoutKind::FC,
+        LayoutKind::FC,
+        TransferMode::FullBlocks,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::System, StorageKind::Pinned, "sys_pin")]
+#[case(StorageKind::Pinned, StorageKind::System, "pin_sys")]
+#[tokio::test]
+async fn test_direct_transfer_fc_lw_layer0(
+    #[case] src_storage: StorageKind,
+    #[case] dst_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_direct_transfer_impl(
+        src_storage,
+        dst_storage,
+        LayoutKind::FC,
+        LayoutKind::LW,
+        TransferMode::FirstLayerOnly,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+#[rstest]
+#[case(StorageKind::Pinned, StorageKind::Pinned, "pin_pin")]
+#[tokio::test]
+async fn test_direct_transfer_lw_lw_layer1(
+    #[case] src_storage: StorageKind,
+    #[case] dst_storage: StorageKind,
+    #[case] name_suffix: &str,
+) {
+    test_direct_transfer_impl(
+        src_storage,
+        dst_storage,
+        LayoutKind::LW,
+        LayoutKind::LW,
+        TransferMode::SecondLayerOnly,
+        name_suffix,
+    )
+    .await
+    .unwrap();
+}
+
+/// Implementation helper for direct transfer tests.
+async fn test_direct_transfer_impl(
+    src_storage: StorageKind,
+    dst_storage: StorageKind,
+    src_layout: LayoutKind,
+    dst_layout: LayoutKind,
+    mode: TransferMode,
+    name_suffix: &str,
+) -> Result<()> {
+    let num_blocks = 4;
+    let test_name = format!(
+        "direct_{}_{:?}_{:?}_{}_{}",
+        name_suffix,
+        src_layout,
+        dst_layout,
+        mode.suffix(),
+        std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap()
+            .as_millis()
+    );
+    let agent = create_test_agent(&test_name);
+
+    // Create layouts
+    let src = create_layout(
+        agent.clone(),
+        LayoutSpec::new(src_layout, src_storage),
+        num_blocks,
+    );
+    let dst = create_layout(
+        agent.clone(),
+        LayoutSpec::new(dst_layout, dst_storage),
+        num_blocks,
+    );
+
+    // Transfer src[0,1] -> dst[2,3]
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill source and compute checksums
+    let src_checksums =
+        fill_and_checksum_with_mode(&src, &src_blocks, FillPattern::Sequential, mode)?;
+
+    let ctx = create_transfer_context(agent, None)?;
+
+    // Execute transfer
+    let mut options_builder = TransferOptionsInternal::builder();
+    if let Some(range) = mode.layer_range() {
+        options_builder = options_builder.layer_range(range);
+    }
+    let notification = execute_transfer(
+        &src,
+        &dst,
+        &src_blocks,
+        &dst_blocks,
+        options_builder.build()?,
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Verify data integrity
+    verify_checksums_by_position_with_mode(&src_checksums, &src_blocks, &dst, &dst_blocks, mode)?;
+
+    Ok(())
+}
+
+// ============================================================================
+// Layout Compatibility Helper Tests
+// ============================================================================
+
+#[test]
+fn test_validate_layout_compatibility_same_layout() {
+    let agent = create_test_agent("test_compat_same");
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_fc_layout(agent.clone(), StorageKind::System, 4);
+
+    // Both FC layouts with Unknown KvBlockLayout - should be compatible
+    assert!(validate_layout_compatibility(&src, &dst).is_ok());
+}
+
+#[test]
+fn test_validate_layout_compatibility_fc_lw_same_block_layout() {
+    let agent = create_test_agent("test_compat_fc_lw");
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_lw_layout(agent.clone(), StorageKind::System, 4);
+
+    // Both have Unknown/Unknown-derived KvBlockLayout - should be compatible
+    // (Unknown→Unknown returns false for requires_transform)
+    assert!(validate_layout_compatibility(&src, &dst).is_ok());
+}
+
+#[test]
+fn test_can_use_whole_block_fc_fc_full_block() {
+    let agent = create_test_agent("test_whole_block_fc_fc");
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_fc_layout(agent.clone(), StorageKind::System, 4);
+
+    // Both FC + full block transfer = should use whole-block
+    assert!(can_use_whole_block_transfer(&src, &dst, None));
+}
+
+#[test]
+fn test_can_use_whole_block_fc_fc_full_range() {
+    let agent = create_test_agent("test_whole_block_fc_fc_range");
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_fc_layout(agent.clone(), StorageKind::System, 4);
+
+    // Both FC + full range (0..num_layers) = should use whole-block
+    let full_range = 0..src.layout().num_layers();
+    assert!(can_use_whole_block_transfer(&src, &dst, Some(&full_range)));
+}
+
+#[test]
+fn test_can_use_whole_block_fc_fc_partial_layer() {
+    let agent = create_test_agent("test_whole_block_partial");
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_fc_layout(agent.clone(), StorageKind::System, 4);
+
+    // Partial layer transfer = should NOT use whole-block
+    let partial_range = 0..1;
+    assert!(!can_use_whole_block_transfer(
+        &src,
+        &dst,
+        Some(&partial_range)
+    ));
+}
+
+#[test]
+fn test_can_use_whole_block_fc_lw() {
+    let agent = create_test_agent("test_whole_block_fc_lw");
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_lw_layout(agent.clone(), StorageKind::System, 4);
+
+    // FC + LW = should NOT use whole-block (dst is not fully contiguous)
+    assert!(!can_use_whole_block_transfer(&src, &dst, None));
+}
+
+#[test]
+fn test_can_use_whole_block_lw_fc() {
+    let agent = create_test_agent("test_whole_block_lw_fc");
+    let src = create_lw_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_fc_layout(agent.clone(), StorageKind::System, 4);
+
+    // LW + FC = should NOT use whole-block (src is not fully contiguous)
+    assert!(!can_use_whole_block_transfer(&src, &dst, None));
+}
+
+#[test]
+fn test_can_use_whole_block_lw_lw() {
+    let agent = create_test_agent("test_whole_block_lw_lw");
+    let src = create_lw_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_lw_layout(agent.clone(), StorageKind::System, 4);
+
+    // LW + LW = should NOT use whole-block (neither is fully contiguous)
+    assert!(!can_use_whole_block_transfer(&src, &dst, None));
+}
+
+// ============================================================================
+// Whole-Block Transfer Integration Tests
+// ============================================================================
+
+/// Test that FC→FC transfers with full blocks use the whole-block path.
+///
+/// This test verifies data integrity for FC→FC transfers that should use
+/// the optimized whole-block memcpy path.
+#[rstest]
+#[case(StorageKind::System, StorageKind::System)]
+#[case(StorageKind::System, StorageKind::Pinned)]
+#[case(StorageKind::Pinned, StorageKind::Pinned)]
+#[tokio::test]
+async fn test_whole_block_transfer_fc_fc(
+    #[case] src_storage: StorageKind,
+    #[case] dst_storage: StorageKind,
+) -> Result<()> {
+    let agent = create_test_agent("test_whole_block_fc_fc_transfer");
+
+    let src = create_fc_layout(agent.clone(), src_storage, 4);
+    let dst = create_fc_layout(agent.clone(), dst_storage, 4);
+
+    // Verify this should use whole-block path
+    assert!(can_use_whole_block_transfer(&src, &dst, None));
+
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?;
+    let ctx = create_transfer_context(agent, None)?;
+
+    let notification = execute_transfer(
+        &src,
+        &dst,
+        &src_blocks,
+        &dst_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?;
+
+    Ok(())
+}
+
+/// Test that FC→LW transfers fall back to layer-wise path.
+///
+/// This test verifies data integrity for FC→LW transfers that should use
+/// the layer-wise path (not whole-block).
+#[rstest]
+#[case(StorageKind::System, StorageKind::System)]
+#[case(StorageKind::Pinned, StorageKind::Pinned)]
+#[tokio::test]
+async fn test_layer_wise_transfer_fc_lw(
+    #[case] src_storage: StorageKind,
+    #[case] dst_storage: StorageKind,
+) -> Result<()> {
+    let agent = create_test_agent("test_layer_wise_fc_lw_transfer");
+
+    let src = create_fc_layout(agent.clone(), src_storage, 4);
+    let dst = create_lw_layout(agent.clone(), dst_storage, 4);
+
+    // Verify this should NOT use whole-block path
+    assert!(!can_use_whole_block_transfer(&src, &dst, None));
+
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    let checksums = fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?;
+    let ctx = create_transfer_context(agent, None)?;
+
+    let notification = execute_transfer(
+        &src,
+        &dst,
+        &src_blocks,
+        &dst_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    verify_checksums_by_position(&checksums, &src_blocks, &dst, &dst_blocks)?;
+
+    Ok(())
+}
+
+/// Test partial layer transfer uses layer-wise path even for FC→FC.
+#[tokio::test]
+async fn test_partial_layer_transfer_uses_layer_wise() -> Result<()> {
+    let agent = create_test_agent("test_partial_layer");
+
+    let src = create_fc_layout(agent.clone(), StorageKind::System, 4);
+    let dst = create_fc_layout(agent.clone(), StorageKind::Pinned, 4);
+
+    // Verify partial transfer should NOT use whole-block path
+    let partial_range = 0..1;
+    assert!(!can_use_whole_block_transfer(
+        &src,
+        &dst,
+        Some(&partial_range)
+    ));
+
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill source with sequential pattern for layer 0 only
+    let checksums = fill_and_checksum_with_mode(
+        &src,
+        &src_blocks,
+        FillPattern::Sequential,
+        TransferMode::FirstLayerOnly,
+    )?;
+    let ctx = create_transfer_context(agent, None)?;
+
+    let options = TransferOptionsInternal::builder()
+        .layer_range(partial_range)
+        .build()?;
+
+    let notification =
+        execute_transfer(&src, &dst, &src_blocks, &dst_blocks, options, ctx.context())?;
+    notification.await?;
+
+    verify_checksums_by_position_with_mode(
+        &checksums,
+        &src_blocks,
+        &dst,
+        &dst_blocks,
+        TransferMode::FirstLayerOnly,
+    )?;
+
+    Ok(())
+}
+
+// ============================================================================
+// Transfer Coverage Gap Tests
+// ============================================================================
+
+/// Test that transferring layer 0 and layer 1 independently produces the same
+/// result as a full-block transfer.
+///
+/// `test_partial_layer_transfer_uses_layer_wise` only transfers layer 0. This test
+/// verifies that layer 0 + layer 1 transferred independently compose to the same
+/// result as transferring all layers at once.
+#[rstest]
+#[case(LayoutKind::FC, LayoutKind::FC)]
+#[case(LayoutKind::FC, LayoutKind::LW)]
+#[case(LayoutKind::LW, LayoutKind::FC)]
+#[case(LayoutKind::LW, LayoutKind::LW)]
+#[tokio::test]
+async fn test_layer_composition_equals_full_block(
+    #[case] src_kind: LayoutKind,
+    #[case] dst_kind: LayoutKind,
+) -> Result<()> {
+    let agent = create_test_agent("test_layer_composition");
+
+    let src = create_layout(
+        agent.clone(),
+        LayoutSpec::new(src_kind, StorageKind::Pinned),
+        4,
+    );
+    let dst_full = create_layout(
+        agent.clone(),
+        LayoutSpec::new(dst_kind, StorageKind::Pinned),
+        4,
+    );
+    let dst_layered = create_layout(
+        agent.clone(),
+        LayoutSpec::new(dst_kind, StorageKind::Pinned),
+        4,
+    );
+
+    let src_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill source blocks with sequential pattern (all layers)
+    fill_and_checksum(&src, &src_blocks, FillPattern::Sequential)?;
+
+    let ctx = create_transfer_context(agent, None)?;
+
+    // Full-block transfer: src[0,1] → dst_full[2,3]
+    let notification = execute_transfer(
+        &src,
+        &dst_full,
+        &src_blocks,
+        &dst_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Layer-wise transfers: src[0,1] → dst_layered[2,3] layer by layer
+    let options_layer0 = TransferOptionsInternal::builder()
+        .layer_range(0..1)
+        .build()?;
+    let notification = execute_transfer(
+        &src,
+        &dst_layered,
+        &src_blocks,
+        &dst_blocks,
+        options_layer0,
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    let options_layer1 = TransferOptionsInternal::builder()
+        .layer_range(1..2)
+        .build()?;
+    let notification = execute_transfer(
+        &src,
+        &dst_layered,
+        &src_blocks,
+        &dst_blocks,
+        options_layer1,
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Compute full-block checksums on both destinations
+    let checksums_full = compute_block_checksums(&dst_full, &dst_blocks)?;
+    let checksums_layered = compute_block_checksums(&dst_layered, &dst_blocks)?;
+
+    // Layer-wise composition must equal full-block transfer
+    for &block_id in &dst_blocks {
+        assert_eq!(
+            checksums_full[&block_id], checksums_layered[&block_id],
+            "Block {}: full-block checksum ({}) != layer-composed checksum ({})",
+            block_id, checksums_full[&block_id], checksums_layered[&block_id],
+        );
+    }
+
+    Ok(())
+}
+
+/// Test that FC↔LW CUDA transfers through Pinned↔Device correctly use the
+/// vectorized path and preserve data integrity through a roundtrip.
+///
+/// Existing tests cover FC↔LW via memcpy (System/Pinned) and via `test_p2p`.
+/// This test explicitly asserts that CUDA FC↔LW transfers go through the
+/// vectorized path (not whole-block) and verifies roundtrip integrity.
+#[rstest]
+#[case(LayoutKind::FC, LayoutKind::LW)]
+#[case(LayoutKind::LW, LayoutKind::FC)]
+#[tokio::test]
+async fn test_cuda_fc_lw_roundtrip_uses_vectorized(
+    #[case] host_kind: LayoutKind,
+    #[case] device_kind: LayoutKind,
+) -> Result<()> {
+    let agent = build_agent_for_kinds(&[StorageKind::Pinned, StorageKind::Device(0)])?;
+
+    let host = create_layout(
+        agent.clone(),
+        LayoutSpec::new(host_kind, StorageKind::Pinned),
+        4,
+    );
+    let device = create_layout(
+        agent.clone(),
+        LayoutSpec::new(device_kind, StorageKind::Device(0)),
+        4,
+    );
+
+    // Confirm vectorized path will be used (not whole-block)
+    assert!(
+        !can_use_whole_block_transfer(&host, &device, None),
+        "FC↔LW across Pinned↔Device should use vectorized path, not whole-block"
+    );
+
+    let src_blocks = vec![0, 1];
+    let device_blocks = vec![0, 1];
+    let dst_blocks = vec![2, 3];
+
+    // Fill host source blocks
+    let checksums = fill_and_checksum(&host, &src_blocks, FillPattern::Sequential)?;
+
+    let ctx = create_transfer_context(agent, None)?;
+
+    // H2D: host[0,1] → device[0,1]
+    let notification = execute_transfer(
+        &host,
+        &device,
+        &src_blocks,
+        &device_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // D2H: device[0,1] → host[2,3]
+    let notification = execute_transfer(
+        &device,
+        &host,
+        &device_blocks,
+        &dst_blocks,
+        TransferOptionsInternal::default(),
+        ctx.context(),
+    )?;
+    notification.await?;
+
+    // Verify roundtrip: host[0,1] == host[2,3]
+    verify_checksums_by_position(&checksums, &src_blocks, &host, &dst_blocks)?;
+
+    Ok(())
+}
--- a/lib/kvbm-physical/src/transfer/tests/mod.rs
+++ b/lib/kvbm-physical/src/transfer/tests/mod.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Comprehensive transfer tests for verifying data integrity across storage types and layout configurations.
+
+mod local_transfers;
+
+/// Skip test if stub kernels are in use (no real CUDA available).
+///
+/// Call this at the start of any test that requires real CUDA operations.
+/// When stubs are in use, the test will print a message and return early.
+///
+/// # Example
+/// ```ignore
+/// #[test]
+/// fn my_cuda_test() {
+///     skip_if_stubs!();
+///     // ... test code that requires CUDA ...
+/// }
+/// ```
+#[allow(unused_macros)]
+macro_rules! skip_if_stubs {
+    () => {
+        if kvbm_kernels::is_using_stubs() {
+            eprintln!(
+                "Skipping test '{}': stub kernels in use (no real CUDA)",
+                module_path!()
+            );
+            return;
+        }
+    };
+}
+
+/// Check if any of the storage kinds require CUDA, and skip if stubs are in use.
+///
+/// Call this at the start of parameterized tests that may or may not use Device storage.
+#[allow(unused_macros)]
+macro_rules! skip_if_stubs_and_device {
+    ($($kind:expr),+ $(,)?) => {
+        if kvbm_kernels::is_using_stubs() {
+            let needs_cuda = false $(|| matches!($kind, StorageKind::Device(_)))+;
+            if needs_cuda {
+                eprintln!(
+                    "Skipping test '{}': stub kernels in use and test requires Device storage",
+                    module_path!()
+                );
+                return Ok(());
+            }
+        }
+    };
+}
+
+// Make the macros available to submodules
+#[allow(unused_imports)]
+pub(crate) use skip_if_stubs;
+#[allow(unused_imports)]
+pub(crate) use skip_if_stubs_and_device;
+
+use super::{
+    BlockChecksum, FillPattern, NixlAgent, PhysicalLayout, StorageKind, TransferCapabilities,
+    compute_block_checksums, compute_layer_checksums, fill_blocks, fill_layers,
+};
+use crate::{
+    BlockId,
+    layout::{
+        BlockDimension, LayoutConfig,
+        builder::{HasConfig, NoLayout, NoMemory, PhysicalLayoutBuilder},
+    },
+};
+use anyhow::Result;
+use cudarc::driver::sys::CUdevice_attribute_enum;
+use cudarc::driver::{CudaContext, CudaStream, LaunchConfig, PushKernelArg};
+use cudarc::nvrtc::{CompileOptions, compile_ptx_with_opts};
+use std::collections::HashMap;
+use std::ops::Range;
+use std::sync::{Arc, OnceLock};
+use std::time::{Duration, Instant};
+
+/// Layout kind for parameterized testing.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LayoutKind {
+    /// Fully contiguous layout
+    FC,
+    /// Layer-wise (layer-separate) layout
+    LW,
+}
+
+/// Storage and layout specification for creating test layouts.
+#[derive(Debug, Clone, Copy)]
+pub struct LayoutSpec {
+    pub kind: LayoutKind,
+    pub storage: StorageKind,
+}
+
+impl LayoutSpec {
+    pub fn new(kind: LayoutKind, storage: StorageKind) -> Self {
+        Self { kind, storage }
+    }
+}
+
+/// Transfer mode for parameterized testing.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TransferMode {
+    /// Transfer entire blocks (all layers)
+    FullBlocks,
+    /// Transfer only the first layer
+    FirstLayerOnly,
+    /// Transfer only the second layer
+    SecondLayerOnly,
+}
+
+impl TransferMode {
+    /// Convert to optional layer range for execute_transfer.
+    pub fn layer_range(&self) -> Option<Range<usize>> {
+        match self {
+            TransferMode::FullBlocks => None,
+            TransferMode::FirstLayerOnly => Some(0..1),
+            TransferMode::SecondLayerOnly => Some(1..2),
+        }
+    }
+
+    /// Get a descriptive suffix for test names.
+    pub fn suffix(&self) -> &'static str {
+        match self {
+            TransferMode::FullBlocks => "full",
+            TransferMode::FirstLayerOnly => "layer0",
+            TransferMode::SecondLayerOnly => "layer1",
+        }
+    }
+}
+
+/// Standard layout configuration for all tests.
+pub fn standard_config(num_blocks: usize) -> LayoutConfig {
+    LayoutConfig::builder()
+        .num_blocks(num_blocks)
+        .num_layers(2)
+        .outer_dim(2)
+        .page_size(16)
+        .inner_dim(128)
+        .dtype_width_bytes(2)
+        .build()
+        .unwrap()
+}
+
+/// Helper function for creating a PhysicalLayout builder with standard config.
+///
+/// This is used by other test modules (fill, checksum, validation) for backwards compatibility.
+pub fn builder(num_blocks: usize) -> PhysicalLayoutBuilder<HasConfig, NoLayout, NoMemory> {
+    let agent = create_test_agent("test_agent");
+    let config = standard_config(num_blocks);
+    PhysicalLayout::builder(agent).with_config(config)
+}
+
+/// Create a test agent with no backends.
+///
+/// Use this for tests that don't require specific NIXL backends.
+pub fn create_test_agent(name: &str) -> NixlAgent {
+    NixlAgent::new(name).expect("Failed to create agent")
+}
+
+/// Create a test agent with specific backends (strict - all must succeed).
+#[expect(dead_code)]
+pub fn create_test_agent_with_backends(name: &str, backends: &[&str]) -> Result<NixlAgent> {
+    NixlAgent::with_backends(name, backends)
+}
+
+/// Create a fully contiguous physical layout with the specified storage type.
+pub fn create_fc_layout(
+    agent: NixlAgent,
+    storage_kind: StorageKind,
+    num_blocks: usize,
+) -> PhysicalLayout {
+    let config = standard_config(num_blocks);
+    let builder = PhysicalLayout::builder(agent)
+        .with_config(config)
+        .fully_contiguous();
+
+    match storage_kind {
+        StorageKind::System => builder.allocate_system().build().unwrap(),
+        StorageKind::Pinned => builder.allocate_pinned(None).build().unwrap(),
+        StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(),
+        StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(),
+    }
+}
+
+/// Create a layer-separate physical layout with the specified storage type.
+pub fn create_lw_layout(
+    agent: NixlAgent,
+    storage_kind: StorageKind,
+    num_blocks: usize,
+) -> PhysicalLayout {
+    let config = standard_config(num_blocks);
+    let builder = PhysicalLayout::builder(agent)
+        .with_config(config)
+        .layer_separate(BlockDimension::BlockIsFirstDim);
+
+    match storage_kind {
+        StorageKind::System => builder.allocate_system().build().unwrap(),
+        StorageKind::Pinned => builder.allocate_pinned(None).build().unwrap(),
+        StorageKind::Device(device_id) => builder.allocate_device(device_id).build().unwrap(),
+        StorageKind::Disk(_) => builder.allocate_disk(None).build().unwrap(),
+    }
+}
+
+/// Create a physical layout based on the specification.
+///
+/// This is a DRY helper that dispatches to create_fc_layout or create_lw_layout
+/// based on the layout kind in the spec.
+pub fn create_layout(agent: NixlAgent, spec: LayoutSpec, num_blocks: usize) -> PhysicalLayout {
+    match spec.kind {
+        LayoutKind::FC => create_fc_layout(agent, spec.storage, num_blocks),
+        LayoutKind::LW => create_lw_layout(agent, spec.storage, num_blocks),
+    }
+}
+
+/// Create a transport manager for testing with the specified agent.
+///
+/// Note: The agent should already have backends configured. Use `create_test_agent`
+/// or `build_agent_with_backends` to create properly configured agents.
+pub fn create_transfer_context(
+    agent: NixlAgent,
+    capabilities: Option<TransferCapabilities>,
+) -> Result<crate::manager::TransferManager> {
+    crate::manager::TransferManager::builder()
+        .capabilities(capabilities.unwrap_or_default())
+        .nixl_agent(agent)
+        .cuda_device_id(0)
+        .build()
+}
+
+/// Fill blocks and compute checksums.
+///
+/// This can only be called on System or Pinned layouts.
+pub fn fill_and_checksum(
+    layout: &PhysicalLayout,
+    block_ids: &[BlockId],
+    pattern: FillPattern,
+) -> Result<HashMap<BlockId, BlockChecksum>> {
+    fill_blocks(layout, block_ids, pattern)?;
+    compute_block_checksums(layout, block_ids)
+}
+
+/// Fill blocks or layers based on transfer mode and compute checksums.
+///
+/// This is a mode-aware version of fill_and_checksum that handles both
+/// full block transfers and layer-wise transfers.
+pub fn fill_and_checksum_with_mode(
+    layout: &PhysicalLayout,
+    block_ids: &[BlockId],
+    pattern: FillPattern,
+    mode: TransferMode,
+) -> Result<HashMap<BlockId, BlockChecksum>> {
+    match mode {
+        TransferMode::FullBlocks => {
+            fill_blocks(layout, block_ids, pattern)?;
+            compute_block_checksums(layout, block_ids)
+        }
+        TransferMode::FirstLayerOnly => {
+            fill_layers(layout, block_ids, 0..1, pattern)?;
+            compute_layer_checksums(layout, block_ids, 0..1)
+        }
+        TransferMode::SecondLayerOnly => {
+            fill_layers(layout, block_ids, 1..2, pattern)?;
+            compute_layer_checksums(layout, block_ids, 1..2)
+        }
+    }
+}
+
+/// Verify that destination block checksums match the expected source checksums.
+///
+/// This function compares checksums in order, assuming the source and destination
+/// block arrays have a 1:1 correspondence (src[i] was transferred to dst[i]).
+pub fn verify_checksums_by_position(
+    src_checksums: &HashMap<BlockId, BlockChecksum>,
+    src_block_ids: &[BlockId],
+    dst_layout: &PhysicalLayout,
+    dst_block_ids: &[BlockId],
+) -> Result<()> {
+    assert_eq!(
+        src_block_ids.len(),
+        dst_block_ids.len(),
+        "Source and destination block arrays must have same length"
+    );
+
+    let dst_checksums = compute_block_checksums(dst_layout, dst_block_ids)?;
+
+    for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
+        let src_checksum = src_checksums
+            .get(src_id)
+            .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id));
+        let dst_checksum = dst_checksums
+            .get(dst_id)
+            .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id));
+
+        assert_eq!(
+            src_checksum, dst_checksum,
+            "Checksum mismatch: src[{}] != dst[{}]: {} != {}",
+            src_id, dst_id, src_checksum, dst_checksum
+        );
+    }
+
+    Ok(())
+}
+
+/// Verify checksums with transfer mode awareness.
+///
+/// This is a mode-aware version that handles both full block and layer-wise verification.
+pub fn verify_checksums_by_position_with_mode(
+    src_checksums: &HashMap<BlockId, BlockChecksum>,
+    src_block_ids: &[BlockId],
+    dst_layout: &PhysicalLayout,
+    dst_block_ids: &[BlockId],
+    mode: TransferMode,
+) -> Result<()> {
+    assert_eq!(
+        src_block_ids.len(),
+        dst_block_ids.len(),
+        "Source and destination block arrays must have same length"
+    );
+
+    let dst_checksums = match mode {
+        TransferMode::FullBlocks => compute_block_checksums(dst_layout, dst_block_ids)?,
+        TransferMode::FirstLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 0..1)?,
+        TransferMode::SecondLayerOnly => compute_layer_checksums(dst_layout, dst_block_ids, 1..2)?,
+    };
+
+    for (src_id, dst_id) in src_block_ids.iter().zip(dst_block_ids.iter()) {
+        let src_checksum = src_checksums
+            .get(src_id)
+            .unwrap_or_else(|| panic!("Missing source checksum for block {}", src_id));
+        let dst_checksum = dst_checksums
+            .get(dst_id)
+            .unwrap_or_else(|| panic!("Missing destination checksum for block {}", dst_id));
+
+        assert_eq!(
+            src_checksum, dst_checksum,
+            "Checksum mismatch (mode={:?}): src[{}] != dst[{}]: {} != {}",
+            mode, src_id, dst_id, src_checksum, dst_checksum
+        );
+    }
+
+    Ok(())
+}
+
+/// Fill guard blocks and return their checksums for later verification.
+///
+/// Guard blocks are blocks adjacent to transfer destinations that should
+/// remain unchanged during transfers. This function fills them with a
+/// distinctive pattern and returns their checksums for later validation.
+///
+/// # Arguments
+/// * `layout` - The physical layout containing the guard blocks
+/// * `guard_block_ids` - Block IDs to use as guards
+/// * `pattern` - Fill pattern for guard blocks (typically a constant like 0xFF)
+///
+/// # Returns
+/// A map of block ID to checksum for all guard blocks
+pub fn create_guard_blocks(
+    layout: &PhysicalLayout,
+    guard_block_ids: &[usize],
+    pattern: FillPattern,
+) -> Result<HashMap<usize, BlockChecksum>> {
+    fill_blocks(layout, guard_block_ids, pattern)?;
+    compute_block_checksums(layout, guard_block_ids)
+}
+
+/// Verify that guard blocks remain unchanged after transfers.
+///
+/// This function compares the current checksums of guard blocks against
+/// their expected values. Any mismatch indicates memory corruption or
+/// unintended overwrites during transfer operations.
+///
+/// # Arguments
+/// * `layout` - The physical layout containing the guard blocks
+/// * `guard_block_ids` - Block IDs to verify
+/// * `expected_checksums` - Expected checksums from create_guard_blocks
+///
+/// # Errors
+/// Returns an error if any guard block checksum has changed
+pub fn verify_guard_blocks_unchanged(
+    layout: &PhysicalLayout,
+    guard_block_ids: &[usize],
+    expected_checksums: &HashMap<usize, BlockChecksum>,
+) -> Result<()> {
+    let current_checksums = compute_block_checksums(layout, guard_block_ids)?;
+
+    for &block_id in guard_block_ids {
+        let expected = expected_checksums
+            .get(&block_id)
+            .unwrap_or_else(|| panic!("Missing expected checksum for guard block {}", block_id));
+        let current = current_checksums
+            .get(&block_id)
+            .unwrap_or_else(|| panic!("Missing current checksum for guard block {}", block_id));
+
+        if expected != current {
+            return Err(anyhow::anyhow!(
+                "Guard block {} was modified during transfer! Expected: {}, Got: {}",
+                block_id,
+                expected,
+                current
+            ));
+        }
+    }
+
+    Ok(())
+}
+
+/// CUDA sleep kernel source code.
+const SLEEP_KERNEL_SRC: &str = r#"
+extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
+    const unsigned long long start = clock64();
+    while ((clock64() - start) < min_cycles) {
+        asm volatile("");
+    }
+}
+"#;
+
+/// A reusable CUDA sleep utility for tests.
+///
+/// This struct provides a simple interface to execute GPU sleep operations
+/// with calibrated timing. It compiles the sleep kernel once per CUDA context
+/// and caches the calibration for reuse.
+///
+/// The calibration is conservative (prefers longer sleep durations over shorter)
+/// to ensure minimum sleep times are met.
+pub struct CudaSleep {
+    function: cudarc::driver::CudaFunction,
+    cycles_per_ms: f64,
+}
+
+impl CudaSleep {
+    /// Get or create a CudaSleep instance for the given CUDA context.
+    ///
+    /// This function uses lazy initialization and caches instances per device ID.
+    /// The first call for each device will compile the kernel and run calibration.
+    ///
+    /// # Arguments
+    /// * `cuda_ctx` - The CUDA context to use
+    ///
+    /// # Returns
+    /// A shared reference to the CudaSleep instance for this context's device.
+    pub fn for_context(cuda_ctx: &Arc<CudaContext>) -> Result<Arc<Self>> {
+        static INSTANCES: OnceLock<parking_lot::Mutex<HashMap<usize, Arc<CudaSleep>>>> =
+            OnceLock::new();
+
+        let instances = INSTANCES.get_or_init(|| parking_lot::Mutex::new(HashMap::new()));
+        let device_ordinal = cuda_ctx.ordinal();
+
+        // Fast path: check if instance already exists
+        {
+            let instances_guard = instances.lock();
+            if let Some(instance) = instances_guard.get(&device_ordinal) {
+                return Ok(Arc::clone(instance));
+            }
+        }
+
+        // Slow path: create new instance with calibration
+        let instance = Arc::new(Self::new(cuda_ctx)?);
+
+        // Store in cache
+        let mut instances_guard = instances.lock();
+        instances_guard
+            .entry(device_ordinal)
+            .or_insert_with(|| Arc::clone(&instance));
+
+        Ok(instance)
+    }
+
+    /// Create a new CudaSleep instance with calibration.
+    ///
+    /// This compiles the sleep kernel and runs a calibration loop to determine
+    /// the relationship between clock cycles and wall-clock time.
+    fn new(cuda_ctx: &Arc<CudaContext>) -> Result<Self> {
+        // Get device compute capability
+        let major = cuda_ctx
+            .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
+        let minor = cuda_ctx
+            .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
+
+        // Compile PTX for this device
+        let mut compile_opts = CompileOptions {
+            name: Some("sleep_kernel.cu".into()),
+            ..Default::default()
+        };
+        compile_opts
+            .options
+            .push(format!("--gpu-architecture=compute_{}{}", major, minor));
+        let ptx = compile_ptx_with_opts(SLEEP_KERNEL_SRC, compile_opts)?;
+        let module = cuda_ctx.load_module(ptx)?;
+        let function = module.load_function("sleep_kernel")?;
+
+        // Get device clock rate
+        let clock_rate_khz =
+            cuda_ctx.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_CLOCK_RATE)? as u64;
+
+        // Create a temporary stream for calibration
+        let stream = cuda_ctx.new_stream()?;
+
+        // Warm up to absorb JIT overhead
+        let warm_cycles = clock_rate_khz.saturating_mul(10).max(1);
+        Self::launch_kernel(&function, &stream, warm_cycles)?;
+        stream.synchronize()?;
+
+        // Run calibration loop
+        let desired_delay = Duration::from_millis(600);
+        let mut target_cycles = clock_rate_khz.saturating_mul(50).max(1); // ~50ms starting point
+        let mut actual_duration = Duration::ZERO;
+
+        for _ in 0..8 {
+            let start = Instant::now();
+            Self::launch_kernel(&function, &stream, target_cycles)?;
+            stream.synchronize()?;
+            actual_duration = start.elapsed();
+
+            if actual_duration >= desired_delay {
+                break;
+            }
+
+            target_cycles = target_cycles.saturating_mul(2);
+        }
+
+        // Calculate cycles per millisecond with conservative 20% margin
+        // (prefer longer sleeps over shorter)
+        let cycles_per_ms = if actual_duration.as_millis() > 0 {
+            (target_cycles as f64 / actual_duration.as_millis() as f64) * 1.2
+        } else {
+            clock_rate_khz as f64 // Fallback to clock rate
+        };
+
+        Ok(Self {
+            function,
+            cycles_per_ms,
+        })
+    }
+
+    /// Launch the sleep kernel with the specified number of cycles.
+    fn launch_kernel(
+        function: &cudarc::driver::CudaFunction,
+        stream: &Arc<CudaStream>,
+        cycles: u64,
+    ) -> Result<()> {
+        let launch_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (1, 1, 1),
+            shared_mem_bytes: 0,
+        };
+
+        let mut launch = stream.launch_builder(function);
+        unsafe {
+            launch.arg(&cycles);
+            launch.launch(launch_cfg)?;
+        }
+
+        Ok(())
+    }
+
+    /// Launch a sleep operation on the given stream.
+    ///
+    /// This queues a GPU kernel that will sleep for approximately the specified
+    /// duration. The sleep is conservative and may take longer than requested.
+    ///
+    /// # Arguments
+    /// * `duration` - The minimum duration to sleep
+    /// * `stream` - The CUDA stream to launch the kernel on
+    ///
+    /// # Returns
+    /// Ok(()) if the kernel was successfully queued
+    pub fn launch(&self, duration: Duration, stream: &Arc<CudaStream>) -> Result<()> {
+        let target_cycles = (duration.as_millis() as f64 * self.cycles_per_ms) as u64;
+        Self::launch_kernel(&self.function, stream, target_cycles)
+    }
+}
--- a/lib/kvbm-physical/src/transfer/validation.rs
+++ b/lib/kvbm-physical/src/transfer/validation.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Block ID validation for transfers.
+//!
+//! This module provides validation functions to ensure block transfers are safe and correct.
+
+use crate::BlockId;
+
+use super::PhysicalLayout;
+use std::collections::HashSet;
+use thiserror::Error;
+
+/// Validation errors for block transfers.
+#[derive(Debug, Error, PartialEq)]
+pub enum BlockValidationError {
+    /// Destination block IDs contain duplicates.
+    #[error("Destination block IDs are not unique: duplicates = {duplicates:?}")]
+    DuplicateDestinationBlocks { duplicates: Vec<BlockId> },
+
+    /// Source and destination blocks overlap when using the same layout.
+    #[error("Source and destination blocks overlap (same layout): overlapping = {overlapping:?}")]
+    OverlappingBlocks { overlapping: Vec<BlockId> },
+
+    /// Lists have mismatched lengths.
+    #[error(
+        "Block ID lists have mismatched lengths: src={src_len}, dst={dst_len}, bounce={bounce_len:?}"
+    )]
+    LengthMismatch {
+        src_len: usize,
+        dst_len: usize,
+        bounce_len: Option<usize>,
+    },
+
+    /// Block ID is out of range for the layout.
+    #[error("Block ID {block_id} out of range for {layout_name} (max={max})")]
+    BlockOutOfRange {
+        block_id: BlockId,
+        layout_name: &'static str,
+        max: usize,
+    },
+
+    /// Bounce block IDs contain duplicates.
+    #[error("Bounce block IDs are not unique: duplicates = {duplicates:?}")]
+    DuplicateBounceBlocks { duplicates: Vec<BlockId> },
+}
+
+/// Validate that destination block IDs are unique (no duplicates).
+///
+/// # Arguments
+/// * `dst_block_ids` - Destination block IDs
+///
+/// # Returns
+/// Ok(()) if unique, Err with duplicate IDs otherwise
+pub fn validate_dst_unique(dst_block_ids: &[BlockId]) -> Result<(), BlockValidationError> {
+    let mut seen = HashSet::new();
+    let mut duplicates = Vec::new();
+
+    for &id in dst_block_ids {
+        if !seen.insert(id) && !duplicates.contains(&id) {
+            duplicates.push(id);
+        }
+    }
+
+    if duplicates.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::DuplicateDestinationBlocks { duplicates })
+    }
+}
+
+/// Validate that bounce block IDs are unique (no duplicates).
+pub fn validate_bounce_unique(bounce_block_ids: &[BlockId]) -> Result<(), BlockValidationError> {
+    let mut seen = HashSet::new();
+    let mut duplicates = Vec::new();
+
+    for &id in bounce_block_ids {
+        if !seen.insert(id) && !duplicates.contains(&id) {
+            duplicates.push(id);
+        }
+    }
+
+    if duplicates.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::DuplicateBounceBlocks { duplicates })
+    }
+}
+
+/// Check if two layouts are the same by comparing their Arc pointers.
+///
+/// This is a conservative check - if pointers differ, layouts might still be the same
+/// but we treat them as different to avoid false positives in disjoint validation.
+fn are_same_layout(layout1: &PhysicalLayout, layout2: &PhysicalLayout) -> bool {
+    // Compare Arc pointer addresses
+    std::ptr::eq(
+        std::sync::Arc::as_ptr(layout1.layout()),
+        std::sync::Arc::as_ptr(layout2.layout()),
+    )
+}
+
+/// Validate that src and dst block IDs are disjoint when using the same layout.
+///
+/// Only enforced in debug mode when src and dst point to the same layout.
+///
+/// # Arguments
+/// * `src_block_ids` - Source block IDs
+/// * `dst_block_ids` - Destination block IDs
+/// * `src_layout` - Source physical layout
+/// * `dst_layout` - Destination physical layout
+#[cfg(debug_assertions)]
+pub fn validate_disjoint_same_layout(
+    src_block_ids: &[BlockId],
+    dst_block_ids: &[BlockId],
+    src_layout: &PhysicalLayout,
+    dst_layout: &PhysicalLayout,
+) -> Result<(), BlockValidationError> {
+    // Only check if same layout
+    if !are_same_layout(src_layout, dst_layout) {
+        return Ok(());
+    }
+
+    let src_set: HashSet<_> = src_block_ids.iter().copied().collect();
+    let overlapping: Vec<_> = dst_block_ids
+        .iter()
+        .filter(|id| src_set.contains(id))
+        .copied()
+        .collect();
+
+    if overlapping.is_empty() {
+        Ok(())
+    } else {
+        Err(BlockValidationError::OverlappingBlocks { overlapping })
+    }
+}
+
+/// Validate block IDs are in range for a layout.
+#[cfg(debug_assertions)]
+pub fn validate_block_ids_in_range(
+    block_ids: &[BlockId],
+    layout: &PhysicalLayout,
+    layout_name: &'static str,
+) -> Result<(), BlockValidationError> {
+    let max_blocks = layout.layout().config().num_blocks;
+
+    for &block_id in block_ids {
+        if block_id >= max_blocks as BlockId {
+            return Err(BlockValidationError::BlockOutOfRange {
+                block_id,
+                layout_name,
+                max: max_blocks,
+            });
+        }
+    }
+
+    Ok(())
+}
+
+/// Full validation for block transfer (debug mode).
+///
+/// Validates:
+/// - List lengths match
+/// - Destination IDs are unique
+/// - Bounce IDs are unique (if provided)
+/// - Source and destination are disjoint (if same layout)
+/// - All block IDs are in range for their respective layouts
+#[cfg(debug_assertions)]
+pub fn validate_block_transfer(
+    src_block_ids: &[BlockId],
+    dst_block_ids: &[BlockId],
+    bounce_block_ids: Option<&[BlockId]>,
+    src_layout: &PhysicalLayout,
+    dst_layout: &PhysicalLayout,
+    bounce_layout: Option<&PhysicalLayout>,
+) -> Result<(), BlockValidationError> {
+    // Validate lengths
+    if src_block_ids.len() != dst_block_ids.len() {
+        return Err(BlockValidationError::LengthMismatch {
+            src_len: src_block_ids.len(),
+            dst_len: dst_block_ids.len(),
+            bounce_len: bounce_block_ids.map(|ids| ids.len()),
+        });
+    }
+
+    if let Some(bounce_ids) = bounce_block_ids
+        && bounce_ids.len() != src_block_ids.len()
+    {
+        return Err(BlockValidationError::LengthMismatch {
+            src_len: src_block_ids.len(),
+            dst_len: dst_block_ids.len(),
+            bounce_len: Some(bounce_ids.len()),
+        });
+    }
+
+    #[cfg(debug_assertions)]
+    {
+        // Validate destination uniqueness
+        validate_dst_unique(dst_block_ids)?;
+
+        // Validate bounce uniqueness if provided
+        if let Some(bounce_ids) = bounce_block_ids {
+            validate_bounce_unique(bounce_ids)?;
+        }
+
+        // Validate disjoint if same layout
+        validate_disjoint_same_layout(src_block_ids, dst_block_ids, src_layout, dst_layout)?;
+
+        // Validate block IDs in range
+        validate_block_ids_in_range(src_block_ids, src_layout, "source")?;
+        validate_block_ids_in_range(dst_block_ids, dst_layout, "destination")?;
+        if let (Some(bounce_ids), Some(bounce_layout)) = (bounce_block_ids, bounce_layout) {
+            validate_block_ids_in_range(bounce_ids, bounce_layout, "bounce")?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Minimal validation for block transfer (release mode).
+///
+/// Only validates:
+/// - List lengths match
+/// - Destination IDs are unique
+#[cfg(not(debug_assertions))]
+pub fn validate_block_transfer(
+    src_block_ids: &[BlockId],
+    dst_block_ids: &[BlockId],
+    bounce_block_ids: Option<&[BlockId]>,
+    _src_layout: &PhysicalLayout,
+    _dst_layout: &PhysicalLayout,
+    _bounce_layout: Option<&PhysicalLayout>,
+) -> Result<(), BlockValidationError> {
+    // Validate lengths
+    if src_block_ids.len() != dst_block_ids.len() {
+        return Err(BlockValidationError::LengthMismatch {
+            src_len: src_block_ids.len(),
+            dst_len: dst_block_ids.len(),
+            bounce_len: bounce_block_ids.map(|ids| ids.len()),
+        });
+    }
+
+    if let Some(bounce_ids) = bounce_block_ids {
+        if bounce_ids.len() != src_block_ids.len() {
+            return Err(BlockValidationError::LengthMismatch {
+                src_len: src_block_ids.len(),
+                dst_len: dst_block_ids.len(),
+                bounce_len: Some(bounce_ids.len()),
+            });
+        }
+    }
+
+    // Validate destination uniqueness
+    validate_dst_unique(dst_block_ids)?;
+
+    Ok(())
+}
+
+#[cfg(all(test, feature = "testing-kvbm"))]
+mod tests {
+    use super::super::tests::*;
+    use super::*;
+
+    #[test]
+    fn test_dst_unique_valid() {
+        let ids = vec![0, 1, 2, 3, 4];
+        assert!(validate_dst_unique(&ids).is_ok());
+    }
+
+    #[test]
+    fn test_dst_unique_duplicate() {
+        let ids = vec![0, 1, 2, 1, 3];
+        let result = validate_dst_unique(&ids);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            BlockValidationError::DuplicateDestinationBlocks { duplicates } => {
+                assert_eq!(duplicates, vec![1]);
+            }
+            _ => panic!("Wrong error type"),
+        }
+    }
+
+    #[test]
+    fn test_dst_unique_multiple_duplicates() {
+        let ids = vec![0, 1, 2, 1, 3, 2];
+        let result = validate_dst_unique(&ids);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            BlockValidationError::DuplicateDestinationBlocks { duplicates } => {
+                assert!(duplicates.contains(&1));
+                assert!(duplicates.contains(&2));
+            }
+            _ => panic!("Wrong error type"),
+        }
+    }
+
+    #[test]
+    #[cfg(debug_assertions)]
+    fn test_disjoint_same_layout_valid() {
+        let physical = builder(2)
+            .fully_contiguous()
+            .allocate_system()
+            .build()
+            .unwrap();
+
+        let src_ids = vec![0, 1, 2];
+        let dst_ids = vec![5, 6, 7];
+
+        assert!(validate_disjoint_same_layout(&src_ids, &dst_ids, &physical, &physical).is_ok());
+    }
+
+    #[test]
+    #[cfg(debug_assertions)]
+    fn test_disjoint_same_layout_overlap() {
+        let physical = builder(2)
+            .fully_contiguous()
+            .allocate_system()
+            .build()
+            .unwrap();
+
+        let src_ids = vec![0, 1, 2];
+        let dst_ids = vec![2, 3, 4]; // 2 overlaps
+
+        let result = validate_disjoint_same_layout(&src_ids, &dst_ids, &physical, &physical);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            BlockValidationError::OverlappingBlocks { overlapping } => {
+                assert_eq!(overlapping, vec![2]);
+            }
+            _ => panic!("Wrong error type"),
+        }
+    }
+
+    #[test]
+    fn test_disjoint_different_layouts_ok() {
+        let physical1 = builder(2)
+            .fully_contiguous()
+            .allocate_system()
+            .build()
+            .unwrap();
+
+        let physical2 = builder(2)
+            .fully_contiguous()
+            .allocate_system()
+            .build()
+            .unwrap();
+
+        let src_ids = vec![0, 1, 2];
+        let dst_ids = vec![0, 1, 2]; // Same IDs but different layouts
+
+        // Should be OK since different layouts
+        #[cfg(debug_assertions)]
+        assert!(validate_disjoint_same_layout(&src_ids, &dst_ids, &physical1, &physical2).is_ok());
+    }
+
+    #[test]
+    fn test_length_mismatch() {
+        let physical1 = builder(2)
+            .fully_contiguous()
+            .allocate_system()
+            .build()
+            .unwrap();
+
+        let physical2 = builder(2)
+            .fully_contiguous()
+            .allocate_system()
+            .build()
+            .unwrap();
+
+        let src_ids = vec![0, 1, 2];
+        let dst_ids = vec![5, 6]; // Different length
+
+        let result =
+            validate_block_transfer(&src_ids, &dst_ids, None, &physical1, &physical2, None);
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            BlockValidationError::LengthMismatch {
+                src_len,
+                dst_len,
+                bounce_len,
+            } => {
+                assert_eq!(src_len, 3);
+                assert_eq!(dst_len, 2);
+                assert_eq!(bounce_len, None);
+            }
+            _ => panic!("Wrong error type"),
+        }
+    }
+
+    // #[test]
+    // #[cfg(debug_assertions)]
+    // fn test_block_out_of_range() {
+    //     let (_layout, physical) = create_test_layout(5); // Only 5 blocks
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![3, 4, 10]; // 10 is out of range
+
+    //     let result = validate_block_ids_in_range(&dst_ids, &physical, "destination");
+    //     assert!(result.is_err());
+    //     match result.unwrap_err() {
+    //         BlockValidationError::BlockOutOfRange {
+    //             block_id,
+    //             layout_name,
+    //             max,
+    //         } => {
+    //             assert_eq!(block_id, 10);
+    //             assert_eq!(layout_name, "destination");
+    //             assert_eq!(max, 5);
+    //         }
+    //         _ => panic!("Wrong error type"),
+    //     }
+    // }
+
+    // #[test]
+    // fn test_bounce_length_mismatch() {
+    //     let (_layout1, physical1) = create_test_layout(10);
+    //     let (_layout2, physical2) = create_test_layout(10);
+    //     let (_layout3, physical3) = create_test_layout(10);
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![5, 6, 7];
+    //     let bounce_ids = vec![8, 9]; // Wrong length
+
+    //     let result = validate_block_transfer(
+    //         &src_ids,
+    //         &dst_ids,
+    //         Some(&bounce_ids),
+    //         &physical1,
+    //         &physical2,
+    //         Some(&physical3),
+    //     );
+    //     assert!(result.is_err());
+    //     match result.unwrap_err() {
+    //         BlockValidationError::LengthMismatch {
+    //             src_len,
+    //             dst_len,
+    //             bounce_len,
+    //         } => {
+    //             assert_eq!(src_len, 3);
+    //             assert_eq!(dst_len, 3);
+    //             assert_eq!(bounce_len, Some(2));
+    //         }
+    //         _ => panic!("Wrong error type"),
+    //     }
+    // }
+
+    // #[test]
+    // fn test_full_validation_success() {
+    //     let (_layout1, physical1) = create_test_layout(10);
+    //     let (_layout2, physical2) = create_test_layout(10);
+    //     let (_layout3, physical3) = create_test_layout(10);
+    //     let src_ids = vec![0, 1, 2];
+    //     let dst_ids = vec![5, 6, 7];
+    //     let bounce_ids = vec![8, 9, 3];
+
+    //     assert!(
+    //         validate_block_transfer(
+    //             &src_ids,
+    //             &dst_ids,
+    //             Some(&bounce_ids),
+    //             &physical1,
+    //             &physical2,
+    //             Some(&physical3),
+    //         )
+    //         .is_ok()
+    //     );
+    // }
+}
--- a/lib/memory/src/nixl.rs
+++ b/lib/memory/src/nixl.rs
@@ -15,7 +15,8 @@ pub use agent::NixlAgent;
 pub use config::NixlBackendConfig;

 pub use nixl_sys::{
-    Agent, MemType, NotificationMap, OptArgs, RegistrationHandle, XferDescList, XferOp, XferRequest,
+    Agent, MemType, NotificationMap, OptArgs, RegistrationHandle, XferDescList, XferOp,
+    XferRequest, is_stub,
 };
 pub use serde::{Deserialize, Serialize};


--- a/lib/memory/src/nixl/agent.rs
+++ b/lib/memory/src/nixl/agent.rs
@@ -8,7 +8,7 @@
 //! - `NixlBackendConfig`: Configuration for NIXL backends from environment variables

 use anyhow::Result;
-use nixl_sys::Agent;
+use nixl_sys::{Agent, is_stub};
 use std::collections::{HashMap, HashSet};

 use crate::nixl::NixlBackendConfig;
@@ -34,6 +34,9 @@ pub struct NixlAgent {
 impl NixlAgent {
    /// Create a NIXL agent without any backends.
    pub fn new(name: &str) -> Result<Self> {
+        if is_stub() {
+            return Err(anyhow::anyhow!("NIXL is not supported in stub mode"));
+        }
        let agent = Agent::new(name)?;

        Ok(Self {

--- a/lib/runtime/examples/Cargo.lock
+++ b/lib/runtime/examples/Cargo.lock
@@ -412,9 +412,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"

 [[package]]
 name = "chrono"
-version = "0.4.43"
+version = "0.4.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
 dependencies = [
 "iana-time-zone",
 "num-traits",
@@ -682,9 +682,9 @@ dependencies = [

 [[package]]
 name = "deranged"
-version = "0.5.6"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc3dc5ad92c2e2d1c193bbbbdf2ea477cb81331de4f3103f267ca18368b988c4"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
 dependencies = [
 "powerfmt",
 "serde_core",
@@ -1687,9 +1687,9 @@ dependencies = [

 [[package]]
 name = "js-sys"
-version = "0.3.86"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d36139f1c97c42c0c86a411910b04e48d4939a0376e6e0f989420cbdee0120e5"
+checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c"
 dependencies = [
 "once_cell",
 "wasm-bindgen",
@@ -1911,20 +1911,21 @@ checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"

 [[package]]
 name = "libredox"
-version = "0.1.12"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616"
+checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a"
 dependencies = [
 "bitflags 2.11.0",
 "libc",
- "redox_syscall 0.7.1",
+ "plain",
+ "redox_syscall 0.7.3",
 ]

 [[package]]
 name = "linux-raw-sys"
-version = "0.11.0"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"

 [[package]]
 name = "litemap"
@@ -2411,18 +2412,18 @@ dependencies = [

 [[package]]
 name = "pin-project"
-version = "1.1.10"
+version = "1.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517"
 dependencies = [
 "pin-project-internal",
 ]

 [[package]]
 name = "pin-project-internal"
-version = "1.1.10"
+version = "1.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -2431,9 +2432,9 @@ dependencies = [

 [[package]]
 name = "pin-project-lite"
-version = "0.2.16"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"

 [[package]]
 name = "pin-utils"
@@ -2457,6 +2458,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"

+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
 [[package]]
 name = "portable-atomic"
 version = "1.13.1"
@@ -2631,9 +2638,9 @@ dependencies = [

 [[package]]
 name = "pulldown-cmark"
-version = "0.13.0"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0"
+checksum = "83c41efbf8f90ac44de7f3a868f0867851d261b56291732d0cbf7cceaaeb55a6"
 dependencies = [
 "bitflags 2.11.0",
 "memchr",
@@ -2809,9 +2816,9 @@ dependencies = [

 [[package]]
 name = "redox_syscall"
-version = "0.7.1"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35985aa610addc02e24fc232012c86fd11f14111180f902b67e2d5331f8ebf2b"
+checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
 dependencies = [
 "bitflags 2.11.0",
 ]
@@ -2861,9 +2868,9 @@ dependencies = [

 [[package]]
 name = "regex-syntax"
-version = "0.8.9"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"

 [[package]]
 name = "reqwest"
@@ -2957,9 +2964,9 @@ dependencies = [

 [[package]]
 name = "rustix"
-version = "1.1.3"
+version = "1.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
 dependencies = [
 "bitflags 2.11.0",
 "errno",
@@ -2970,9 +2977,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.23.36"
+version = "0.23.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b"
+checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
 dependencies = [
 "aws-lc-rs",
 "log",
@@ -3483,9 +3490,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"

 [[package]]
 name = "tempfile"
-version = "3.25.0"
+version = "3.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1"
+checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0"
 dependencies = [
 "fastrand",
 "getrandom 0.4.1",
@@ -4141,9 +4148,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen"
-version = "0.2.109"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ff9c7baef35ac3c0e17d8bfc9ad75eb62f85a2f02bccc906699dadb0aa9c622"
+checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e"
 dependencies = [
 "cfg-if 1.0.4",
 "once_cell",
@@ -4154,9 +4161,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.59"
+version = "0.4.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d24699cd39db9966cf6e2ef10d2f72779c961ad905911f395ea201c3ec9f545d"
+checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8"
 dependencies = [
 "cfg-if 1.0.4",
 "futures-util",
@@ -4168,9 +4175,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.109"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "39455e84ad887a0bbc93c116d72403f1bb0a39e37dd6f235a43e2128a0c7f1fd"
+checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6"
 dependencies = [
 "quote",
 "wasm-bindgen-macro-support",
@@ -4178,9 +4185,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.109"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dff4761f60b0b51fd13fec8764167b7bbcc34498ce3e52805fe1db6f2d56b6d6"
+checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3"
 dependencies = [
 "bumpalo",
 "proc-macro2",
@@ -4191,9 +4198,9 @@ dependencies = [

 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.109"
+version = "0.2.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc6a171c53d98021a93a474c4a4579d76ba97f9517d871bc12e27640f218b6dd"
+checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16"
 dependencies = [
 "unicode-ident",
 ]
@@ -4247,9 +4254,9 @@ dependencies = [

 [[package]]
 name = "web-sys"
-version = "0.3.86"
+version = "0.3.91"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "668fa5d00434e890a452ab060d24e3904d1be93f7bb01b70e5603baa2b8ab23b"
+checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9"
 dependencies = [
 "js-sys",
 "wasm-bindgen",
@@ -4757,18 +4764,18 @@ dependencies = [

 [[package]]
 name = "zerocopy"
-version = "0.8.39"
+version = "0.8.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a"
+checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5"
 dependencies = [
 "zerocopy-derive",
 ]

 [[package]]
 name = "zerocopy-derive"
-version = "0.8.39"
+version = "0.8.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517"
+checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953"
 dependencies = [
 "proc-macro2",
 "quote",