// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //! Transfer capability flags for controlling direct path enablement. //! //! By default, the transfer system uses a conservative staging policy where: //! - Device can only transfer to/from Host //! - Disk can only transfer to/from Host //! - Host can transfer to Device, Disk, or Remote //! - Device ↔ Device is allowed (native CUDA) //! //! These capability flags enable optional direct paths that bypass host staging. use serde::{Deserialize, Serialize}; use std::sync::OnceLock; use crate::{ layout::LayoutConfig, transfer::{ PhysicalLayout, TransferManager, executor::{TransferOptionsInternal, execute_transfer}, }, }; use dynamo_memory::nixl::NixlAgent; /// Transfer capability flags controlling which direct paths are enabled. /// /// # Default Policy (Conservative) /// /// With all flags disabled (default), the system uses host staging: /// - **Device → Remote**: Device → Host → Remote (2 hops) /// - **Disk → Remote**: Disk → Host → Remote (2 hops) /// - **Device ↔ Disk**: Device → Host → Disk (2 hops) /// /// # Optional Direct Paths /// /// - `allow_gds`: Enables GPU Direct Storage (Disk ↔ Device without host) /// - `allow_gpu_rdma`: Enables GPU RDMA (Device → Remote without host) /// /// # Example /// /// ``` /// # use kvbm_physical::transfer::TransferCapabilities; /// // Default conservative policy /// let caps = TransferCapabilities::default(); /// assert!(!caps.allow_gds); /// assert!(!caps.allow_gpu_rdma); /// /// // Enable GDS for high-performance disk I/O /// let caps = TransferCapabilities::default().with_gds(true); /// ``` static GDS_SUPPORTED: OnceLock = OnceLock::new(); #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TransferCapabilities { /// Enable GPU Direct Storage (Disk ↔ Device without host staging). /// /// When enabled: /// - Disk → Device: Direct transfer (requires GDS support) /// - Device → Disk: Direct transfer (requires GDS support) /// /// When disabled (default): /// - Disk → Device: Disk → Host → Device (2 hops) /// - Device → Disk: Device → Host → Disk (2 hops) pub allow_gds: bool, /// Enable GPU RDMA (Device → Remote without host staging). /// /// When enabled: /// - Device → Remote: Direct NIXL transfer /// /// When disabled (default): /// - Device → Remote: Device → Host → Remote (2 hops) /// /// Note: This only affects Device → Remote. Host → Remote is always direct. pub allow_gpu_rdma: bool, } impl TransferCapabilities { /// Create capabilities with default conservative policy (all direct paths disabled). pub fn new() -> Self { Self::default() } /// Create capabilities with all direct paths enabled (high performance mode). pub fn all_enabled() -> Self { Self { allow_gds: true, allow_gpu_rdma: true, } } /// Set the GDS (GPU Direct Storage) capability. pub fn with_gds(mut self, enabled: bool) -> Self { self.allow_gds = enabled; self } fn test_gds_transfer(&self) -> anyhow::Result<()> { let agent = NixlAgent::with_backends("agent", &["GDS_MT"])?; // Try a little test transfer and see if it works. let config = LayoutConfig::builder() .num_blocks(1) .num_layers(1) .outer_dim(1) .page_size(1) .inner_dim(4096) .build()?; let src = PhysicalLayout::builder(agent.clone()) .with_config(config.clone()) .fully_contiguous() .allocate_device(0) .build()?; let dst = PhysicalLayout::builder(agent.clone()) .with_config(config) .fully_contiguous() .allocate_disk(None) .build()?; let src_blocks = vec![0]; let dst_blocks = vec![0]; let ctx = TransferManager::builder() .nixl_agent(agent) .cuda_device_id(0) .build()?; execute_transfer( &src, &dst, &src_blocks, &dst_blocks, TransferOptionsInternal::default(), ctx.context(), )?; Ok(()) } pub fn with_gds_if_supported(mut self) -> Self { self.allow_gds = *GDS_SUPPORTED.get_or_init(|| self.test_gds_transfer().is_ok()); self } /// Set the GPU RDMA capability. pub fn with_gpu_rdma(mut self, enabled: bool) -> Self { self.allow_gpu_rdma = enabled; self } /// Check if a direct path from Device to Disk is allowed. pub fn allows_device_disk_direct(&self) -> bool { self.allow_gds } /// Check if a direct path from Device to Remote is allowed. pub fn allows_device_remote_direct(&self) -> bool { self.allow_gpu_rdma } } #[cfg(all(test, feature = "testing-kvbm"))] mod tests { use super::*; #[test] fn test_default_capabilities() { let caps = TransferCapabilities::default(); assert!(!caps.allow_gds); assert!(!caps.allow_gpu_rdma); assert!(!caps.allows_device_disk_direct()); assert!(!caps.allows_device_remote_direct()); } #[test] fn test_all_enabled() { let caps = TransferCapabilities::all_enabled(); assert!(caps.allow_gds); assert!(caps.allow_gpu_rdma); assert!(caps.allows_device_disk_direct()); assert!(caps.allows_device_remote_direct()); } #[test] fn test_builder_pattern() { let caps = TransferCapabilities::new() .with_gds(true) .with_gpu_rdma(false); assert!(caps.allow_gds); assert!(!caps.allow_gpu_rdma); } #[test] fn test_selective_enablement() { // Enable only GDS let caps = TransferCapabilities::new().with_gds(true); assert!(caps.allows_device_disk_direct()); assert!(!caps.allows_device_remote_direct()); // Enable only GPU RDMA let caps = TransferCapabilities::new().with_gpu_rdma(true); assert!(!caps.allows_device_disk_direct()); assert!(caps.allows_device_remote_direct()); } }