fix: GB200 mem allocation failure (#5328)

e3c3a6b9 · Qi Wang · GitHub · 52ce68e6 · e3c3a6b9
Unverified Commit e3c3a6b9 authored Jan 12, 2026 by Qi Wang Committed by GitHub Jan 12, 2026
Show whitespace changes
Inline Side-by-side

Showing with 104 additions and 7 deletions

lib/llm/src/block_manager/storage/cuda.rs lib/llm/src/block_manager/storage/cuda.rs +104 -7

No files found.
--- a/lib/llm/src/block_manager/storage/cuda.rs
+++ b/lib/llm/src/block_manager/storage/cuda.rs
@@ -78,6 +78,33 @@ use cudarc::driver::{CudaContext, sys};
 use crate::block_manager::numa_allocator;
+/// Allocates pinned host memory, preferring write-combined if supported.
+///
+/// Write-combined (WC) memory is optimal for PCIe DMA transfers but may not be
+/// supported on systems with cache-coherent CPU-GPU interconnects (e.g., Grace
+/// Hopper/Blackwell with NVLink-C2C). This function tries WC first and falls
+/// back to regular pinned memory if not supported.
+///
+/// # Safety
+///
+/// Caller must ensure a valid CUDA context is bound to the current thread.
+unsafe fn malloc_host_prefer_writecombined(size: usize) -> Result<*mut u8, StorageError> {
+    // First, try write-combined allocation (optimal for PCIe systems)
+    // SAFETY: Caller guarantees a valid CUDA context is bound to the current thread
+    match unsafe { cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED) } {
+        Ok(ptr) => Ok(ptr as *mut u8),
+        Err(_) => {
+            // Write-combined not supported (e.g., Grace Hopper/Blackwell),
+            // fall back to regular pinned memory
+            tracing::debug!("Write-combined memory not supported, using regular pinned memory");
+            // SAFETY: Same as above - caller guarantees valid CUDA context
+            unsafe { cudarc::driver::result::malloc_host(size, 0) }
+                .map(|ptr| ptr as *mut u8)
+                .map_err(StorageError::Cuda)
+        }
+    }
+}
 /// Trait for [Storage] types that can be accessed by CUDA
 pub trait CudaAccessible: Storage {}
@@ -187,16 +214,11 @@ impl PinnedStorage {
                    Ok(ptr) => ptr,
                    Err(e) => {
                        tracing::warn!("NUMA allocation failed: {}, using direct allocation", e);
-                        cudarc::driver::result::malloc_host(
+                        malloc_host_prefer_writecombined(size)?
-                            size,
-                            sys::CU_MEMHOSTALLOC_WRITECOMBINED,
-                        )
-                        .map_err(StorageError::Cuda)? as *mut u8
                    }
                }
            } else {
-                cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
+                malloc_host_prefer_writecombined(size)?
-                    .map_err(StorageError::Cuda)? as *mut u8
            };
            assert!(!ptr.is_null(), "Failed to allocate pinned memory");
@@ -574,4 +596,79 @@ mod tests {
        let result = DeviceStorage::new_from_torch(&ctx, Arc::new(tensor));
        assert!(result.is_err());
    }
+    #[test]
+    fn test_malloc_host_prefer_writecombined_allocates_memory() {
+        let ctx = Cuda::device_or_create(0).expect("Failed to create CUDA context");
+        let size = 4096; // One page
+        unsafe {
+            ctx.bind_to_thread().expect("Failed to bind CUDA context");
+            // Test allocation succeeds (either write-combined or fallback)
+            let ptr = malloc_host_prefer_writecombined(size)
+                .expect("malloc_host_prefer_writecombined should succeed");
+            // Verify pointer is valid and non-null
+            assert!(!ptr.is_null(), "Allocated pointer should not be null");
+            // Verify memory is accessible by writing and reading
+            std::ptr::write_volatile(ptr, 0xAB);
+            let val = std::ptr::read_volatile(ptr);
+            assert_eq!(val, 0xAB, "Should be able to write and read pinned memory");
+            // Clean up
+            cudarc::driver::result::free_host(ptr as _).expect("Failed to free pinned memory");
+        }
+    }
+    /// Test PinnedStorage::new with NUMA disabled (the direct allocation path)
+    ///
+    /// This test confirms that when `DYN_KVBM_ENABLE_NUMA` is not set,
+    /// PinnedStorage::new uses the direct malloc_host_prefer_writecombined path
+    /// (lines 222-224 in the source).
+    #[test]
+    // `remove_var` is not thread-safe, so we need to run this test in a serial context
+    // #[serial]
+    fn test_pinned_storage_new_without_numa() {
+        // Verify NUMA is actually disabled for this test
+        assert!(
+            !numa_allocator::is_numa_enabled(),
+            "NUMA should be disabled for this test"
+        );
+        let ctx = Cuda::device_or_create(0).expect("Failed to create CUDA context");
+        let size = 8192;
+        // Create PinnedStorage - this should take the non-NUMA path
+        let mut storage =
+            PinnedStorage::new(&ctx, size).expect("PinnedStorage::new should succeed");
+        // Verify storage properties
+        assert_eq!(storage.size(), size);
+        assert_eq!(storage.storage_type(), StorageType::Pinned);
+        assert_ne!(storage.addr(), 0, "Address should be non-zero");
+        // Verify memory is accessible
+        unsafe {
+            let ptr = storage.as_mut_ptr();
+            assert!(!ptr.is_null(), "Pointer should not be null");
+            // Write a pattern to verify memory is usable
+            for i in 0..size {
+                std::ptr::write_volatile(ptr.add(i), (i & 0xFF) as u8);
+            }
+            // Read back and verify
+            for i in 0..size {
+                let val = std::ptr::read_volatile(ptr.add(i));
+                assert_eq!(
+                    val,
+                    (i & 0xFF) as u8,
+                    "Memory content mismatch at offset {}",
+                    i
+                );
+            }
+        }
+    }
 }