// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //! CUDA memory pool for efficient device memory allocation in hot paths. //! //! This module provides a safe wrapper around CUDA's memory pool APIs, enabling //! fast async allocations that avoid the overhead of cudaMalloc/cudaFree per call. //! Memory is returned to the pool on free and reused for subsequent allocations. use anyhow::{Result, anyhow}; use cudarc::driver::sys::{ self, CUmemAllocationType, CUmemLocationType, CUmemPool_attribute, CUmemPoolProps, CUmemoryPool, CUresult, CUstream, }; use cudarc::driver::{CudaContext, CudaStream}; use std::ptr; use std::sync::{Arc, Mutex}; /// Builder for creating a CUDA memory pool with configurable parameters. /// /// # Example /// ```ignore /// let pool = CudaMemPoolBuilder::new(context, 64 * 1024 * 1024) // 64 MiB reserve /// .release_threshold(32 * 1024 * 1024) // 32 MiB release threshold /// .build()?; /// ``` pub struct CudaMemPoolBuilder { /// CUDA context for the target device. context: Arc, /// Bytes to pre-allocate to warm the pool. reserve_size: usize, /// Optional threshold above which memory is returned to the system on free. release_threshold: Option, } impl CudaMemPoolBuilder { /// Create a new builder with the required reserve size. /// /// # Arguments /// * `context` - CUDA context for the device /// * `reserve_size` - Number of bytes to pre-allocate to warm the pool pub fn new(context: Arc, reserve_size: usize) -> Self { Self { context, reserve_size, release_threshold: None, } } /// Set the release threshold for the pool. /// /// Memory above this threshold is returned to the system when freed. /// If not set, no release threshold is configured (CUDA default behavior). pub fn release_threshold(mut self, threshold: u64) -> Self { self.release_threshold = Some(threshold); self } /// Build the CUDA memory pool. /// /// This will: /// 1. Create the pool /// 2. Set the release threshold if configured /// 3. Pre-allocate and free memory to warm the pool pub fn build(self) -> Result { // Initialize pool properties let mut props: CUmemPoolProps = unsafe { std::mem::zeroed() }; props.allocType = CUmemAllocationType::CU_MEM_ALLOCATION_TYPE_PINNED; props.location.type_ = CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE; props.location.id = self.context.cu_device(); let mut pool: CUmemoryPool = ptr::null_mut(); // Create the pool let result = unsafe { sys::cuMemPoolCreate(&mut pool, &props) }; if result != CUresult::CUDA_SUCCESS { return Err(anyhow!("cuMemPoolCreate failed with error: {:?}", result)); } // Set release threshold if configured if let Some(threshold) = self.release_threshold { let result = unsafe { sys::cuMemPoolSetAttribute( pool, CUmemPool_attribute::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold as *const u64 as *mut std::ffi::c_void, ) }; if result != CUresult::CUDA_SUCCESS { // Clean up on failure unsafe { sys::cuMemPoolDestroy(pool) }; return Err(anyhow!( "cuMemPoolSetAttribute failed with error: {:?}", result )); } } let cuda_pool = CudaMemPool { inner: Mutex::new(pool), }; // Warm the pool by pre-allocating and freeing memory if self.reserve_size > 0 { // Create a temporary stream for warming let stream = self.context.new_stream()?; // Allocate to warm the pool (using safe variant) let ptr = cuda_pool.alloc_async(self.reserve_size, &stream)?; // Free back to pool (memory stays reserved) cuda_pool.free_async(ptr, &stream)?; // Synchronize to ensure operations complete // SAFETY: stream.cu_stream() is valid for the lifetime of `stream` let result = unsafe { sys::cuStreamSynchronize(stream.cu_stream()) }; if result != CUresult::CUDA_SUCCESS { return Err(anyhow!( "cuStreamSynchronize failed with error: {:?}", result )); } } Ok(cuda_pool) } } /// Safe wrapper around a CUDA memory pool. /// /// The pool amortizes allocation overhead by maintaining a reservoir of device memory. /// Allocations are fast sub-allocations from this reservoir, and frees return memory /// to the pool rather than the OS (until the release threshold is exceeded). /// /// # Thread Safety /// /// This type uses internal locking to serialize host-side calls to CUDA driver APIs. /// `cuMemAllocFromPoolAsync` is not host-thread reentrant, so concurrent calls from /// multiple threads must be serialized. The GPU-side operations remain asynchronous /// and stream-ordered. /// /// Use [`CudaMemPoolBuilder`] for configurable pool creation with pre-allocation. pub struct CudaMemPool { /// Mutex protecting the pool handle for host-thread serialization. /// /// CUDA's `cuMemAllocFromPoolAsync` does not guarantee host-thread reentrancy, /// so all calls to the pool must be serialized on the host side. inner: Mutex, } // SAFETY: CudaMemPool is Send because the Mutex serializes all host-side access // to the pool handle, and CUDA driver state is thread-safe when properly serialized. unsafe impl Send for CudaMemPool {} // SAFETY: CudaMemPool is Sync because all access to the pool handle goes through // the Mutex, which serializes host-thread access. The CUDA driver requires this // serialization because cuMemAllocFromPoolAsync is not host-thread reentrant. unsafe impl Sync for CudaMemPool {} impl CudaMemPool { /// Create a builder for a new CUDA memory pool. /// /// # Arguments /// * `context` - CUDA context for the device /// * `reserve_size` - Number of bytes to pre-allocate to warm the pool pub fn builder(context: Arc, reserve_size: usize) -> CudaMemPoolBuilder { CudaMemPoolBuilder::new(context, reserve_size) } /// Allocate memory from the pool asynchronously. /// /// This is the safe variant that takes a `&CudaStream` reference, ensuring /// the stream is valid for the duration of the call. /// /// The allocation is stream-ordered; the memory is available for use /// after all preceding operations on the stream complete. /// /// # Host Serialization /// /// This method acquires an internal mutex because `cuMemAllocFromPoolAsync` /// is not host-thread reentrant. The allocation itself is stream-ordered on /// the GPU side. /// /// # Arguments /// * `size` - Size in bytes to allocate /// * `stream` - CUDA stream for async ordering /// /// # Returns /// Device pointer to the allocated memory pub fn alloc_async(&self, size: usize, stream: &CudaStream) -> Result { // SAFETY: stream.cu_stream() returns a valid handle owned by the CudaStream, // and the borrow ensures the stream lives for the duration of this call. unsafe { self.alloc_async_raw(size, stream.cu_stream()) } } /// Allocate memory from the pool asynchronously (raw stream handle variant). /// /// This is the unsafe variant for use when you have a raw `CUstream` handle /// from sources other than cudarc's `CudaStream`. /// /// # Host Serialization /// /// This method acquires an internal mutex because `cuMemAllocFromPoolAsync` /// is not host-thread reentrant. /// /// # Arguments /// * `size` - Size in bytes to allocate /// * `stream` - Raw CUDA stream handle for async ordering /// /// # Returns /// Device pointer to the allocated memory /// /// # Safety /// /// The caller must ensure that `stream` is a valid CUDA stream handle that /// will remain valid for the duration of this call. pub unsafe fn alloc_async_raw(&self, size: usize, stream: CUstream) -> Result { let pool = self .inner .lock() .map_err(|e| anyhow!("mutex poisoned: {}", e))?; let mut ptr: u64 = 0; let result = unsafe { sys::cuMemAllocFromPoolAsync(&mut ptr, size, *pool, stream) }; if result != CUresult::CUDA_SUCCESS { return Err(anyhow!( "cuMemAllocFromPoolAsync failed with error: {:?}", result )); } Ok(ptr) } /// Free memory back to the pool asynchronously. /// /// This is the safe variant that takes a `&CudaStream` reference. /// /// The memory is returned to the pool's reservoir (not the OS) and can be /// reused by subsequent allocations. The free is stream-ordered. /// /// # Arguments /// * `ptr` - Device pointer previously allocated from this pool /// * `stream` - CUDA stream for async ordering pub fn free_async(&self, ptr: u64, stream: &CudaStream) -> Result<()> { // SAFETY: stream.cu_stream() returns a valid handle owned by the CudaStream, // and the borrow ensures the stream lives for the duration of this call. unsafe { self.free_async_raw(ptr, stream.cu_stream()) } } // NOTE: Unlike alloc_async_raw, this method does NOT acquire the pool mutex. // The mutex in alloc_async_raw ensures each allocation returns a unique pointer. // cuMemFreeAsync only enqueues a stream-ordered free operation for that unique // pointer - multiple threads can safely enqueue frees for different unique pointers // concurrently. The actual return-to-pool happens asynchronously on the GPU side. /// Free memory back to the pool asynchronously (raw stream handle variant). /// /// This is the unsafe variant for use when you have a raw `CUstream` handle. /// /// The memory is returned to the pool's reservoir (not the OS) and can be /// reused by subsequent allocations. The free is stream-ordered. /// /// # Arguments /// * `ptr` - Device pointer previously allocated from this pool /// * `stream` - Raw CUDA stream handle for async ordering /// /// # Safety /// /// The caller must ensure that: /// - `ptr` is a valid device pointer previously allocated from this pool /// - `stream` is a valid CUDA stream handle pub unsafe fn free_async_raw(&self, ptr: u64, stream: CUstream) -> Result<()> { let result = unsafe { sys::cuMemFreeAsync(ptr, stream) }; if result != CUresult::CUDA_SUCCESS { return Err(anyhow!("cuMemFreeAsync failed with error: {:?}", result)); } Ok(()) } } impl Drop for CudaMemPool { fn drop(&mut self) { // No need to lock - we have &mut self so exclusive access is guaranteed let pool = self .inner .get_mut() .expect("mutex should not be poisoned during drop"); // Destroy the pool, releasing all memory back to the system let result = unsafe { sys::cuMemPoolDestroy(*pool) }; if result != CUresult::CUDA_SUCCESS { tracing::warn!("cuMemPoolDestroy failed with error: {:?}", result); } } } #[cfg(all(test, feature = "testing-cuda"))] mod tests { use super::*; #[test] fn test_pool_creation_with_builder() { // Skip if no CUDA device available let context = match CudaContext::new(0) { Ok(ctx) => ctx, Err(e) => { eprintln!("Skipping test - no CUDA device: {:?}", e); return; } }; // Test builder with reserve size and release threshold let result = CudaMemPool::builder(context.clone(), 1024 * 1024) // 1 MiB reserve .release_threshold(64 * 1024 * 1024) // 64 MiB threshold .build(); if result.is_err() { eprintln!("Skipping test - pool creation failed: {:?}", result.err()); return; } let pool = result.unwrap(); drop(pool); } #[test] fn test_pool_creation_no_threshold() { // Skip if no CUDA device available let context = match CudaContext::new(0) { Ok(ctx) => ctx, Err(e) => { eprintln!("Skipping test - no CUDA device: {:?}", e); return; } }; // Test builder without release threshold let result = CudaMemPool::builder(context, 0).build(); if result.is_err() { eprintln!("Skipping test - pool creation failed: {:?}", result.err()); return; } let pool = result.unwrap(); drop(pool); } }