// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use super::*;
use super::TransferError;
use crate::block_manager::block::{BlockDataProvider, BlockDataProviderMut};
use anyhow::Result;
use cudarc::driver::CudaStream;
use cudarc::driver::result as cuda_result;
use cudarc::driver::sys::{CUevent_flags, CUresult, cuMemcpyHtoDAsync_v2};
use dynamo_runtime::config::environment_names::cuda as env_cuda;
use std::ops::Range;
use std::sync::Mutex;
use std::sync::OnceLock;
// Global storage for kernel function - store as usize to avoid Send/Sync issues
static COPY_KERNEL_MODULE: Mutex> = Mutex::new(None);
static COPY_KERNEL_FUNCTION: Mutex > = Mutex::new(None);
type CudaMemcpyFnPtr = unsafe fn(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError>;
fn cuda_memcpy_fn_ptr(strategy: &TransferStrategy) -> Result {
match strategy {
TransferStrategy::CudaAsyncH2D => Ok(cuda_memcpy_h2d),
TransferStrategy::CudaAsyncD2H => Ok(cuda_memcpy_d2h),
TransferStrategy::CudaAsyncD2D => Ok(cuda_memcpy_d2d),
_ => Err(TransferError::ExecutionError(
"Unsupported copy strategy for CUDA memcpy async".into(),
)),
}
}
/// Collect K/V cache addresses from source and destination blocks
fn collect_kv_addresses(
sources: &[Source],
destinations: &[Destination],
num_layers: usize,
num_outer_dims: usize,
) -> Result<(Vec, Vec), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
if sources.is_empty() {
return Err(TransferError::ExecutionError(
"No source blocks provided".to_string(),
));
}
let total_address_pairs = sources.len() * num_layers * num_outer_dims;
let mut src_addresses = Vec::with_capacity(total_address_pairs);
let mut dst_addresses = Vec::with_capacity(total_address_pairs);
let src_block_data: Vec<_> = sources.iter().map(|block| block.block_data()).collect();
let dst_block_data: Vec<_> = destinations
.iter()
.map(|block| block.block_data())
.collect();
for (src_data, dst_data) in src_block_data.iter().zip(dst_block_data.iter()) {
for layer_idx in 0..num_layers {
for outer_idx in 0..num_outer_dims {
let src_view = src_data.layer_view(layer_idx, outer_idx)?;
let dst_view = dst_data.layer_view(layer_idx, outer_idx)?;
unsafe {
src_addresses.push(src_view.as_ptr() as u64);
dst_addresses.push(dst_view.as_ptr() as u64);
}
}
}
}
Ok((src_addresses, dst_addresses))
}
/// Launch CUDA kernel directly with pinned buffer pointers (no address copying)
unsafe fn launch_copy_kernel_direct(
src_pinned_ptr: u64,
dst_pinned_ptr: u64,
address_count: usize,
layer_size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
// Get kernel function
let kernel = get_copy_kernel()?;
tracing::debug!(
"LAUNCHING KERNEL: {} pairs, src=0x{:x}, dst=0x{:x}",
address_count,
src_pinned_ptr,
dst_pinned_ptr
);
let threads_per_block = 256u32;
let max_blocks = 1024u32;
let blocks_needed = std::cmp::min(max_blocks, address_count as u32);
let grid_dim = (blocks_needed, 1, 1);
let block_dim = (threads_per_block, 1, 1);
// cuLaunchKernel expects pointers to parameter values
let src_ptr_param = src_pinned_ptr;
let dst_ptr_param = dst_pinned_ptr;
let size_param = layer_size;
let num_pairs_param = address_count as i32;
let params = [
&src_ptr_param as *const _ as *mut std::ffi::c_void,
&dst_ptr_param as *const _ as *mut std::ffi::c_void,
&size_param as *const _ as *mut std::ffi::c_void,
&num_pairs_param as *const _ as *mut std::ffi::c_void,
];
let result = unsafe {
cudarc::driver::sys::cuLaunchKernel(
kernel,
grid_dim.0,
grid_dim.1,
grid_dim.2,
block_dim.0,
block_dim.1,
block_dim.2,
0, // shared memory
stream.cu_stream(),
params.as_ptr() as *mut *mut std::ffi::c_void,
std::ptr::null_mut(), // extra
)
};
if result != cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::error!(
"Kernel launch failed: {:?} - kernel params: {} pairs, layer_size={}, src=0x{:x}, dst=0x{:x}",
result,
address_count,
layer_size,
src_pinned_ptr,
dst_pinned_ptr
);
return Err(TransferError::ExecutionError(format!(
"CUDA kernel launch failed: {:?} (address_count={}, layer_size={})",
result, address_count, layer_size
)));
}
Ok(())
}
#[derive(Clone, Copy, Debug)]
struct CachedBlockDimensions {
num_layers: usize,
num_outer_dims: usize,
layer_size: usize,
}
static BLOCK_DIMENSIONS_CACHE: OnceLock = OnceLock::new();
fn get_cached_block_dimensions(
block: &T,
) -> Result {
Ok(*BLOCK_DIMENSIONS_CACHE
.get_or_init(|| calculate_block_dimensions_from_layout(block).unwrap()))
}
fn calculate_block_dimensions_from_layout(
block: &T,
) -> Result {
let block_data = block.block_data();
// Get dimensions directly from layout (pre-computed values)
let num_layers = block_data.num_layers();
let num_outer_dims = block_data.num_outer_dims();
let layer_size = block_data.layer_view(0, 0).map(|v| v.size()).unwrap_or(0);
Ok(CachedBlockDimensions {
num_layers,
num_outer_dims,
layer_size,
})
}
pub fn copy_blocks_with_customized_kernel<'a, Source, Destination>(
sources: &'a [Source],
destinations: &'a mut [Destination],
stream: &CudaStream,
ctx: &crate::block_manager::block::transfer::TransferContext,
) -> Result<(), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
let _context_guard = stream.context().bind_to_thread();
// Get cached dimensions (calculated once per program lifetime!)
let dims = get_cached_block_dimensions(&sources[0])?;
// Use cached dimensions
let (src_addresses, dst_addresses) =
collect_kv_addresses(sources, destinations, dims.num_layers, dims.num_outer_dims)?;
tracing::debug!(
"Using vectorized_copy for {} blocks [{}L×{}O×{}B], {} address pairs",
sources.len(),
dims.num_layers,
dims.num_outer_dims,
dims.layer_size,
src_addresses.len()
);
let size = src_addresses.len() * std::mem::size_of::();
let pool = ctx.cuda_mem_pool().ok_or_else(|| {
TransferError::ExecutionError(
"TransferContext was not instantiated with a CudaPool; please report this error"
.to_string(),
)
})?;
// Allocate DEVICE memory from pool (stream-ordered)
let src_buffer = pool.alloc_async(size, stream).map_err(|e| {
TransferError::ExecutionError(format!("CUDA pool allocation failed: {}", e))
})?;
let dst_buffer = pool.alloc_async(size, stream).map_err(|e| {
TransferError::ExecutionError(format!("CUDA pool allocation failed: {}", e))
})?;
// Copy address buffers from host to device using stream-ordered H2D memcpy
let result_src = unsafe {
cuMemcpyHtoDAsync_v2(
src_buffer,
src_addresses.as_ptr() as *const std::ffi::c_void,
size,
stream.cu_stream(),
)
};
if result_src != CUresult::CUDA_SUCCESS {
return Err(TransferError::ExecutionError(format!(
"H2D memcpy for src buffer failed: {:?}",
result_src
)));
}
let result_dst = unsafe {
cuMemcpyHtoDAsync_v2(
dst_buffer,
dst_addresses.as_ptr() as *const std::ffi::c_void,
size,
stream.cu_stream(),
)
};
if result_dst != CUresult::CUDA_SUCCESS {
return Err(TransferError::ExecutionError(format!(
"H2D memcpy for dst buffer failed: {:?}",
result_dst
)));
}
// Record event and synchronize to ensure H2D completes before host vectors drop
// This is critical: the async H2D memcpy is still reading from src_addresses/dst_addresses
// host memory when it returns. We must wait for completion before those vectors are dropped.
let h2d_event = stream
.record_event(Some(CUevent_flags::CU_EVENT_BLOCKING_SYNC))
.map_err(|e| TransferError::ExecutionError(format!("Failed to record H2D event: {}", e)))?;
// Launch kernel (reads from device buffers)
unsafe {
launch_copy_kernel_direct(
src_buffer,
dst_buffer,
src_addresses.len(),
dims.layer_size,
stream,
)?;
}
// Free buffers immediately (stream-ordered - CUDA ensures kernel completes first)
pool.free_async(src_buffer, stream)
.map_err(|e| TransferError::ExecutionError(format!("Failed to free src buffer: {}", e)))?;
pool.free_async(dst_buffer, stream)
.map_err(|e| TransferError::ExecutionError(format!("Failed to free dst buffer: {}", e)))?;
// By synchronizing here, we enqueue all the work to the stream, then wait.
// There is cpu overheads associated with each of those calls.
// We might as well amortize the transfer of the pointers with those launch overheads.
h2d_event
.synchronize()
.map_err(|e| TransferError::ExecutionError(format!("Failed to sync H2D event: {}", e)))?;
Ok(())
}
/// Copy a block from a source to a destination using CUDA memcpy
pub fn copy_block<'a, Source, Destination>(
sources: &'a Source,
destinations: &'a mut Destination,
stream: &CudaStream,
strategy: TransferStrategy,
) -> Result<(), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
let src_data = sources.block_data();
let dst_data = destinations.block_data_mut();
let memcpy_fn = cuda_memcpy_fn_ptr(&strategy)?;
#[cfg(debug_assertions)]
{
let expected_strategy =
expected_strategy::();
assert_eq!(strategy, expected_strategy);
}
if src_data.is_fully_contiguous() && dst_data.is_fully_contiguous() {
let src_view = src_data.block_view()?;
let mut dst_view = dst_data.block_view_mut()?;
debug_assert_eq!(src_view.size(), dst_view.size());
unsafe {
memcpy_fn(
src_view.as_ptr(),
dst_view.as_mut_ptr(),
src_view.size(),
stream,
)?;
}
} else {
assert_eq!(src_data.num_layers(), dst_data.num_layers());
copy_layers(
0..src_data.num_layers(),
sources,
destinations,
stream,
strategy,
)?;
}
Ok(())
}
/// Copy a range of layers from a source to a destination using CUDA memcpy
pub fn copy_layers<'a, Source, Destination>(
layer_range: Range,
sources: &'a Source,
destinations: &'a mut Destination,
stream: &CudaStream,
strategy: TransferStrategy,
) -> Result<(), TransferError>
where
Source: BlockDataProvider,
Destination: BlockDataProviderMut,
{
let src_data = sources.block_data();
let dst_data = destinations.block_data_mut();
let memcpy_fn = cuda_memcpy_fn_ptr(&strategy)?;
#[cfg(debug_assertions)]
{
let expected_strategy =
expected_strategy::();
assert_eq!(strategy, expected_strategy);
}
for layer_idx in layer_range {
for outer_idx in 0..src_data.num_outer_dims() {
let src_view = src_data.layer_view(layer_idx, outer_idx)?;
let mut dst_view = dst_data.layer_view_mut(layer_idx, outer_idx)?;
debug_assert_eq!(src_view.size(), dst_view.size());
unsafe {
memcpy_fn(
src_view.as_ptr(),
dst_view.as_mut_ptr(),
src_view.size(),
stream,
)?;
}
}
}
Ok(())
}
/// Helper function to perform the appropriate CUDA memcpy based on storage types
// Allow dead code because it's used in debug assertions
#[allow(dead_code)]
fn expected_strategy() -> TransferStrategy {
match (
std::any::TypeId::of::(),
std::any::TypeId::of::(),
) {
(src, dst)
if src == std::any::TypeId::of::()
&& dst == std::any::TypeId::of::() =>
{
TransferStrategy::CudaAsyncH2D
}
(src, dst)
if src == std::any::TypeId::of::()
&& dst == std::any::TypeId::of::() =>
{
TransferStrategy::CudaAsyncD2H
}
(src, dst)
if src == std::any::TypeId::of::()
&& dst == std::any::TypeId::of::() =>
{
TransferStrategy::CudaAsyncD2D
}
_ => TransferStrategy::Invalid,
}
}
/// H2D Implementation
#[inline(always)]
unsafe fn cuda_memcpy_h2d(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
debug_assert!(!src_ptr.is_null(), "Source host pointer is null");
debug_assert!(!dst_ptr.is_null(), "Destination device pointer is null");
unsafe {
let src_slice = std::slice::from_raw_parts(src_ptr, size);
cuda_result::memcpy_htod_async(dst_ptr as u64, src_slice, stream.cu_stream())
.map_err(|e| TransferError::ExecutionError(format!("CUDA H2D memcpy failed: {}", e)))?
};
Ok(())
}
/// D2H Implementation
#[inline(always)]
unsafe fn cuda_memcpy_d2h(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
debug_assert!(!src_ptr.is_null(), "Source device pointer is null");
debug_assert!(!dst_ptr.is_null(), "Destination host pointer is null");
unsafe {
let dst_slice = std::slice::from_raw_parts_mut(dst_ptr, size);
cuda_result::memcpy_dtoh_async(dst_slice, src_ptr as u64, stream.cu_stream())
.map_err(|e| TransferError::ExecutionError(format!("CUDA D2H memcpy failed: {}", e)))?;
}
Ok(())
}
/// D2D Implementation
#[inline(always)]
unsafe fn cuda_memcpy_d2d(
src_ptr: *const u8,
dst_ptr: *mut u8,
size: usize,
stream: &CudaStream,
) -> Result<(), TransferError> {
debug_assert!(!src_ptr.is_null(), "Source device pointer is null");
debug_assert!(!dst_ptr.is_null(), "Destination device pointer is null");
debug_assert!(
(src_ptr as usize + size <= dst_ptr as usize)
|| (dst_ptr as usize + size <= src_ptr as usize),
"Source and destination device memory regions must not overlap for D2D copy"
);
unsafe {
cuda_result::memcpy_dtod_async(dst_ptr as u64, src_ptr as u64, size, stream.cu_stream())
.map_err(|e| TransferError::ExecutionError(format!("CUDA D2D memcpy failed: {}", e)))?
};
Ok(())
}
// Load the vectorized_copy module from FATBIN
fn get_copy_kernel_module() -> Result {
let mut module_guard = COPY_KERNEL_MODULE.lock().unwrap();
if let Some(module_ptr) = *module_guard {
return Ok(module_ptr as cudarc::driver::sys::CUmodule);
}
// Load the module on first access
let module = match load_embedded_fatbin() {
Ok(module) => {
tracing::debug!("Successfully loaded embedded FATBIN module");
module
}
Err(embedded_err) => {
tracing::debug!("Embedded FATBIN loading failed: {:?}", embedded_err);
match load_runtime_fatbin() {
Ok(module) => {
tracing::debug!("Successfully loaded runtime FATBIN module");
module
}
Err(runtime_err) => {
tracing::error!(" Both FATBIN loading methods failed:");
tracing::error!(" Embedded error: {:?}", embedded_err);
tracing::error!(" Runtime error: {:?}", runtime_err);
return Err(TransferError::ExecutionError(
"No vectorized_copy FATBIN found (tried embedded and runtime paths)"
.to_string(),
));
}
}
}
};
let module_ptr = module as usize;
*module_guard = Some(module_ptr);
Ok(module as cudarc::driver::sys::CUmodule)
}
// Get the vectorized_copy function
fn get_copy_kernel() -> Result {
let mut func_guard = COPY_KERNEL_FUNCTION.lock().unwrap();
if let Some(func_ptr) = *func_guard {
return Ok(func_ptr as cudarc::driver::sys::CUfunction);
}
// Load the function on first access
let module = get_copy_kernel_module()?;
let func = unsafe {
let mut func = std::ptr::null_mut();
let func_name = std::ffi::CString::new("vectorised_copy").unwrap();
let result =
cudarc::driver::sys::cuModuleGetFunction(&mut func, module, func_name.as_ptr());
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
func
} else {
return Err(TransferError::ExecutionError(format!(
"Failed to get kernel function: {:?}",
result
)));
}
};
let func_ptr = func as usize;
*func_guard = Some(func_ptr);
Ok(func as cudarc::driver::sys::CUfunction)
}
// Try to load embedded FATBIN (compile-time) - only compiled when FATBIN is available
#[cfg(have_vec_copy_fatbin)]
fn load_embedded_fatbin() -> Result {
// FATBIN was copied to OUT_DIR by build.rs and embedded here
const FATBIN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/vectorized_copy.fatbin"));
tracing::debug!("Loading embedded FATBIN ({} bytes)", FATBIN.len());
unsafe {
let mut module = std::ptr::null_mut();
let result = cudarc::driver::sys::cuModuleLoadData(
&mut module,
FATBIN.as_ptr() as *const std::ffi::c_void,
);
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::debug!("Embedded FATBIN module loaded successfully: {:p}", module);
return Ok(module);
} else {
tracing::error!(
"Embedded FATBIN cuModuleLoadData failed with CUDA error: {:?}",
result
);
}
}
Err(cudarc::driver::DriverError(
cudarc::driver::sys::cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND,
))
}
// Fallback implementation when FATBIN is not available at compile time
#[cfg(not(have_vec_copy_fatbin))]
fn load_embedded_fatbin() -> Result {
tracing::debug!("No embedded FATBIN available (not compiled with have_vec_copy_fatbin)");
Err(cudarc::driver::DriverError(
cudarc::driver::sys::cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND,
))
}
// Try to load FATBIN from filesystem (runtime)
fn load_runtime_fatbin() -> Result {
// 1. Check runtime environment variable first
if let Ok(runtime_path) = std::env::var(env_cuda::DYNAMO_FATBIN_PATH)
&& let Ok(fatbin_data) = std::fs::read(&runtime_path)
{
tracing::debug!("Loading FATBIN from runtime env var: {}", runtime_path);
unsafe {
let mut module = std::ptr::null_mut();
let result = cudarc::driver::sys::cuModuleLoadData(
&mut module,
fatbin_data.as_ptr() as *const std::ffi::c_void,
);
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::debug!("Runtime FATBIN module loaded successfully: {:p}", module);
return Ok(module);
} else {
tracing::error!(
"Runtime FATBIN cuModuleLoadData failed with CUDA error: {:?}",
result
);
}
}
}
// 2. Check standard runtime locations
let runtime_paths = ["./src/block_manager/block/transfer/kernels/vectorized_copy.fatbin"];
for path in &runtime_paths {
if let Ok(fatbin_data) = std::fs::read(path) {
tracing::debug!("Loading FATBIN from runtime path: {}", path);
unsafe {
let mut module = std::ptr::null_mut();
let result = cudarc::driver::sys::cuModuleLoadData(
&mut module,
fatbin_data.as_ptr() as *const std::ffi::c_void,
);
if result == cudarc::driver::sys::cudaError_enum::CUDA_SUCCESS {
tracing::debug!(
"Runtime path FATBIN module loaded successfully: {:p}",
module
);
return Ok(module);
} else {
tracing::error!(
"Runtime path FATBIN cuModuleLoadData failed with CUDA error: {:?}",
result
);
}
}
} else {
tracing::debug!("Could not read FATBIN file: {}", path);
}
}
Err(cudarc::driver::DriverError(
cudarc::driver::sys::cudaError_enum::CUDA_ERROR_FILE_NOT_FOUND,
))
}
#[cfg(all(test, feature = "testing-cuda"))]
mod tests {
use super::*;
use crate::block_manager::storage::{
DeviceAllocator, PinnedAllocator, StorageAllocator, StorageMemset,
};
#[test]
fn test_memset_and_transfer() {
// Create allocators
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
// Create CUDA stream
let stream = ctx.new_stream().unwrap();
// Allocate host and device memory
let mut host = pinned_allocator.allocate(1024).unwrap();
let mut device = device_allocator.allocate(1024).unwrap();
// Set a pattern in host memory
StorageMemset::memset(&mut host, 42, 0, 1024).unwrap();
// Verify host memory was set correctly
unsafe {
let ptr = host.as_ptr();
let slice = std::slice::from_raw_parts(ptr, 1024);
assert!(slice.iter().all(|&x| x == 42));
}
// Copy host to device
unsafe {
cuda_memcpy_h2d(host.as_ptr(), device.as_mut_ptr(), 1024, stream.as_ref()).unwrap();
}
// Synchronize to ensure H2D copy is complete
stream.synchronize().unwrap();
// Clear host memory
StorageMemset::memset(&mut host, 0, 0, 1024).unwrap();
// Verify host memory was cleared
unsafe {
let ptr = host.as_ptr();
let slice = std::slice::from_raw_parts(ptr, 1024);
assert!(slice.iter().all(|&x| x == 0));
}
// Copy back from device to host
unsafe {
cuda_memcpy_d2h(device.as_ptr(), host.as_mut_ptr(), 1024, stream.as_ref()).unwrap();
}
// Synchronize to ensure D2H copy is complete before verifying
stream.synchronize().unwrap();
// Verify the original pattern was restored
unsafe {
let ptr = host.as_ptr();
let slice = std::slice::from_raw_parts(ptr, 1024);
assert!(slice.iter().all(|&x| x == 42));
}
}
// ============================================================================
// CUDA TRANSFER TESTS FOR LAYOUT COMPATIBILITY
// ============================================================================
mod layout_transfer_tests {
use super::*;
use crate::block_manager::layout::{
FullyContiguous, GenericBlockLayout, LayerSeparate, LayoutConfig,
};
const TEST_NUM_BLOCKS: usize = 4;
const TEST_NUM_LAYERS: usize = 3;
const TEST_OUTER_DIM: usize = 2;
const TEST_PAGE_SIZE: usize = 8;
const TEST_INNER_DIM: usize = 16;
const TEST_DTYPE_WIDTH_BYTES: usize = 2;
fn create_test_config() -> LayoutConfig {
LayoutConfig {
num_blocks: TEST_NUM_BLOCKS,
num_layers: TEST_NUM_LAYERS,
outer_dim: TEST_OUTER_DIM,
page_size: TEST_PAGE_SIZE,
inner_dim: TEST_INNER_DIM,
alignment: 256, // GPU-friendly alignment
dtype_width_bytes: TEST_DTYPE_WIDTH_BYTES,
}
}
/// Test H2D transfers between FullyContiguous host and LayerSeparate device layouts
#[test]
fn test_h2d_fc_host_to_ls_device() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let config = create_test_config();
// Create FullyContiguous host layout
let host_layout = FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
// Create LayerSeparate device layout
let device_layout = LayerSeparate::allocate(config, &device_allocator, true).unwrap();
// Test data transfer for each memory region
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
// Verify regions have same size
assert_eq!(
host_region.size(),
device_region.size(),
"Region size mismatch at ({}, {}, {})",
block_idx,
layer_idx,
outer_idx
);
// Create test pattern
let pattern =
((block_idx as u8) << 4) | ((layer_idx as u8) << 2) | (outer_idx as u8);
// Fill host memory with pattern
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(pattern);
}
// Transfer H2D
unsafe {
cuda_memcpy_h2d(
host_region.addr() as *const u8,
device_region.addr() as *mut u8,
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
}
stream.synchronize().unwrap();
// Verify transfers by copying back and checking patterns
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let expected_pattern =
((block_idx as u8) << 4) | ((layer_idx as u8) << 2) | (outer_idx as u8);
// Create temporary verification buffer
let mut verify_buffer =
pinned_allocator.allocate(host_region.size()).unwrap();
// Copy back from device
unsafe {
cuda_memcpy_d2h(
device_region.addr() as *const u8,
verify_buffer.as_mut_ptr(),
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
stream.synchronize().unwrap();
// Verify pattern
unsafe {
let verify_slice = std::slice::from_raw_parts(
verify_buffer.as_ptr(),
host_region.size(),
);
assert!(
verify_slice.iter().all(|&x| x == expected_pattern),
"Pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
block_idx,
layer_idx,
outer_idx,
expected_pattern,
&verify_slice[0..std::cmp::min(8, verify_slice.len())]
);
}
}
}
}
}
/// Test D2H transfers from LayerSeparate device to FullyContiguous host
#[test]
fn test_d2h_ls_device_to_fc_host() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let config = create_test_config();
// Create LayerSeparate device layout (block contiguous)
let device_layout =
LayerSeparate::allocate(config.clone(), &device_allocator, false).unwrap();
// Create FullyContiguous host layout
let host_layout = FullyContiguous::allocate(config, &pinned_allocator).unwrap();
// Initialize device memory with patterns using a temporary host buffer
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let pattern = ((block_idx as u8) << 4)
| ((layer_idx as u8) << 2)
| (outer_idx as u8)
| 0x80;
// Create temp buffer with pattern
let mut temp_buffer =
pinned_allocator.allocate(device_region.size()).unwrap();
unsafe {
let temp_slice = std::slice::from_raw_parts_mut(
temp_buffer.as_mut_ptr(),
device_region.size(),
);
temp_slice.fill(pattern);
}
// Copy pattern to device
unsafe {
cuda_memcpy_h2d(
temp_buffer.as_ptr(),
device_region.addr() as *mut u8,
device_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
}
stream.synchronize().unwrap();
// Clear host layout
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(0);
}
}
}
}
// Transfer D2H
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let device_region = device_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
cuda_memcpy_d2h(
device_region.addr() as *const u8,
host_region.addr() as *mut u8,
device_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
}
stream.synchronize().unwrap();
// Verify patterns in host layout
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let host_region = host_layout
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
let expected_pattern = ((block_idx as u8) << 4)
| ((layer_idx as u8) << 2)
| (outer_idx as u8)
| 0x80;
unsafe {
let host_slice = std::slice::from_raw_parts(
host_region.addr() as *const u8,
host_region.size(),
);
assert!(
host_slice.iter().all(|&x| x == expected_pattern),
"Pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
block_idx,
layer_idx,
outer_idx,
expected_pattern,
&host_slice[0..std::cmp::min(8, host_slice.len())]
);
}
}
}
}
}
/// Test bidirectional transfers with layout compatibility verification
#[test]
fn test_bidirectional_layout_transfers() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
let config = create_test_config();
// Create both layout types
let host_fc = FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
let device_ls_outer =
LayerSeparate::allocate(config.clone(), &device_allocator, true).unwrap();
let device_ls_block =
LayerSeparate::allocate(config, &device_allocator, false).unwrap();
// Test round-trip: Host FC -> Device LS (outer) -> Device LS (block) -> Host FC
for block_idx in 0..TEST_NUM_BLOCKS {
for layer_idx in 0..TEST_NUM_LAYERS {
for outer_idx in 0..TEST_OUTER_DIM {
let original_pattern = ((block_idx as u8) << 4)
| ((layer_idx as u8) << 2)
| (outer_idx as u8)
| 0x40;
// Step 1: Initialize host FC with pattern
let host_region = host_fc
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(original_pattern);
}
// Step 2: Transfer to device LS outer
let device_outer_region = device_ls_outer
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
cuda_memcpy_h2d(
host_region.addr() as *const u8,
device_outer_region.addr() as *mut u8,
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
// Step 3: Transfer between device layouts (D2D)
let device_block_region = device_ls_block
.memory_region(block_idx, layer_idx, outer_idx)
.unwrap();
unsafe {
cuda_memcpy_d2d(
device_outer_region.addr() as *const u8,
device_block_region.addr() as *mut u8,
device_outer_region.size(),
stream.as_ref(),
)
.unwrap();
}
stream.synchronize().unwrap();
// Step 4: Clear host and transfer back
unsafe {
let host_slice = std::slice::from_raw_parts_mut(
host_region.addr() as *mut u8,
host_region.size(),
);
host_slice.fill(0);
}
unsafe {
cuda_memcpy_d2h(
device_block_region.addr() as *const u8,
host_region.addr() as *mut u8,
device_block_region.size(),
stream.as_ref(),
)
.unwrap();
}
stream.synchronize().unwrap();
// Step 5: Verify pattern survived the round trip
unsafe {
let host_slice = std::slice::from_raw_parts(
host_region.addr() as *const u8,
host_region.size(),
);
assert!(
host_slice.iter().all(|&x| x == original_pattern),
"Round-trip pattern mismatch at ({}, {}, {}) - expected {}, got {:?}",
block_idx,
layer_idx,
outer_idx,
original_pattern,
&host_slice[0..std::cmp::min(8, host_slice.len())]
);
}
}
}
}
}
/// Test transfer performance and alignment impact
#[test]
fn test_layout_transfer_alignment_performance() {
let device_allocator = DeviceAllocator::default();
let pinned_allocator = PinnedAllocator::default();
let ctx = device_allocator.ctx().clone();
let stream = ctx.new_stream().unwrap();
// Test different alignments
for alignment in [1, 64, 256, 512] {
let config = LayoutConfig {
num_blocks: 2,
num_layers: 2,
outer_dim: 1,
page_size: 1024,
inner_dim: 256,
alignment,
dtype_width_bytes: 4,
};
let host_layout =
FullyContiguous::allocate(config.clone(), &pinned_allocator).unwrap();
let device_layout = FullyContiguous::allocate(config, &device_allocator).unwrap();
// Measure transfer time (basic timing)
let start = std::time::Instant::now();
for block_idx in 0..2 {
for layer_idx in 0..2 {
let host_region =
host_layout.memory_region(block_idx, layer_idx, 0).unwrap();
let device_region = device_layout
.memory_region(block_idx, layer_idx, 0)
.unwrap();
unsafe {
cuda_memcpy_h2d(
host_region.addr() as *const u8,
device_region.addr() as *mut u8,
host_region.size(),
stream.as_ref(),
)
.unwrap();
}
}
}
stream.synchronize().unwrap();
let duration = start.elapsed();
// Verify alignment was applied correctly
let region = host_layout.memory_region(0, 0, 0).unwrap();
if alignment > 1 {
assert_eq!(
region.addr() % alignment,
0,
"Memory not aligned to {} bytes",
alignment
);
}
println!("Transfer with alignment {} took {:?}", alignment, duration);
}
}
}
}