use crate::driver::{result, sys};

use super::alloc::DeviceRepr;
use super::core::{CudaDevice, CudaFunction, CudaModule, CudaStream};

use std::sync::Arc;

impl CudaDevice {
    /// Whether a module and function are currently loaded into the device.
    pub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool {
        let modules = self.modules.read();
        #[cfg(not(feature = "no-std"))]
        let modules = modules.unwrap();
        modules
            .get(module_name)
            .map_or(false, |module| module.has_func(func_name))
    }

    /// Retrieves a [CudaFunction] that was registered under `module_name` and `func_name`.
    pub fn get_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> Option<CudaFunction> {
        let modules = self.modules.read();
        #[cfg(not(feature = "no-std"))]
        let modules = modules.unwrap();
        modules
            .get(module_name)
            .and_then(|m| m.get_func(func_name))
            .map(|cu_function| CudaFunction {
                cu_function,
                device: self.clone(),
            })
    }
}

impl CudaModule {
    /// Returns a reference to the function with `name`. If the function
    /// was not already loaded into this [CudaModule], `None` is returned.
    pub(crate) fn get_func(&self, name: &str) -> Option<sys::CUfunction> {
        self.functions.get(name).cloned()
    }

    pub(crate) fn has_func(&self, name: &str) -> bool {
        self.functions.contains_key(name)
    }
}

impl CudaFunction {
    #[inline(always)]
    unsafe fn launch_async_impl(
        self,
        cfg: LaunchConfig,
        params: &mut [*mut std::ffi::c_void],
    ) -> Result<(), result::DriverError> {
        self.device.bind_to_thread()?;
        result::launch_kernel(
            self.cu_function,
            cfg.grid_dim,
            cfg.block_dim,
            cfg.shared_mem_bytes,
            self.device.stream,
            params,
        )
    }

    #[inline(always)]
    unsafe fn par_launch_async_impl(
        self,
        stream: &CudaStream,
        cfg: LaunchConfig,
        params: &mut [*mut std::ffi::c_void],
    ) -> Result<(), result::DriverError> {
        self.device.bind_to_thread()?;
        result::launch_kernel(
            self.cu_function,
            cfg.grid_dim,
            cfg.block_dim,
            cfg.shared_mem_bytes,
            stream.stream,
            params,
        )
    }
}

/// Configuration for [result::launch_kernel]
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15)
/// for a description of each parameter.
#[derive(Clone, Copy, Debug)]
pub struct LaunchConfig {
    /// (width, height, depth) of grid in blocks
    pub grid_dim: (u32, u32, u32),

    /// (x, y, z) dimension of each thread block
    pub block_dim: (u32, u32, u32),

    /// Dynamic shared-memory size per thread block in bytes
    pub shared_mem_bytes: u32,
}

impl LaunchConfig {
    /// Creates a [LaunchConfig] with:
    /// - block_dim == `1024`
    /// - grid_dim == `(n + 1023) / 1024`
    /// - shared_mem_bytes == `0`
    pub fn for_num_elems(n: u32) -> Self {
        const NUM_THREADS: u32 = 1024;
        let num_blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
        Self {
            grid_dim: (num_blocks, 1, 1),
            block_dim: (NUM_THREADS, 1, 1),
            shared_mem_bytes: 0,
        }
    }
}
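
// A minimal sketch (ours, not part of the original suite) of how
// `for_num_elems` rounds element counts up to whole 1024-thread blocks.
// The module and test names below are hypothetical, purely for illustration.
#[cfg(test)]
mod launch_config_doc {
    use super::LaunchConfig;

    #[test]
    fn doc_for_num_elems_rounds_up() {
        // 1 element still occupies one full 1024-thread block...
        assert_eq!(LaunchConfig::for_num_elems(1).grid_dim, (1, 1, 1));
        // ...and 1025 elements spill into a second block.
        assert_eq!(LaunchConfig::for_num_elems(1025).grid_dim, (2, 1, 1));
        // block_dim and shared_mem_bytes are fixed by this constructor.
        assert_eq!(LaunchConfig::for_num_elems(1).block_dim, (1024, 1, 1));
        assert_eq!(LaunchConfig::for_num_elems(1).shared_mem_bytes, 0);
    }
}
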
/// Consumes a [CudaFunction] to execute asynchronously on the device with
/// params determined by generic parameter `Params`.
///
/// This is impl'd multiple times for different numbers and types of params. In
/// general, `Params` should impl [DeviceRepr].
///
/// ```ignore
/// # use cudarc::driver::*;
/// # let dev = CudaDevice::new(0).unwrap();
/// let my_kernel: CudaFunction = dev.get_func("my_module", "my_kernel").unwrap();
/// let cfg: LaunchConfig = LaunchConfig {
///     grid_dim: (1, 1, 1),
///     block_dim: (1, 1, 1),
///     shared_mem_bytes: 0,
/// };
/// let params = (1i32, 2u64, 3usize);
/// unsafe { my_kernel.launch(cfg, params) }.unwrap();
/// ```
///
/// # Safety
///
/// This is really never safe, because there's no guarantee that `Params`
/// will work for any [CudaFunction] passed in. Great care should be taken
/// to ensure that [CudaFunction] works with `Params` and that the correct
/// parameters have `&mut` in front of them.
///
/// Additionally, kernels can mutate data that is marked as immutable,
/// such as `&CudaSlice`.
///
/// See [LaunchAsync::launch] for more details.
pub unsafe trait LaunchAsync<Params> {
    /// Launches the [CudaFunction] with the corresponding `Params`.
    ///
    /// # Safety
    ///
    /// This method is **very** unsafe.
    ///
    /// See the cuda documentation notes on this as well.
    ///
    /// 1. `params` can be changed regardless of `&` or `&mut` usage.
    /// 2. `params` will be changed at some later point after the
    ///    function returns because the kernel is executed async.
    /// 3. There are no guarantees that the `params`
    ///    are the correct number/types/order for `func`.
    /// 4. Specifying the wrong values for [LaunchConfig] can result
    ///    in accessing/modifying values past memory limits.
    ///
    /// ## Asynchronous mutation
    ///
    /// Since this library queues kernels to be launched on a single
    /// stream, and really the only way to modify [crate::driver::CudaSlice] is through
    /// kernels, mutating the same [crate::driver::CudaSlice] with multiple kernels
    /// is safe. This is because each kernel is executed sequentially
    /// on the stream.
    ///
    /// **Modifying a value on the host that is in use by a
    /// kernel is undefined behavior.** But it is hard to do
    /// accidentally.
    ///
    /// Also for this reason, do not pass in any values to kernels
    /// that can be modified on the host. This is the reason
    /// [DeviceRepr] is not implemented for rust primitive
    /// references.
    ///
    /// ## Use after free
    ///
    /// Since the drop implementation for [crate::driver::CudaSlice] also occurs
    /// on the device's single stream, any kernels launched before
    /// the drop will complete before the value is actually freed.
    ///
    /// **If you launch a kernel or drop a value on a different stream
    /// this may not hold.**
    unsafe fn launch(self, cfg: LaunchConfig, params: Params) -> Result<(), result::DriverError>;

    /// Launch the function on a stream concurrent to the device's default
    /// work stream.
    ///
    /// # Safety
    /// This method is even more unsafe than [LaunchAsync::launch]; all the same rules apply,
    /// except now things are executing in parallel to each other.
    ///
    /// That means that if any of the kernels modify the same memory location, you'll get race
    /// conditions or potentially undefined behavior.
    unsafe fn launch_on_stream(
        self,
        stream: &CudaStream,
        cfg: LaunchConfig,
        params: Params,
    ) -> Result<(), result::DriverError>;
}
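
// For orientation, a hand-written sketch of roughly what
// `impl_launch!([A, B], [0, 1])` below generates (layout ours; the real
// impls come from the macro invocations that follow):
//
// unsafe impl<A: DeviceRepr, B: DeviceRepr> LaunchAsync<(A, B)> for CudaFunction {
//     unsafe fn launch(
//         self,
//         cfg: LaunchConfig,
//         args: (A, B),
//     ) -> Result<(), result::DriverError> {
//         // Each tuple element is erased into a `*mut c_void` kernel param.
//         let params = &mut [args.0.as_kernel_param(), args.1.as_kernel_param()];
//         self.launch_async_impl(cfg, params)
//     }
//
//     // `launch_on_stream` is identical, except it targets the given stream
//     // via `par_launch_async_impl` instead of the device's default stream.
//     unsafe fn launch_on_stream(
//         self,
//         stream: &CudaStream,
//         cfg: LaunchConfig,
//         args: (A, B),
//     ) -> Result<(), result::DriverError> {
//         let params = &mut [args.0.as_kernel_param(), args.1.as_kernel_param()];
//         self.par_launch_async_impl(stream, cfg, params)
//     }
// }
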
macro_rules! impl_launch {
    ([$($Vars:tt),*], [$($Idx:tt),*]) => {
        unsafe impl<$($Vars: DeviceRepr),*> LaunchAsync<($($Vars, )*)> for CudaFunction {
            #[inline(always)]
            unsafe fn launch(
                self,
                cfg: LaunchConfig,
                args: ($($Vars, )*)
            ) -> Result<(), result::DriverError> {
                let params = &mut [$(args.$Idx.as_kernel_param(), )*];
                self.launch_async_impl(cfg, params)
            }

            #[inline(always)]
            unsafe fn launch_on_stream(
                self,
                stream: &CudaStream,
                cfg: LaunchConfig,
                args: ($($Vars, )*)
            ) -> Result<(), result::DriverError> {
                let params = &mut [$(args.$Idx.as_kernel_param(), )*];
                self.par_launch_async_impl(stream, cfg, params)
            }
        }
    };
}

impl_launch!([A], [0]);
impl_launch!([A, B], [0, 1]);
impl_launch!([A, B, C], [0, 1, 2]);
impl_launch!([A, B, C, D], [0, 1, 2, 3]);
impl_launch!([A, B, C, D, E], [0, 1, 2, 3, 4]);
impl_launch!([A, B, C, D, E, F], [0, 1, 2, 3, 4, 5]);
impl_launch!([A, B, C, D, E, F, G], [0, 1, 2, 3, 4, 5, 6]);
impl_launch!([A, B, C, D, E, F, G, H], [0, 1, 2, 3, 4, 5, 6, 7]);
impl_launch!([A, B, C, D, E, F, G, H, I], [0, 1, 2, 3, 4, 5, 6, 7, 8]);
impl_launch!(
    [A, B, C, D, E, F, G, H, I, J],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
);
impl_launch!(
    [A, B, C, D, E, F, G, H, I, J, K],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
);
impl_launch!(
    [A, B, C, D, E, F, G, H, I, J, K, L],
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
);

#[cfg(test)]
mod tests {
    use std::{time::Instant, vec::Vec};

    use crate::{
        driver::{DeviceSlice, DriverError},
        nvrtc::compile_ptx_with_opts,
    };

    use super::*;

    #[test]
    fn test_mut_into_kernel_param_no_inc_rc() {
        let device = CudaDevice::new(0).unwrap();
        let t = device.htod_copy([0.0f32; 1].to_vec()).unwrap();
        let _r = t.clone();
        assert_eq!(Arc::strong_count(&device), 3);
        let _ = (&t).as_kernel_param();
        assert_eq!(Arc::strong_count(&device), 3);
    }

    #[test]
    fn test_ref_into_kernel_param_inc_rc() {
        let device = CudaDevice::new(0).unwrap();
        let t = device.htod_copy([0.0f32; 1].to_vec()).unwrap();
        let _r = t.clone();
        assert_eq!(Arc::strong_count(&device), 3);
        let _ = (&t).as_kernel_param();
        assert_eq!(Arc::strong_count(&device), 3);
    }

    const SIN_CU: &str = "
extern \"C\" __global__ void sin_kernel(float *out, const float *inp, size_t numel) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numel) {
        out[i] = sin(inp[i]);
    }
}";

    #[test]
    fn test_launch_with_mut_and_ref_cudarc() {
        let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
        let sin_kernel = dev.get_func("sin", "sin_kernel").unwrap();

        let a_host = [-1.0f32, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8];

        let a_dev = dev.htod_copy(a_host.clone().to_vec()).unwrap();
        let mut b_dev = a_dev.clone();

        unsafe {
            sin_kernel.launch(
                LaunchConfig::for_num_elems(10),
                (&mut b_dev, &a_dev, 10usize),
            )
        }
        .unwrap();

        let b_host = dev.sync_reclaim(b_dev).unwrap();

        for (a_i, b_i) in a_host.iter().zip(b_host.iter()) {
            let expected = a_i.sin();
            assert!((b_i - expected).abs() <= 1e-6);
        }

        drop(a_dev);
    }

    #[test]
    fn test_large_launches() {
        let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
        for numel in [256, 512, 1024, 1280, 1536, 2048] {
            let mut a = Vec::with_capacity(numel);
            a.resize(numel, 1.0f32);

            let a = dev.htod_copy(a).unwrap();
            let mut b = dev.alloc_zeros::<f32>(numel).unwrap();

            let sin_kernel = dev.get_func("sin", "sin_kernel").unwrap();
            let cfg = LaunchConfig::for_num_elems(numel as u32);
            unsafe { sin_kernel.launch(cfg, (&mut b, &a, numel)) }.unwrap();

            let b = dev.sync_reclaim(b).unwrap();
            for v in b {
                assert_eq!(v, 0.841471);
            }
        }
    }
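
    // A small sketch (ours) exercising the `has_func` / `get_func` lookup
    // behavior documented above; the "sin"/"sin_kernel" names come from
    // `SIN_CU`, but the test name itself is hypothetical.
    #[test]
    fn test_func_lookup() {
        let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();

        // Nothing is loaded yet, so both lookups miss.
        assert!(!dev.has_func("sin", "sin_kernel"));
        assert!(dev.get_func("sin", "sin_kernel").is_none());

        dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();

        // After loading, the module and function are both registered.
        assert!(dev.has_func("sin", "sin_kernel"));
        assert!(dev.get_func("sin", "sin_kernel").is_some());

        // A function that was never registered in a loaded module also misses.
        assert!(!dev.has_func("sin", "nonexistent"));
    }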
    #[test]
    fn test_launch_with_views() {
        let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();

        let a_host = [-1.0f32, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8];

        let a_dev = dev.htod_copy(a_host.clone().to_vec()).unwrap();
        let mut b_dev = a_dev.clone();

        for i in 0..5 {
            let a_sub = a_dev.try_slice(i * 2..).unwrap();
            assert_eq!(a_sub.len, 10 - 2 * i);
            let mut b_sub = b_dev.try_slice_mut(i * 2..).unwrap();
            assert_eq!(b_sub.len, 10 - 2 * i);
            let f = dev.get_func("sin", "sin_kernel").unwrap();
            unsafe { f.launch(LaunchConfig::for_num_elems(2), (&mut b_sub, &a_sub, 2usize)) }
                .unwrap();
        }

        let b_host = dev.sync_reclaim(b_dev).unwrap();

        for (a_i, b_i) in a_host.iter().zip(b_host.iter()) {
            let expected = a_i.sin();
            assert!((b_i - expected).abs() <= 1e-6);
        }

        drop(a_dev);
    }

    const TEST_KERNELS: &str = "
extern \"C\" __global__ void int_8bit(signed char s_min, char s_max, unsigned char u_min, unsigned char u_max) {
    assert(s_min == -128);
    assert(s_max == 127);
    assert(u_min == 0);
    assert(u_max == 255);
}

extern \"C\" __global__ void int_16bit(signed short s_min, short s_max, unsigned short u_min, unsigned short u_max) {
    assert(s_min == -32768);
    assert(s_max == 32767);
    assert(u_min == 0);
    assert(u_max == 65535);
}

extern \"C\" __global__ void int_32bit(signed int s_min, int s_max, unsigned int u_min, unsigned int u_max) {
    assert(s_min == -2147483648);
    assert(s_max == 2147483647);
    assert(u_min == 0);
    assert(u_max == 4294967295);
}

extern \"C\" __global__ void int_64bit(signed long s_min, long s_max, unsigned long u_min, unsigned long u_max) {
    assert(s_min == -9223372036854775808);
    assert(s_max == 9223372036854775807);
    assert(u_min == 0);
    assert(u_max == 18446744073709551615);
}

extern \"C\" __global__ void floating(float f, double d) {
    assert(fabs(f - 1.2345678) <= 1e-7);
    assert(fabs(d - -10.123456789876543) <= 1e-16);
}
";

    #[test]
    fn test_launch_with_8bit() {
        let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["int_8bit"]).unwrap();

        let f = dev.get_func("tests", "int_8bit").unwrap();
        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1),
                (i8::MIN, i8::MAX, u8::MIN, u8::MAX),
            )
        }
        .unwrap();
        dev.synchronize().unwrap();
    }

    #[test]
    fn test_launch_with_16bit() {
        let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["int_16bit"]).unwrap();

        let f = dev.get_func("tests", "int_16bit").unwrap();
        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1),
                (i16::MIN, i16::MAX, u16::MIN, u16::MAX),
            )
        }
        .unwrap();
        dev.synchronize().unwrap();
    }

    #[test]
    fn test_launch_with_32bit() {
        let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["int_32bit"]).unwrap();

        let f = dev.get_func("tests", "int_32bit").unwrap();
        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1),
                (i32::MIN, i32::MAX, u32::MIN, u32::MAX),
            )
        }
        .unwrap();
        dev.synchronize().unwrap();
    }

    #[test]
    fn test_launch_with_64bit() {
        let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["int_64bit"]).unwrap();

        let f = dev.get_func("tests", "int_64bit").unwrap();
        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1),
                (i64::MIN, i64::MAX, u64::MIN, u64::MAX),
            )
        }
        .unwrap();
        dev.synchronize().unwrap();
    }

    #[test]
    fn test_launch_with_floats() {
        let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["floating"]).unwrap();

        let f = dev.get_func("tests", "floating").unwrap();
        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1),
                (1.2345678f32, -10.123456789876543f64),
            )
        }
        .unwrap();
        dev.synchronize().unwrap();
    }
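
    // A sketch (ours) of launching with a hand-rolled [LaunchConfig] instead
    // of `for_num_elems`: 1024 elements covered exactly by 2 blocks of 512
    // threads. Test name and shapes are hypothetical, for illustration only.
    #[test]
    fn test_launch_with_explicit_config() {
        let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
        let f = dev.get_func("sin", "sin_kernel").unwrap();

        let numel = 1024usize;
        let a = dev.htod_copy(std::vec![1.0f32; numel]).unwrap();
        let mut b = dev.alloc_zeros::<f32>(numel).unwrap();

        // 2 blocks * 512 threads == 1024 threads, one per element.
        let cfg = LaunchConfig {
            grid_dim: (2, 1, 1),
            block_dim: (512, 1, 1),
            shared_mem_bytes: 0,
        };
        unsafe { f.launch(cfg, (&mut b, &a, numel)) }.unwrap();

        let b = dev.sync_reclaim(b).unwrap();
        for v in b {
            assert!((v - 1.0f32.sin()).abs() <= 1e-6);
        }
    }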
    #[cfg(feature = "f16")]
    const HALF_KERNELS: &str = "
#include \"cuda_fp16.h\"

extern \"C\" __global__ void halfs(__half h) {
    assert(__habs(h - __float2half(1.234)) <= __float2half(1e-4));
}
";

    #[cfg(feature = "f16")]
    #[test]
    fn test_launch_with_half() {
        use crate::nvrtc::CompileOptions;
        let ptx = compile_ptx_with_opts(
            HALF_KERNELS,
            CompileOptions {
                include_paths: std::vec!["/usr/include".into()],
                arch: Some("compute_53"),
                ..Default::default()
            },
        )
        .unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["halfs"]).unwrap();

        let f = dev.get_func("tests", "halfs").unwrap();
        unsafe {
            f.launch(
                LaunchConfig::for_num_elems(1),
                (half::f16::from_f32(1.234),),
            )
        }
        .unwrap();
        dev.synchronize().unwrap();
    }

    const SLOW_KERNELS: &str = "
extern \"C\" __global__ void slow_worker(const float *data, const size_t len, float *out) {
    float tmp = 0.0;
    for(size_t i = 0; i < 1000000; i++) {
        tmp += data[i % len];
    }
    *out = tmp;
}
";

    #[test]
    fn test_par_launch() -> Result<(), DriverError> {
        let ptx = compile_ptx_with_opts(SLOW_KERNELS, Default::default()).unwrap();
        let dev = CudaDevice::new(0).unwrap();
        dev.load_ptx(ptx, "tests", &["slow_worker"]).unwrap();
        let slice = dev.alloc_zeros::<f32>(1000)?;
        let mut a = dev.alloc_zeros::<f32>(1)?;
        let mut b = dev.alloc_zeros::<f32>(1)?;
        let cfg = LaunchConfig::for_num_elems(1);

        let start = Instant::now();
        {
            // launch two kernels on the default stream
            let f = dev.get_func("tests", "slow_worker").unwrap();
            unsafe { f.launch(cfg, (&slice, slice.len(), &mut a))? };
            let f = dev.get_func("tests", "slow_worker").unwrap();
            unsafe { f.launch(cfg, (&slice, slice.len(), &mut b))? };
            dev.synchronize()?;
        }
        let double_launch_s = start.elapsed().as_secs_f64();

        let start = Instant::now();
        {
            // create a new stream & launch the kernels concurrently
            let stream = dev.fork_default_stream()?;
            let f = dev.get_func("tests", "slow_worker").unwrap();
            unsafe { f.launch(cfg, (&slice, slice.len(), &mut a))? };
            let f = dev.get_func("tests", "slow_worker").unwrap();
            unsafe { f.launch_on_stream(&stream, cfg, (&slice, slice.len(), &mut b))? };
            dev.wait_for(&stream)?;
            dev.synchronize()?;
        }
        let par_launch_s = start.elapsed().as_secs_f64();

        assert!(
            (double_launch_s - 2.0 * par_launch_s).abs() < 20.0 / 100.0,
            "par={:?} dbl={:?}",
            par_launch_s,
            double_launch_s
        );
        Ok(())
    }
}