"vscode:/vscode.git/clone" did not exist on "b75e3bb963bbf3571bad45001b4dee693c93c0e2"
Commit 3ec8c534 authored by yongshk's avatar yongshk
Browse files

Initial commit

parents
use crate::driver::{result, sys};
use super::core::{CudaDevice, CudaSlice, CudaView, CudaViewMut};
use super::device_ptr::{DevicePtr, DevicePtrMut, DeviceSlice};
use std::{marker::Unpin, pin::Pin, sync::Arc, vec::Vec};
/// Something that can be copied to device memory and
/// turned into a parameter for [result::launch_kernel].
///
/// # Safety
///
/// This is unsafe to implement because the type should likely
/// be `#[repr(C)]` to have a well-defined layout in cuda memory,
/// and not all types are valid.
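///
/// A hypothetical sketch of opting in a custom `#[repr(C)]` struct
/// (not a type provided by this crate):
/// ```ignore
/// #[repr(C)]
/// struct MyParams { scale: f32, offset: f32 }
/// unsafe impl DeviceRepr for MyParams {}
/// ```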
pub unsafe trait DeviceRepr {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
self as *const Self as *mut _
}
}
unsafe impl DeviceRepr for bool {}
unsafe impl DeviceRepr for i8 {}
unsafe impl DeviceRepr for i16 {}
unsafe impl DeviceRepr for i32 {}
unsafe impl DeviceRepr for i64 {}
unsafe impl DeviceRepr for i128 {}
unsafe impl DeviceRepr for isize {}
unsafe impl DeviceRepr for u8 {}
unsafe impl DeviceRepr for u16 {}
unsafe impl DeviceRepr for u32 {}
unsafe impl DeviceRepr for u64 {}
unsafe impl DeviceRepr for u128 {}
unsafe impl DeviceRepr for usize {}
unsafe impl DeviceRepr for f32 {}
unsafe impl DeviceRepr for f64 {}
#[cfg(feature = "f16")]
unsafe impl DeviceRepr for half::f16 {}
#[cfg(feature = "f16")]
unsafe impl DeviceRepr for half::bf16 {}
unsafe impl<T: DeviceRepr> DeviceRepr for &mut CudaSlice<T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.cu_device_ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
unsafe impl<T: DeviceRepr> DeviceRepr for &CudaSlice<T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.cu_device_ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
unsafe impl<'a, T: DeviceRepr> DeviceRepr for &CudaView<'a, T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
unsafe impl<'a, T: DeviceRepr> DeviceRepr for &mut CudaViewMut<'a, T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
impl<T> CudaSlice<T> {
/// Takes ownership of the underlying [sys::CUdeviceptr]. **It is up
/// to the owner to free this value**.
///
/// Drops the underlying host_buf if there is one.
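///
/// A minimal sketch of the leak/upgrade round trip (see
/// [CudaDevice::upgrade_device_ptr()]):
/// ```ignore
/// let a = dev.htod_copy(vec![1.0f32, 2.0, 3.0])?;
/// let ptr = a.leak(); // raw CUdeviceptr; no longer freed automatically
/// let b = unsafe { dev.upgrade_device_ptr::<f32>(ptr, 3) }; // re-owned, freed on drop
/// ```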
pub fn leak(mut self) -> sys::CUdeviceptr {
if let Some(host_buf) = std::mem::take(&mut self.host_buf) {
drop(host_buf);
}
let ptr = self.cu_device_ptr;
std::mem::forget(self);
ptr
}
}
impl CudaDevice {
/// Creates a [CudaSlice] from a [sys::CUdeviceptr]. Useful in conjunction with
/// [`CudaSlice::leak()`].
///
/// # Safety
/// - `cu_device_ptr` must be a valid allocation
/// - `cu_device_ptr` must have space for `len * std::mem::size_of::<T>()` bytes
/// - The memory may not be valid for type `T`, so some sort of memset operation
/// should be called on the memory.
pub unsafe fn upgrade_device_ptr<T>(
self: &Arc<Self>,
cu_device_ptr: sys::CUdeviceptr,
len: usize,
) -> CudaSlice<T> {
CudaSlice {
cu_device_ptr,
len,
device: self.clone(),
host_buf: None,
}
}
}
impl CudaDevice {
/// Allocates an empty [CudaSlice] with 0 length.
pub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, result::DriverError> {
self.bind_to_thread()?;
let cu_device_ptr = unsafe {
if self.is_async {
result::malloc_async(self.stream, 0)?
} else {
result::malloc_sync(0)?
}
};
Ok(CudaSlice {
cu_device_ptr,
len: 0,
device: self.clone(),
host_buf: None,
})
}
/// Allocates device memory and increments the reference counter of [CudaDevice].
///
/// # Safety
/// This is unsafe because the device memory is uninitialized after this call.
pub unsafe fn alloc<T: DeviceRepr>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, result::DriverError> {
self.bind_to_thread()?;
let cu_device_ptr = if self.is_async {
result::malloc_async(self.stream, len * std::mem::size_of::<T>())?
} else {
result::malloc_sync(len * std::mem::size_of::<T>())?
};
Ok(CudaSlice {
cu_device_ptr,
len,
device: self.clone(),
host_buf: None,
})
}
/// Allocates device memory with no associated host memory, and memsets
/// the device memory to all 0s.
///
/// # Safety
/// 1. `T` is marked as [ValidAsZeroBits], so the device memory is valid to use
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn alloc_zeros<T: ValidAsZeroBits + DeviceRepr>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, result::DriverError> {
let mut dst = unsafe { self.alloc(len) }?;
self.memset_zeros(&mut dst)?;
Ok(dst)
}
/// Sets all memory to 0 asynchronously.
///
/// # Safety
/// 1. `T` is marked as [ValidAsZeroBits], so the device memory is valid to use
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn memset_zeros<T: ValidAsZeroBits + DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
dst: &mut Dst,
) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
if self.is_async {
unsafe {
result::memset_d8_async(*dst.device_ptr_mut(), 0, dst.num_bytes(), self.stream)
}
} else {
unsafe { result::memset_d8_sync(*dst.device_ptr_mut(), 0, dst.num_bytes()) }
}
}
/// Device to device copy (safe version of [result::memcpy_dtod_async]).
///
/// # Panics
///
/// If the length of the two values are different
///
/// # Safety
/// 1. We are guaranteed that `src` and `dst` are pointers to the same underlying
/// type `T`
/// 2. Since they are both references, they can't have been freed
/// 3. Self is [`Arc<Self>`], and this method increments the rc for self
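///
/// A minimal usage sketch (assuming both buffers already exist and have equal length):
/// ```ignore
/// let src = dev.htod_copy(vec![1.0f32, 2.0])?;
/// let mut dst = dev.alloc_zeros::<f32>(2)?;
/// dev.dtod_copy(&src, &mut dst)?;
/// ```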
pub fn dtod_copy<T: DeviceRepr, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
self.bind_to_thread()?;
if self.is_async {
unsafe {
result::memcpy_dtod_async(
*dst.device_ptr_mut(),
*src.device_ptr(),
src.len() * std::mem::size_of::<T>(),
self.stream,
)
}
} else {
unsafe {
result::memcpy_dtod_sync(
*dst.device_ptr_mut(),
*src.device_ptr(),
src.len() * std::mem::size_of::<T>(),
)
}
}
}
/// Takes ownership of the host data and copies it to device data asynchronously.
///
/// # Safety
///
/// 1. Since `src` is owned by this function, it is safe to copy data. Any actions executed
/// after this will take place after the data has been successfully copied.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_copy<T: Unpin + DeviceRepr>(
self: &Arc<Self>,
src: Vec<T>,
) -> Result<CudaSlice<T>, result::DriverError> {
let mut dst = unsafe { self.alloc(src.len()) }?;
self.htod_copy_into(src, &mut dst)?;
Ok(dst)
}
/// Takes ownership of the host data and copies it to device data asynchronously.
///
/// # Safety
///
/// 1. Since `src` is owned by this function, it is safe to copy data. Any actions executed
/// after this will take place after the data has been successfully copied.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_copy_into<T: DeviceRepr + Unpin>(
self: &Arc<Self>,
src: Vec<T>,
dst: &mut CudaSlice<T>,
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
dst.host_buf = Some(Pin::new(src));
self.bind_to_thread()?;
if self.is_async {
unsafe {
result::memcpy_htod_async(
dst.cu_device_ptr,
dst.host_buf.as_ref().unwrap(),
self.stream,
)
}?
} else {
unsafe { result::memcpy_htod_sync(dst.cu_device_ptr, dst.host_buf.as_ref().unwrap()) }?
}
Ok(())
}
/// Allocates new device memory and synchronously copies data from `src` into the new allocation.
///
/// If you want an asynchronous copy, see [CudaDevice::htod_copy()].
///
/// # Safety
///
/// 1. Since this function doesn't own `src`, it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_sync_copy<T: DeviceRepr>(
self: &Arc<Self>,
src: &[T],
) -> Result<CudaSlice<T>, result::DriverError> {
let mut dst = unsafe { self.alloc(src.len()) }?;
self.htod_sync_copy_into(src, &mut dst)?;
Ok(dst)
}
/// Synchronously copies data from `src` into the new allocation.
///
/// If you want an asynchronous copy, see [CudaDevice::htod_copy()].
///
/// # Panics
///
/// If the lengths of slices are not equal, this method panics.
///
/// # Safety
/// 1. Since this function doesn't own `src`, it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_sync_copy_into<T: DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &[T],
dst: &mut Dst,
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
self.bind_to_thread()?;
if self.is_async {
unsafe { result::memcpy_htod_async(*dst.device_ptr_mut(), src, self.stream) }?;
} else {
unsafe { result::memcpy_htod_sync(*dst.device_ptr_mut(), src) }?;
}
self.synchronize()
}
/// Synchronously copies device memory into host memory.
/// Unlike [`CudaDevice::dtoh_sync_copy_into`], this returns a [`Vec<T>`].
///
/// # Safety
/// 1. Since this function doesn't own `dst` (after returning), it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
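///
/// A minimal host/device round-trip sketch:
/// ```ignore
/// let host = vec![1.0f32, 2.0, 3.0];
/// let dev_buf = dev.htod_sync_copy(&host)?; // host -> device
/// let back = dev.dtoh_sync_copy(&dev_buf)?; // device -> host, Vec<f32>
/// assert_eq!(back, host);
/// ```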
#[allow(clippy::uninit_vec)]
pub fn dtoh_sync_copy<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<Vec<T>, result::DriverError> {
let mut dst = Vec::with_capacity(src.len());
unsafe { dst.set_len(src.len()) };
self.dtoh_sync_copy_into(src, &mut dst)?;
Ok(dst)
}
/// Synchronously copies device memory into host memory
///
/// Use [`CudaDevice::dtoh_sync_copy`] if you need [`Vec<T>`] and can't provide
/// a correctly sized slice.
///
/// # Panics
///
/// If the lengths of slices are not equal, this method panics.
///
/// # Safety
/// 1. Since this function doesn't own `dst`, it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn dtoh_sync_copy_into<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut [T],
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
self.bind_to_thread()?;
if self.is_async {
unsafe { result::memcpy_dtoh_async(dst, *src.device_ptr(), self.stream) }?;
} else {
unsafe { result::memcpy_dtoh_sync(dst, *src.device_ptr()) }?;
}
self.synchronize()
}
/// Synchronously de-allocates `src` and converts it into its host value.
/// You can just [drop] the slice if you don't need the host data.
///
/// # Safety
/// 1. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn sync_reclaim<T: Clone + Default + DeviceRepr + Unpin>(
self: &Arc<Self>,
mut src: CudaSlice<T>,
) -> Result<Vec<T>, result::DriverError> {
let buf = src.host_buf.take();
let mut buf = buf.unwrap_or_else(|| {
let mut b = Vec::with_capacity(src.len);
b.resize(src.len, Default::default());
Pin::new(b)
});
self.dtoh_sync_copy_into(&src, &mut buf)?;
Ok(Pin::into_inner(buf))
}
/// Synchronizes the stream.
pub fn synchronize(self: &Arc<Self>) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
unsafe { result::stream::synchronize(self.stream) }
}
}
/// Marker trait to indicate that the type is valid
/// when all of its bits are set to 0.
///
/// # Safety
/// Not all types are valid when all bits are set to 0.
/// Be very sure when implementing this trait!
pub unsafe trait ValidAsZeroBits {}
unsafe impl ValidAsZeroBits for bool {}
unsafe impl ValidAsZeroBits for i8 {}
unsafe impl ValidAsZeroBits for i16 {}
unsafe impl ValidAsZeroBits for i32 {}
unsafe impl ValidAsZeroBits for i64 {}
unsafe impl ValidAsZeroBits for i128 {}
unsafe impl ValidAsZeroBits for isize {}
unsafe impl ValidAsZeroBits for u8 {}
unsafe impl ValidAsZeroBits for u16 {}
unsafe impl ValidAsZeroBits for u32 {}
unsafe impl ValidAsZeroBits for u64 {}
unsafe impl ValidAsZeroBits for u128 {}
unsafe impl ValidAsZeroBits for usize {}
unsafe impl ValidAsZeroBits for f32 {}
unsafe impl ValidAsZeroBits for f64 {}
#[cfg(feature = "f16")]
unsafe impl ValidAsZeroBits for half::f16 {}
#[cfg(feature = "f16")]
unsafe impl ValidAsZeroBits for half::bf16 {}
unsafe impl<T: ValidAsZeroBits, const M: usize> ValidAsZeroBits for [T; M] {}
/// Implement `ValidAsZeroBits` for tuples if all elements are `ValidAsZeroBits`,
///
/// # Note
/// This will also implement `ValidAsZeroBits` for a tuple with one element
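///
/// For example, `impl_tuples!(A, B)` recursively expands to (a sketch of the expansion):
/// ```ignore
/// unsafe impl<A: ValidAsZeroBits, B: ValidAsZeroBits> ValidAsZeroBits for (A, B) {}
/// unsafe impl<B: ValidAsZeroBits> ValidAsZeroBits for (B,) {}
/// ```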
macro_rules! impl_tuples {
($t:tt) => {
impl_tuples!(@ $t);
};
// the $l is in front of the repetition to prevent parsing ambiguities
($l:tt $(,$t:tt)+) => {
impl_tuples!($($t),+);
impl_tuples!(@ $l $(,$t)+);
};
(@ $($t:tt),+) => {
unsafe impl<$($t: ValidAsZeroBits,)+> ValidAsZeroBits for ($($t,)+) {}
};
}
impl_tuples!(A, B, C, D, E, F, G, H, I, J, K, L);
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_post_build_arc_count() {
let device = CudaDevice::new(0).unwrap();
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_alloc_arc_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.alloc_zeros::<f32>(1).unwrap();
assert!(t.host_buf.is_none());
assert_eq!(Arc::strong_count(&device), 2);
}
#[test]
fn test_post_take_arc_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f32; 5].to_vec()).unwrap();
assert!(t.host_buf.is_some());
assert_eq!(Arc::strong_count(&device), 2);
drop(t);
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_clone_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f64; 10].to_vec()).unwrap();
let r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
drop(t);
assert_eq!(Arc::strong_count(&device), 2);
drop(r);
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_clone_arc_slice_counts() {
let device = CudaDevice::new(0).unwrap();
let t = Arc::new(device.htod_copy::<f64>([0.0; 10].to_vec()).unwrap());
let r = t.clone();
assert_eq!(Arc::strong_count(&device), 2);
drop(t);
assert_eq!(Arc::strong_count(&device), 2);
drop(r);
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_release_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([1.0f32, 2.0, 3.0].to_vec()).unwrap();
#[allow(clippy::redundant_clone)]
let r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
let r_host = device.sync_reclaim(r).unwrap();
assert_eq!(&r_host, &[1.0, 2.0, 3.0]);
assert_eq!(Arc::strong_count(&device), 2);
drop(r_host);
assert_eq!(Arc::strong_count(&device), 2);
}
#[test]
#[ignore = "must be executed by itself"]
fn test_post_alloc_memory() {
let device = CudaDevice::new(0).unwrap();
let (free1, total1) = result::mem_get_info().unwrap();
let t = device.htod_copy([0.0f32; 5].to_vec()).unwrap();
let (free2, total2) = result::mem_get_info().unwrap();
assert_eq!(total1, total2);
assert!(free2 < free1);
drop(t);
device.synchronize().unwrap();
let (free3, total3) = result::mem_get_info().unwrap();
assert_eq!(total2, total3);
assert!(free3 > free2);
assert_eq!(free3, free1);
}
#[test]
fn test_device_copy_to_views() {
let dev = CudaDevice::new(0).unwrap();
let smalls = [
dev.htod_copy(std::vec![-1.0f32, -0.8]).unwrap(),
dev.htod_copy(std::vec![-0.6, -0.4]).unwrap(),
dev.htod_copy(std::vec![-0.2, 0.0]).unwrap(),
dev.htod_copy(std::vec![0.2, 0.4]).unwrap(),
dev.htod_copy(std::vec![0.6, 0.8]).unwrap(),
];
let mut big = dev.alloc_zeros::<f32>(10).unwrap();
let mut offset = 0;
for small in smalls.iter() {
let mut sub = big.try_slice_mut(offset..offset + small.len()).unwrap();
dev.dtod_copy(small, &mut sub).unwrap();
offset += small.len();
}
assert_eq!(
dev.sync_reclaim(big).unwrap(),
[-1.0, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
);
}
#[test]
fn test_leak_and_upgrade() {
let dev = CudaDevice::new(0).unwrap();
let a = dev
.htod_copy(std::vec![1.0f32, 2.0, 3.0, 4.0, 5.0])
.unwrap();
let ptr = a.leak();
let b = unsafe { dev.upgrade_device_ptr::<f32>(ptr, 3) };
assert_eq!(dev.dtoh_sync_copy(&b).unwrap(), &[1.0, 2.0, 3.0]);
let ptr = b.leak();
let c = unsafe { dev.upgrade_device_ptr::<f32>(ptr, 5) };
assert_eq!(dev.dtoh_sync_copy(&c).unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0]);
}
/// See https://github.com/coreylowman/cudarc/issues/160
#[test]
fn test_slice_is_freed_with_correct_context() {
if CudaDevice::count().unwrap() < 2 {
return;
}
let dev0 = CudaDevice::new(0).unwrap();
let slice = dev0.htod_copy(vec![1.0; 10]).unwrap();
let dev1 = CudaDevice::new(1).unwrap();
drop(dev1);
drop(dev0);
drop(slice);
}
/// See https://github.com/coreylowman/cudarc/issues/161
#[test]
fn test_copy_uses_correct_context() {
if CudaDevice::count().unwrap() < 2 {
return;
}
let dev0 = CudaDevice::new(0).unwrap();
let _dev1 = CudaDevice::new(1).unwrap();
let slice = dev0.htod_copy(vec![1.0; 10]).unwrap();
let _out = dev0.dtoh_sync_copy(&slice).unwrap();
}
}
use crate::driver::{
result,
sys::{self, CUfunction_attribute_enum},
};
use super::{alloc::DeviceRepr, device_ptr::DeviceSlice};
use std::{
marker::PhantomData,
ops::{Bound, RangeBounds},
string::String,
};
#[cfg(feature = "no-std")]
use spin::RwLock;
#[cfg(not(feature = "no-std"))]
use std::sync::RwLock;
use std::{collections::BTreeMap, marker::Unpin, pin::Pin, sync::Arc, vec::Vec};
/// A wrapper around [sys::CUdevice], [sys::CUcontext], [sys::CUstream],
/// and [CudaFunction].
///
/// ```rust
/// # use cudarc::driver::CudaDevice;
/// let dev = CudaDevice::new(0).unwrap();
/// ```
///
/// # Safety
/// 1. impl [Drop] to call all the corresponding resource cleanup methods
/// 2. Doesn't impl clone, so you can't have multiple device pointers
/// hanging around.
/// 3. Any allocations enforce that self is an [Arc], meaning no allocation
/// can outlive the [CudaDevice]
#[derive(Debug)]
pub struct CudaDevice {
pub(crate) cu_device: sys::CUdevice,
pub(crate) cu_primary_ctx: sys::CUcontext,
/// The stream that all work is executed on.
pub(crate) stream: sys::CUstream,
/// Used to synchronize with stream
pub(crate) event: sys::CUevent,
pub(crate) modules: RwLock<BTreeMap<String, CudaModule>>,
pub(crate) ordinal: usize,
pub(crate) is_async: bool,
}
unsafe impl Send for CudaDevice {}
unsafe impl Sync for CudaDevice {}
impl CudaDevice {
/// Creates a new [CudaDevice] on device index `ordinal`.
pub fn new(ordinal: usize) -> Result<Arc<Self>, result::DriverError> {
result::init().unwrap();
let cu_device = result::device::get(ordinal as i32).unwrap();
// primary context initialization, can fail with OOM
let cu_primary_ctx = unsafe { result::primary_ctx::retain(cu_device) }?;
unsafe { result::ctx::set_current(cu_primary_ctx) }.unwrap();
// can fail with OOM
let event = result::event::create(sys::CUevent_flags::CU_EVENT_DISABLE_TIMING)?;
let value = unsafe {
result::device::get_attribute(
cu_device,
sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
)?
};
let is_async = value > 0;
let device = CudaDevice {
cu_device,
cu_primary_ctx,
stream: std::ptr::null_mut(),
event,
modules: RwLock::new(BTreeMap::new()),
ordinal,
is_async,
};
Ok(Arc::new(device))
}
pub fn count() -> Result<i32, result::DriverError> {
result::init().unwrap();
result::device::get_count()
}
/// Get the `ordinal` index of this [CudaDevice].
pub fn ordinal(&self) -> usize {
self.ordinal
}
/// Get the underlying [sys::CUdevice] of this [CudaDevice].
///
/// # Safety
/// While this function is marked as safe, actually using the
/// returned object is unsafe.
///
/// **You must not free/release the device pointer**, as it is still
/// owned by the [CudaDevice].
pub fn cu_device(&self) -> &sys::CUdevice {
&self.cu_device
}
/// Get the underlying [sys::CUcontext] of this [CudaDevice].
///
/// # Safety
/// While this function is marked as safe, actually using the
/// returned object is unsafe.
///
/// **You must not free/release the context pointer**, as it is still
/// owned by the [CudaDevice].
pub fn cu_primary_ctx(&self) -> &sys::CUcontext {
&self.cu_primary_ctx
}
/// Get the underlying [sys::CUstream] that this [CudaDevice] executes
/// all of its work on.
///
/// # Safety
/// While this function is marked as safe, actually using the
/// returned object is unsafe.
///
/// **You must not free/release the stream pointer**, as it is still
/// owned by the [CudaDevice].
pub fn cu_stream(&self) -> &sys::CUstream {
&self.stream
}
/// Get the value of the specified attribute of this [CudaDevice].
pub fn attribute(&self, attrib: sys::CUdevice_attribute) -> Result<i32, result::DriverError> {
unsafe { result::device::get_attribute(self.cu_device, attrib) }
}
}
impl Drop for CudaDevice {
fn drop(&mut self) {
self.bind_to_thread().unwrap();
let modules = RwLock::get_mut(&mut self.modules);
#[cfg(not(feature = "no-std"))]
let modules = modules.unwrap();
for (_, module) in modules.iter() {
unsafe { result::module::unload(module.cu_module) }.unwrap();
}
modules.clear();
let stream = std::mem::replace(&mut self.stream, std::ptr::null_mut());
if !stream.is_null() {
unsafe { result::stream::destroy(stream) }.unwrap();
}
let event = std::mem::replace(&mut self.event, std::ptr::null_mut());
if !event.is_null() {
unsafe { result::event::destroy(event) }.unwrap();
}
let ctx = std::mem::replace(&mut self.cu_primary_ctx, std::ptr::null_mut());
if !ctx.is_null() {
unsafe { result::primary_ctx::release(self.cu_device) }.unwrap();
}
}
}
/// Contains a reference counted pointer to both
/// device and host memory allocated for type `T`.
///
/// # Host data
///
/// *This owns the host data it is associated with*. However
/// it is possible to create device memory without having
/// a corresponding host memory, so the host memory is
/// actually [Option].
///
/// # Reclaiming host data
///
/// To reclaim the host data for this device data,
/// use [CudaDevice::sync_reclaim()]. This will
/// perform necessary synchronization to ensure
/// that the device data finishes copying over.
///
/// # Mutating device data
///
/// This can only be done by launching kernels via
/// [crate::driver::LaunchAsync] which is implemented
/// by [CudaDevice]. Pass `&mut CudaSlice<T>`
/// if you want to mutate the underlying data, and `&CudaSlice<T>` otherwise.
///
/// Unfortunately, `&CudaSlice<T>` can **still be mutated
/// by the [CudaFunction]**.
#[derive(Debug)]
pub struct CudaSlice<T> {
pub(crate) cu_device_ptr: sys::CUdeviceptr,
pub(crate) len: usize,
pub(crate) device: Arc<CudaDevice>,
pub(crate) host_buf: Option<Pin<Vec<T>>>,
}
unsafe impl<T: Send> Send for CudaSlice<T> {}
unsafe impl<T: Sync> Sync for CudaSlice<T> {}
impl<T> Drop for CudaSlice<T> {
fn drop(&mut self) {
self.device.bind_to_thread().unwrap();
unsafe {
if self.device.is_async {
result::free_async(self.cu_device_ptr, self.device.stream).unwrap();
} else {
result::free_sync(self.cu_device_ptr).unwrap();
}
}
}
}
impl<T> CudaSlice<T> {
/// Get a clone of the underlying [CudaDevice].
pub fn device(&self) -> Arc<CudaDevice> {
self.device.clone()
}
}
impl<T: DeviceRepr> CudaSlice<T> {
/// Allocates a copy of self and schedules a device-to-device copy of memory.
pub fn try_clone(&self) -> Result<Self, result::DriverError> {
let mut dst = unsafe { self.device.alloc(self.len) }?;
self.device.dtod_copy(self, &mut dst)?;
Ok(dst)
}
}
impl<T: DeviceRepr> Clone for CudaSlice<T> {
fn clone(&self) -> Self {
self.try_clone().unwrap()
}
}
impl<T: Clone + Default + DeviceRepr + Unpin> TryFrom<CudaSlice<T>> for Vec<T> {
type Error = result::DriverError;
fn try_from(value: CudaSlice<T>) -> Result<Self, Self::Error> {
value.device.clone().sync_reclaim(value)
}
}
/// Wrapper around [sys::CUmodule] that also contains
/// the loaded [CudaFunction] associated with this module.
///
/// See [CudaModule::get_fn()] for retrieving function handles.
#[derive(Debug)]
pub(crate) struct CudaModule {
pub(crate) cu_module: sys::CUmodule,
pub(crate) functions: BTreeMap<&'static str, sys::CUfunction>,
}
unsafe impl Send for CudaModule {}
unsafe impl Sync for CudaModule {}
/// Wrapper around [sys::CUfunction]. Used by [crate::driver::LaunchAsync].
#[derive(Debug, Clone)]
pub struct CudaFunction {
pub(crate) cu_function: sys::CUfunction,
pub(crate) device: Arc<CudaDevice>,
}
impl CudaFunction {
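/// Returns the dynamic shared memory size (in bytes) available per block when
/// launching `num_blocks` blocks of `block_size` threads with this function
/// (wraps [sys::cuOccupancyAvailableDynamicSMemPerBlock]).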
pub fn occupancy_available_dynamic_smem_per_block(
&self,
num_blocks: u32,
block_size: u32,
) -> Result<usize, result::DriverError> {
let mut dynamic_smem_size: usize = 0;
unsafe {
sys::cuOccupancyAvailableDynamicSMemPerBlock(
&mut dynamic_smem_size,
self.cu_function,
num_blocks as std::ffi::c_int,
block_size as std::ffi::c_int,
)
.result()?
};
Ok(dynamic_smem_size)
}
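/// Returns the maximum number of active blocks per multiprocessor for this function
/// at the given `block_size` and `dynamic_smem_size`
/// (wraps [sys::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags]).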
pub fn occupancy_max_active_blocks_per_multiprocessor(
&self,
block_size: u32,
dynamic_smem_size: usize,
flags: Option<sys::CUoccupancy_flags_enum>,
) -> Result<u32, result::DriverError> {
let mut num_blocks: std::ffi::c_int = 0;
let flags = flags.unwrap_or(sys::CUoccupancy_flags_enum::CU_OCCUPANCY_DEFAULT);
unsafe {
sys::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
&mut num_blocks,
self.cu_function,
block_size as std::ffi::c_int,
dynamic_smem_size,
flags as std::ffi::c_uint,
)
.result()?
};
Ok(num_blocks as u32)
}
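/// Returns the maximum number of thread clusters that could co-exist on the device
/// for the given launch `config` and `shared_mem_size`
/// (wraps [sys::cuOccupancyMaxActiveClusters]).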
pub fn occupancy_max_active_clusters(
&self,
config: crate::driver::LaunchConfig,
shared_mem_size: u32,
) -> Result<u32, result::DriverError> {
let mut num_clusters: std::ffi::c_int = 0;
let cfg = sys::CUlaunchConfig {
gridDimX: config.grid_dim.0,
gridDimY: config.grid_dim.1,
gridDimZ: config.grid_dim.2,
blockDimX: config.block_dim.0,
blockDimY: config.block_dim.1,
blockDimZ: config.block_dim.2,
sharedMemBytes: shared_mem_size as std::ffi::c_uint,
hStream: self.device.stream,
attrs: std::ptr::null_mut(),
numAttrs: 0,
};
unsafe {
sys::cuOccupancyMaxActiveClusters(&mut num_clusters, self.cu_function, &cfg).result()?
};
Ok(num_clusters as u32)
}
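/// Suggests a `(min_grid_size, block_size)` pair that maximizes occupancy for this
/// function, using `block_size_to_dynamic_smem_size` to derive the per-block dynamic
/// shared memory from a candidate block size
/// (wraps [sys::cuOccupancyMaxPotentialBlockSizeWithFlags]).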
pub fn occupancy_max_potential_block_size(
&self,
block_size_to_dynamic_smem_size: extern "C" fn(block_size: std::ffi::c_int) -> usize,
dynamic_smem_size: usize,
block_size_limit: u32,
flags: Option<sys::CUoccupancy_flags_enum>,
) -> Result<(u32, u32), result::DriverError> {
let mut min_grid_size: std::ffi::c_int = 0;
let mut block_size: std::ffi::c_int = 0;
let flags = flags.unwrap_or(sys::CUoccupancy_flags_enum::CU_OCCUPANCY_DEFAULT);
unsafe {
sys::cuOccupancyMaxPotentialBlockSizeWithFlags(
&mut min_grid_size,
&mut block_size,
self.cu_function,
Some(block_size_to_dynamic_smem_size),
dynamic_smem_size,
block_size_limit as std::ffi::c_int,
flags as std::ffi::c_uint,
)
.result()?
};
Ok((min_grid_size as u32, block_size as u32))
}
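/// Returns the maximum cluster size that can be launched for this function with the
/// given launch `config` and `shared_mem_size`
/// (wraps [sys::cuOccupancyMaxPotentialClusterSize]).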
pub fn occupancy_max_potential_cluster_size(
&self,
config: crate::driver::LaunchConfig,
shared_mem_size: u32,
) -> Result<u32, result::DriverError> {
let mut cluster_size: std::ffi::c_int = 0;
let cfg = sys::CUlaunchConfig {
gridDimX: config.grid_dim.0,
gridDimY: config.grid_dim.1,
gridDimZ: config.grid_dim.2,
blockDimX: config.block_dim.0,
blockDimY: config.block_dim.1,
blockDimZ: config.block_dim.2,
sharedMemBytes: shared_mem_size as std::ffi::c_uint,
hStream: self.device.stream,
attrs: std::ptr::null_mut(),
numAttrs: 0,
};
unsafe {
sys::cuOccupancyMaxPotentialClusterSize(&mut cluster_size, self.cu_function, &cfg)
.result()?
};
Ok(cluster_size as u32)
}
/// Set the value of a specific attribute of this [CudaFunction].
pub fn set_attribute(
&self,
attribute: CUfunction_attribute_enum,
value: i32,
) -> Result<(), result::DriverError> {
unsafe {
result::function::set_function_attribute(self.cu_function, attribute, value)?;
}
Ok(())
}
}
unsafe impl Send for CudaFunction {}
unsafe impl Sync for CudaFunction {}
/// A wrapper around [sys::CUstream] that safely ensures the null stream is synchronized
/// upon the completion of this stream's work.
///
/// Create with [CudaDevice::fork_default_stream].
///
/// The synchronization happens in **code order**. E.g.
/// ```ignore
/// let stream = dev.fork_default_stream()?; // 0
/// dev.launch(...)?; // 1
/// dev.launch_on_stream(&stream, ...)?; // 2
/// dev.launch(...)?; // 3
/// drop(stream); // 4
/// dev.launch(...) // 5
/// ```
///
/// - 0 will place a streamWaitEvent(default work stream) on the new stream
/// - 1 will launch on the default work stream
/// - 2 will launch concurrently to 1 on `&stream`,
/// - 3 will launch after 1 on the default work stream, but potentially concurrently to 2.
/// - 4 will place a streamWaitEvent(`&stream`) on default work stream
/// - 5 will happen on the default stream **after the default stream waits for 2**
#[derive(Debug)]
pub struct CudaStream {
pub stream: sys::CUstream,
device: Arc<CudaDevice>,
}
impl CudaDevice {
/// Allocates a new stream that can execute kernels concurrently to the default stream.
///
/// The synchronization with default stream happens in **code order**. See [CudaStream] docstring.
///
/// This stream synchronizes in the following way:
/// 1. On creation it adds a wait for any existing work on the default work stream to complete
/// 2. On drop it adds a wait to the default stream for any existing work on Self to complete.
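///
/// A minimal fork/join sketch (assuming a loaded kernel `f`, a `cfg`, and parameter
/// tuples `params_a`/`params_b` are in scope):
/// ```ignore
/// let stream = dev.fork_default_stream()?;                // waits on existing default-stream work
/// unsafe { f.clone().launch(cfg, params_a)? };            // default stream
/// unsafe { f.launch_on_stream(&stream, cfg, params_b)? }; // concurrent stream
/// dev.wait_for(&stream)?;                                 // default stream waits for `stream`
/// dev.synchronize()?;
/// ```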
pub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, result::DriverError> {
self.bind_to_thread()?;
let stream = CudaStream {
stream: result::stream::create(result::stream::StreamKind::NonBlocking)?,
device: self.clone(),
};
stream.wait_for_default()?;
Ok(stream)
}
/// Causes the default work stream to block on the completion of `stream`'s queued work
/// (this is what [CudaStream]'s [Drop] impl calls).
/// **This is asynchronous with respect to the host.**
#[allow(unused_variables)]
pub fn wait_for(self: &Arc<Self>, stream: &CudaStream) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
unsafe {
result::event::record(self.event, stream.stream)?;
result::stream::wait_event(
self.stream,
self.event,
sys::CUevent_wait_flags::CU_EVENT_WAIT_DEFAULT,
)
}
}
}
impl CudaStream {
/// Records the current default stream's workload, and then causes `self`
/// to wait for the default stream to finish that recorded workload.
pub fn wait_for_default(&self) -> Result<(), result::DriverError> {
self.device.bind_to_thread()?;
unsafe {
result::event::record(self.device.event, self.device.stream)?;
result::stream::wait_event(
self.stream,
self.device.event,
sys::CUevent_wait_flags::CU_EVENT_WAIT_DEFAULT,
)
}
}
}
impl Drop for CudaStream {
fn drop(&mut self) {
self.device.wait_for(self).unwrap();
unsafe {
result::stream::destroy(self.stream).unwrap();
}
}
}
/// An immutable sub-view into a [CudaSlice] created by [CudaSlice::try_slice()].
#[derive(Debug)]
pub struct CudaView<'a, T> {
pub(crate) root: &'a sys::CUdeviceptr,
pub(crate) ptr: sys::CUdeviceptr,
pub(crate) len: usize,
marker: PhantomData<T>,
}
impl<T> CudaSlice<T> {
/// Creates a [CudaView] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice(&self, range: impl RangeBounds<usize>) -> CudaView<'_, T> {
self.try_slice(range).unwrap()
}
/// Fallible version of [CudaSlice::slice]
pub fn try_slice(&self, range: impl RangeBounds<usize>) -> Option<CudaView<'_, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaView {
root: &self.cu_device_ptr,
ptr: self.cu_device_ptr + (start * std::mem::size_of::<T>()) as u64,
len: end - start,
marker: PhantomData,
})
}
/// Reinterprets the slice of memory into a different type. `len` is the number
/// of elements of the new type `S` that are expected. If not enough bytes
/// are allocated in `self` for the view, then this returns `None`.
///
/// # Safety
/// This is unsafe because the memory backing the view may not be a valid interpretation
/// for the type `S`.
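///
/// For example, a 100-byte `CudaSlice<u8>` (`bytes` below) can be viewed as at most 25 `f32`s:
/// ```ignore
/// let floats = unsafe { bytes.transmute::<f32>(25) };   // Some(CudaView<f32>)
/// let too_many = unsafe { bytes.transmute::<f32>(26) }; // None
/// ```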
pub unsafe fn transmute<S>(&self, len: usize) -> Option<CudaView<'_, S>> {
(len * std::mem::size_of::<S>() <= self.num_bytes()).then_some(CudaView {
root: &self.cu_device_ptr,
ptr: self.cu_device_ptr,
len,
marker: PhantomData,
})
}
}
impl<'a, T> CudaView<'a, T> {
/// Creates a [CudaView] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice(&self, range: impl RangeBounds<usize>) -> CudaView<'a, T> {
self.try_slice(range).unwrap()
}
/// Fallible version of [CudaView::slice]
pub fn try_slice(&self, range: impl RangeBounds<usize>) -> Option<CudaView<'a, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaView {
root: self.root,
ptr: self.ptr + (start * std::mem::size_of::<T>()) as u64,
len: end - start,
marker: PhantomData,
})
}
}
/// A mutable sub-view into a [CudaSlice] created by [CudaSlice::try_slice_mut()].
#[derive(Debug)]
pub struct CudaViewMut<'a, T> {
pub(crate) root: &'a mut sys::CUdeviceptr,
pub(crate) ptr: sys::CUdeviceptr,
pub(crate) len: usize,
marker: PhantomData<T>,
}
impl<T> CudaSlice<T> {
/// Creates a [CudaViewMut] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice_mut(&mut self, range: impl RangeBounds<usize>) -> CudaViewMut<'_, T> {
self.try_slice_mut(range).unwrap()
}
/// Fallible version of [CudaSlice::slice_mut]
pub fn try_slice_mut(&mut self, range: impl RangeBounds<usize>) -> Option<CudaViewMut<'_, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaViewMut {
ptr: self.cu_device_ptr + (start * std::mem::size_of::<T>()) as u64,
root: &mut self.cu_device_ptr,
len: end - start,
marker: PhantomData,
})
}
/// Reinterprets the slice of memory into a different type. `len` is the number
/// of elements of the new type `S` that are expected. If not enough bytes
/// are allocated in `self` for the view, then this returns `None`.
///
/// # Safety
/// This is unsafe because the memory backing the view may not be a valid interpretation
/// for the type `S`.
pub unsafe fn transmute_mut<S>(&mut self, len: usize) -> Option<CudaViewMut<'_, S>> {
(len * std::mem::size_of::<S>() <= self.num_bytes()).then_some(CudaViewMut {
ptr: self.cu_device_ptr,
root: &mut self.cu_device_ptr,
len,
marker: PhantomData,
})
}
}
impl<'a, T> CudaViewMut<'a, T> {
/// Creates a [CudaView] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice<'b: 'a>(&'b self, range: impl RangeBounds<usize>) -> CudaView<'a, T> {
self.try_slice(range).unwrap()
}
/// Fallible version of [CudaViewMut::slice]
pub fn try_slice<'b: 'a>(&'b self, range: impl RangeBounds<usize>) -> Option<CudaView<'a, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaView {
root: self.root,
ptr: self.ptr + (start * std::mem::size_of::<T>()) as u64,
len: end - start,
marker: PhantomData,
})
}
/// Creates a [CudaViewMut] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice_mut<'b: 'a>(&'b mut self, range: impl RangeBounds<usize>) -> CudaViewMut<'a, T> {
self.try_slice_mut(range).unwrap()
}
/// Fallible version of [CudaViewMut::slice_mut]
pub fn try_slice_mut<'b: 'a>(
&'b mut self,
range: impl RangeBounds<usize>,
) -> Option<CudaViewMut<'a, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaViewMut {
ptr: self.ptr + (start * std::mem::size_of::<T>()) as u64,
root: self.root,
len: end - start,
marker: PhantomData,
})
}
}
trait RangeHelper: RangeBounds<usize> {
fn inclusive_start(&self, valid_start: usize) -> usize;
fn exclusive_end(&self, valid_end: usize) -> usize;
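/// Resolves `self` against the `valid` range, returning the concrete `(start, end)`
/// pair when `self` lies fully inside `valid` and is not reversed; empty exclusive
/// ranges like `2..2` are accepted (see the `test_bounds_helper` test below).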
fn bounds(&self, valid: impl RangeHelper) -> Option<(usize, usize)> {
let vs = valid.inclusive_start(0);
let ve = valid.exclusive_end(usize::MAX);
let s = self.inclusive_start(vs);
let e = self.exclusive_end(ve);
let inside = s >= vs && e <= ve;
let valid = s < e || (s == e && !matches!(self.end_bound(), Bound::Included(_)));
(inside && valid).then_some((s, e))
}
}
impl<R: RangeBounds<usize>> RangeHelper for R {
fn inclusive_start(&self, valid_start: usize) -> usize {
match self.start_bound() {
Bound::Included(n) => *n,
Bound::Excluded(n) => *n + 1,
Bound::Unbounded => valid_start,
}
}
fn exclusive_end(&self, valid_end: usize) -> usize {
match self.end_bound() {
Bound::Included(n) => *n + 1,
Bound::Excluded(n) => *n,
Bound::Unbounded => valid_end,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[allow(clippy::reversed_empty_ranges)]
fn test_bounds_helper() {
assert_eq!((..2usize).bounds(0..usize::MAX), Some((0, 2)));
assert_eq!((1..2usize).bounds(..usize::MAX), Some((1, 2)));
assert_eq!((..).bounds(1..10), Some((1, 10)));
assert_eq!((2..=2usize).bounds(0..usize::MAX), Some((2, 3)));
assert_eq!((2..=2usize).bounds(0..=1), None);
assert_eq!((2..2usize).bounds(0..usize::MAX), Some((2, 2)));
assert_eq!((1..0usize).bounds(0..usize::MAX), None);
assert_eq!((1..=0usize).bounds(0..usize::MAX), None);
}
#[test]
fn test_transmutes() {
let dev = CudaDevice::new(0).unwrap();
let mut slice = dev.alloc_zeros::<u8>(100).unwrap();
assert!(unsafe { slice.transmute::<f32>(25) }.is_some());
assert!(unsafe { slice.transmute::<f32>(26) }.is_none());
assert!(unsafe { slice.transmute_mut::<f32>(25) }.is_some());
assert!(unsafe { slice.transmute_mut::<f32>(26) }.is_none());
}
}
use crate::driver::sys;
use super::core::{CudaSlice, CudaView, CudaViewMut};
pub trait DeviceSlice<T> {
fn len(&self) -> usize;
fn num_bytes(&self) -> usize {
self.len() * std::mem::size_of::<T>()
}
fn is_empty(&self) -> bool {
self.len() == 0
}
}
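// A generic helper can be written against `DeviceSlice` without caring whether the
// argument is a `CudaSlice`, `CudaView`, or `CudaViewMut`. A hypothetical sketch
// (not part of the crate's API):
//
// fn total_bytes<T, S: DeviceSlice<T>>(buffers: &[S]) -> usize {
//     buffers.iter().map(|b| b.num_bytes()).sum()
// }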
impl<T> DeviceSlice<T> for CudaSlice<T> {
fn len(&self) -> usize {
self.len
}
}
impl<'a, T> DeviceSlice<T> for CudaView<'a, T> {
fn len(&self) -> usize {
self.len
}
}
impl<'a, T> DeviceSlice<T> for CudaViewMut<'a, T> {
fn len(&self) -> usize {
self.len
}
}
/// Abstraction over [CudaSlice]/[CudaView]
pub trait DevicePtr<T>: DeviceSlice<T> {
fn device_ptr(&self) -> &sys::CUdeviceptr;
}
impl<T> DevicePtr<T> for CudaSlice<T> {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.cu_device_ptr
}
}
impl<'a, T> DevicePtr<T> for CudaView<'a, T> {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.ptr
}
}
impl<'a, T> DevicePtr<T> for CudaViewMut<'a, T> {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.ptr
}
}
/// Abstraction over [CudaSlice]/[CudaViewMut]
pub trait DevicePtrMut<T>: DeviceSlice<T> {
fn device_ptr_mut(&mut self) -> &mut sys::CUdeviceptr;
}
impl<T> DevicePtrMut<T> for CudaSlice<T> {
fn device_ptr_mut(&mut self) -> &mut sys::CUdeviceptr {
&mut self.cu_device_ptr
}
}
impl<'a, T> DevicePtrMut<T> for CudaViewMut<'a, T> {
fn device_ptr_mut(&mut self) -> &mut sys::CUdeviceptr {
&mut self.ptr
}
}
use core::mem::ManuallyDrop;
use std::fs::File;
use std::ops::Range;
use std::sync::Arc;
use super::{CudaDevice, DevicePtr, DeviceSlice};
use crate::driver::{result, sys, DriverError};
impl CudaDevice {
/// Import external memory from a [`File`].
///
/// # Safety
/// `size` must be the size of the external memory in bytes.
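///
/// A minimal usage sketch (assumes `file` wraps an opaque handle exported by
/// another API, e.g. Vulkan, and `size` is the allocation size in bytes):
/// ```ignore
/// let ext = unsafe { dev.import_external_memory(file, size)? };
/// let buf = ext.map_all()?; // MappedBuffer, usable as a DevicePtr<u8>
/// ```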
#[cfg(any(unix, windows))]
pub unsafe fn import_external_memory(
self: &Arc<Self>,
file: File,
size: u64,
) -> Result<ExternalMemory, DriverError> {
self.bind_to_thread()?;
#[cfg(unix)]
let external_memory = unsafe {
use std::os::fd::AsRawFd;
result::external_memory::import_external_memory_opaque_fd(file.as_raw_fd(), size)
}?;
#[cfg(windows)]
let external_memory = unsafe {
use std::os::windows::io::AsRawHandle;
result::external_memory::import_external_memory_opaque_win32(file.as_raw_handle(), size)
}?;
Ok(ExternalMemory {
external_memory,
size,
device: self.clone(),
_file: ManuallyDrop::new(file),
})
}
}
/// An abstraction for imported external memory.
///
/// This struct can be created via [`CudaDevice::import_external_memory`].
/// The imported external memory will be destroyed when this struct is dropped.
#[derive(Debug)]
pub struct ExternalMemory {
external_memory: sys::CUexternalMemory,
size: u64,
device: Arc<CudaDevice>,
_file: ManuallyDrop<File>,
}
impl Drop for ExternalMemory {
fn drop(&mut self) {
self.device.bind_to_thread().unwrap();
unsafe { result::external_memory::destroy_external_memory(self.external_memory) }.unwrap();
// From [CUDA docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html#group__CUDA__EXTRES__INTEROP_1g52aba3a7f780157d8ba12972b2481735),
// when successfully importing UNIX file descriptor:
//
// > Ownership of the file descriptor is transferred to the CUDA driver when the handle is imported successfully.
// > Performing any operations on the file descriptor after it is imported results in undefined behavior.
//
// On the other hand, on Windows:
//
// > Ownership of this handle is not transferred to CUDA after the import operation,
// > so the application must release the handle using the appropriate system call.
//
// Therefore, we manually drop the file when we are on Windows.
#[cfg(windows)]
unsafe {
ManuallyDrop::<File>::drop(&mut self._file)
};
}
}
impl ExternalMemory {
/// Map the whole external memory to get a mapped buffer.
pub fn map_all(self) -> Result<MappedBuffer, DriverError> {
let size = self.size as usize;
self.map_range(0..size)
}
/// Map a range of the external memory to a mapped buffer.
///
/// Only one mapped buffer is allowed at a time.
/// This is more restrictive than it necessarily needs to be,
/// but it makes enforcing safety easier.
///
/// # Panics
/// This function will panic if the range is invalid,
/// such as when the start or end is larger than the size.
pub fn map_range(self, range: Range<usize>) -> Result<MappedBuffer, DriverError> {
assert!(range.start as u64 <= self.size);
assert!(range.end as u64 <= self.size);
let device_ptr = unsafe {
result::external_memory::get_mapped_buffer(
self.external_memory,
range.start as u64,
range.len() as u64,
)
}?;
Ok(MappedBuffer {
device_ptr,
len: range.len(),
external_memory: self,
})
}
}
/// An abstraction for a mapped buffer for some external memory.
///
/// This struct can be created via [`ExternalMemory::map_range`] or [`ExternalMemory::map_all`].
/// The underlying mapped buffer will be freed when this struct is dropped.
#[derive(Debug)]
pub struct MappedBuffer {
device_ptr: sys::CUdeviceptr,
len: usize,
external_memory: ExternalMemory,
}
impl Drop for MappedBuffer {
fn drop(&mut self) {
self.external_memory.device.bind_to_thread().unwrap();
unsafe { result::memory_free(self.device_ptr) }.unwrap()
}
}
impl DeviceSlice<u8> for MappedBuffer {
fn len(&self) -> usize {
self.len
}
}
impl DevicePtr<u8> for MappedBuffer {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.device_ptr
}
}
use crate::driver::{result, sys};
use super::alloc::DeviceRepr;
use super::core::{CudaDevice, CudaFunction, CudaModule, CudaStream};
use std::sync::Arc;
impl CudaDevice {
/// Whether a module and function are currently loaded into the device.
pub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool {
let modules = self.modules.read();
#[cfg(not(feature = "no-std"))]
let modules = modules.unwrap();
modules
.get(module_name)
.map_or(false, |module| module.has_func(func_name))
}
/// Retrieves a [CudaFunction] that was registered under `module_name` and `func_name`.
pub fn get_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> Option<CudaFunction> {
let modules = self.modules.read();
#[cfg(not(feature = "no-std"))]
let modules = modules.unwrap();
modules
.get(module_name)
.and_then(|m| m.get_func(func_name))
.map(|cu_function| CudaFunction {
cu_function,
device: self.clone(),
})
}
}
impl CudaModule {
/// Returns the function with `name`. If the function
/// was not already loaded into the CudaModule, then `None`
/// is returned.
pub(crate) fn get_func(&self, name: &str) -> Option<sys::CUfunction> {
self.functions.get(name).cloned()
}
pub(crate) fn has_func(&self, name: &str) -> bool {
self.functions.contains_key(name)
}
}
impl CudaFunction {
#[inline(always)]
unsafe fn launch_async_impl(
self,
cfg: LaunchConfig,
params: &mut [*mut std::ffi::c_void],
) -> Result<(), result::DriverError> {
self.device.bind_to_thread()?;
result::launch_kernel(
self.cu_function,
cfg.grid_dim,
cfg.block_dim,
cfg.shared_mem_bytes,
self.device.stream,
params,
)
}
#[inline(always)]
unsafe fn par_launch_async_impl(
self,
stream: &CudaStream,
cfg: LaunchConfig,
params: &mut [*mut std::ffi::c_void],
) -> Result<(), result::DriverError> {
self.device.bind_to_thread()?;
result::launch_kernel(
self.cu_function,
cfg.grid_dim,
cfg.block_dim,
cfg.shared_mem_bytes,
stream.stream,
params,
)
}
}
/// Configuration for [result::launch_kernel]
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15)
/// for description of each parameter.
#[derive(Clone, Copy, Debug)]
pub struct LaunchConfig {
/// (width, height, depth) of grid in blocks
pub grid_dim: (u32, u32, u32),
/// (x, y, z) dimension of each thread block
pub block_dim: (u32, u32, u32),
/// Dynamic shared-memory size per thread block in bytes
pub shared_mem_bytes: u32,
}
impl LaunchConfig {
/// Creates a [LaunchConfig] with:
/// - block_dim == `(1024, 1, 1)`
/// - grid_dim == `((n + 1023) / 1024, 1, 1)`
/// - shared_mem_bytes == `0`
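///
/// For example, `n = 2500` yields `grid_dim == (3, 1, 1)` (since `(2500 + 1023) / 1024 == 3`)
/// and `block_dim == (1024, 1, 1)`, covering 3072 threads.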
pub fn for_num_elems(n: u32) -> Self {
const NUM_THREADS: u32 = 1024;
let num_blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
Self {
grid_dim: (num_blocks, 1, 1),
block_dim: (NUM_THREADS, 1, 1),
shared_mem_bytes: 0,
}
}
}
/// Consumes a [CudaFunction] to execute asynchronously on the device with
/// params determined by generic parameter `Params`.
///
/// This is impl'd multiple times for different numbers and types of params. In
/// general, `Params` should impl [DeviceRepr].
///
/// ```ignore
/// # use cudarc::driver::*;
/// # let dev = CudaDevice::new(0).unwrap();
/// let my_kernel: CudaFunction = dev.get_func("my_module", "my_kernel").unwrap();
/// let cfg: LaunchConfig = LaunchConfig {
/// grid_dim: (1, 1, 1),
/// block_dim: (1, 1, 1),
/// shared_mem_bytes: 0,
/// };
/// let params = (1i32, 2u64, 3usize);
/// unsafe { my_kernel.launch(cfg, params) }.unwrap();
/// ```
///
/// # Safety
///
/// This is never really safe, because there's no guarantee that `Params`
/// will work for any [CudaFunction] passed in. Great care should be taken
/// to ensure that [CudaFunction] works with `Params` and that the correct
/// parameters have `&mut` in front of them.
///
/// Additionally, kernels can mutate data that is marked as immutable,
/// such as `&CudaSlice<T>`.
///
/// See [LaunchAsync::launch] for more details
pub unsafe trait LaunchAsync<Params> {
/// Launches the [CudaFunction] with the corresponding `Params`.
///
/// # Safety
///
/// This method is **very** unsafe.
///
/// See cuda documentation notes on this as well:
/// <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions>
///
/// 1. `params` can be changed regardless of `&` or `&mut` usage.
/// 2. `params` will be changed at some later point after the
/// function returns because the kernel is executed async.
/// 3. There are no guarantees that the `params`
/// are the correct number/types/order for `func`.
/// 4. Specifying the wrong values for [LaunchConfig] can result
/// in accessing/modifying values past memory limits.
///
/// ## Asynchronous mutation
///
/// Since this library queues kernels to be launched on a single
/// stream, and really the only way to modify [crate::driver::CudaSlice] is through
/// kernels, mutating the same [crate::driver::CudaSlice] with multiple kernels
/// is safe. This is because each kernel is executed sequentially
/// on the stream.
///
/// **Modifying a value on the host that is in use by a
/// kernel is undefined behavior.** But it is hard to do
/// accidentally.
///
/// Also for this reason, do not pass in any values to kernels
/// that can be modified on the host. This is the reason
/// [DeviceRepr] is not implemented for rust primitive
/// references.
///
/// ## Use after free
///
/// Since the drop implementation for [crate::driver::CudaSlice] also occurs
/// on the device's single stream, any kernels launched before
/// the drop will complete before the value is actually freed.
///
/// **If you launch a kernel or drop a value on a different stream
/// this may not hold**
unsafe fn launch(self, cfg: LaunchConfig, params: Params) -> Result<(), result::DriverError>;
/// Launch the function on a stream concurrent to the device's default
/// work stream.
///
/// # Safety
/// This method is even more unsafe than [LaunchAsync::launch], all the same rules apply,
/// except now things are executing in parallel to each other.
///
/// That means that if any of the kernels modify the same memory location, you'll get race
/// conditions or potentially undefined behavior.
unsafe fn launch_on_stream(
self,
stream: &CudaStream,
cfg: LaunchConfig,
params: Params,
) -> Result<(), result::DriverError>;
}
macro_rules! impl_launch {
([$($Vars:tt),*], [$($Idx:tt),*]) => {
unsafe impl<$($Vars: DeviceRepr),*> LaunchAsync<($($Vars, )*)> for CudaFunction {
#[inline(always)]
unsafe fn launch(
self,
cfg: LaunchConfig,
args: ($($Vars, )*)
) -> Result<(), result::DriverError> {
let params = &mut [$(args.$Idx.as_kernel_param(), )*];
self.launch_async_impl(cfg, params)
}
#[inline(always)]
unsafe fn launch_on_stream(
self,
stream: &CudaStream,
cfg: LaunchConfig,
args: ($($Vars, )*)
) -> Result<(), result::DriverError> {
let params = &mut [$(args.$Idx.as_kernel_param(), )*];
self.par_launch_async_impl(stream, cfg, params)
}
}
};
}
impl_launch!([A], [0]);
impl_launch!([A, B], [0, 1]);
impl_launch!([A, B, C], [0, 1, 2]);
impl_launch!([A, B, C, D], [0, 1, 2, 3]);
impl_launch!([A, B, C, D, E], [0, 1, 2, 3, 4]);
impl_launch!([A, B, C, D, E, F], [0, 1, 2, 3, 4, 5]);
impl_launch!([A, B, C, D, E, F, G], [0, 1, 2, 3, 4, 5, 6]);
impl_launch!([A, B, C, D, E, F, G, H], [0, 1, 2, 3, 4, 5, 6, 7]);
impl_launch!([A, B, C, D, E, F, G, H, I], [0, 1, 2, 3, 4, 5, 6, 7, 8]);
impl_launch!(
[A, B, C, D, E, F, G, H, I, J],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
);
impl_launch!(
[A, B, C, D, E, F, G, H, I, J, K],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
);
impl_launch!(
[A, B, C, D, E, F, G, H, I, J, K, L],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
);
#[cfg(test)]
mod tests {
use std::{time::Instant, vec::Vec};
use crate::{
driver::{DeviceSlice, DriverError},
nvrtc::compile_ptx_with_opts,
};
use super::*;
#[test]
fn test_mut_into_kernel_param_no_inc_rc() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f32; 1].to_vec()).unwrap();
let _r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
let _ = (&t).as_kernel_param();
assert_eq!(Arc::strong_count(&device), 3);
}
#[test]
fn test_ref_into_kernel_param_inc_rc() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f32; 1].to_vec()).unwrap();
let _r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
let _ = (&t).as_kernel_param();
assert_eq!(Arc::strong_count(&device), 3);
}
const SIN_CU: &str = "
extern \"C\" __global__ void sin_kernel(float *out, const float *inp, size_t numel) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < numel) {
out[i] = sin(inp[i]);
}
}";
#[test]
fn test_launch_with_mut_and_ref_cudarc() {
let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
let sin_kernel = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [-1.0f32, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8];
let a_dev = dev.htod_copy(a_host.clone().to_vec()).unwrap();
let mut b_dev = a_dev.clone();
unsafe {
sin_kernel.launch(
LaunchConfig::for_num_elems(10),
(&mut b_dev, &a_dev, 10usize),
)
}
.unwrap();
let b_host = dev.sync_reclaim(b_dev).unwrap();
for (a_i, b_i) in a_host.iter().zip(b_host.iter()) {
let expected = a_i.sin();
assert!((b_i - expected).abs() <= 1e-6);
}
drop(a_dev);
}
#[test]
fn test_large_launches() {
let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
for numel in [256, 512, 1024, 1280, 1536, 2048] {
let mut a = Vec::with_capacity(numel);
a.resize(numel, 1.0f32);
let a = dev.htod_copy(a).unwrap();
let mut b = dev.alloc_zeros::<f32>(numel).unwrap();
let sin_kernel = dev.get_func("sin", "sin_kernel").unwrap();
let cfg = LaunchConfig::for_num_elems(numel as u32);
unsafe { sin_kernel.launch(cfg, (&mut b, &a, numel)) }.unwrap();
let b = dev.sync_reclaim(b).unwrap();
for v in b {
assert_eq!(v, 0.841471);
}
}
}
#[test]
fn test_launch_with_views() {
let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
let a_host = [-1.0f32, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8];
let a_dev = dev.htod_copy(a_host.clone().to_vec()).unwrap();
let mut b_dev = a_dev.clone();
for i in 0..5 {
let a_sub = a_dev.try_slice(i * 2..).unwrap();
assert_eq!(a_sub.len, 10 - 2 * i);
let mut b_sub = b_dev.try_slice_mut(i * 2..).unwrap();
assert_eq!(b_sub.len, 10 - 2 * i);
let f = dev.get_func("sin", "sin_kernel").unwrap();
unsafe { f.launch(LaunchConfig::for_num_elems(2), (&mut b_sub, &a_sub, 2usize)) }
.unwrap();
}
let b_host = dev.sync_reclaim(b_dev).unwrap();
for (a_i, b_i) in a_host.iter().zip(b_host.iter()) {
let expected = a_i.sin();
assert!((b_i - expected).abs() <= 1e-6);
}
drop(a_dev);
}
const TEST_KERNELS: &str = "
extern \"C\" __global__ void int_8bit(signed char s_min, char s_max, unsigned char u_min, unsigned char u_max) {
assert(s_min == -128);
assert(s_max == 127);
assert(u_min == 0);
assert(u_max == 255);
}
extern \"C\" __global__ void int_16bit(signed short s_min, short s_max, unsigned short u_min, unsigned short u_max) {
assert(s_min == -32768);
assert(s_max == 32767);
assert(u_min == 0);
assert(u_max == 65535);
}
extern \"C\" __global__ void int_32bit(signed int s_min, int s_max, unsigned int u_min, unsigned int u_max) {
assert(s_min == -2147483648);
assert(s_max == 2147483647);
assert(u_min == 0);
assert(u_max == 4294967295);
}
extern \"C\" __global__ void int_64bit(signed long s_min, long s_max, unsigned long u_min, unsigned long u_max) {
assert(s_min == -9223372036854775808);
assert(s_max == 9223372036854775807);
assert(u_min == 0);
assert(u_max == 18446744073709551615);
}
extern \"C\" __global__ void floating(float f, double d) {
assert(fabs(f - 1.2345678) <= 1e-7);
assert(fabs(d - -10.123456789876543) <= 1e-16);
}
";
#[test]
fn test_launch_with_8bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_8bit"]).unwrap();
let f = dev.get_func("tests", "int_8bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i8::MIN, i8::MAX, u8::MIN, u8::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_16bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_16bit"]).unwrap();
let f = dev.get_func("tests", "int_16bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i16::MIN, i16::MAX, u16::MIN, u16::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_32bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_32bit"]).unwrap();
let f = dev.get_func("tests", "int_32bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i32::MIN, i32::MAX, u32::MIN, u32::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_64bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_64bit"]).unwrap();
let f = dev.get_func("tests", "int_64bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i64::MIN, i64::MAX, u64::MIN, u64::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_floats() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["floating"]).unwrap();
let f = dev.get_func("tests", "floating").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(1.2345678f32, -10.123456789876543f64),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[cfg(feature = "f16")]
const HALF_KERNELS: &str = "
#include \"cuda_fp16.h\"
extern \"C\" __global__ void halfs(__half h) {
assert(__habs(h - __float2half(1.234)) <= __float2half(1e-4));
}
";
#[cfg(feature = "f16")]
#[test]
fn test_launch_with_half() {
use crate::nvrtc::CompileOptions;
let ptx = compile_ptx_with_opts(
HALF_KERNELS,
CompileOptions {
include_paths: std::vec!["/usr/include".into()],
arch: Some("compute_53"),
..Default::default()
},
)
.unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["halfs"]).unwrap();
let f = dev.get_func("tests", "halfs").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(half::f16::from_f32(1.234),),
)
}
.unwrap();
dev.synchronize().unwrap();
}
const SLOW_KERNELS: &str = "
extern \"C\" __global__ void slow_worker(const float *data, const size_t len, float *out) {
float tmp = 0.0;
for(size_t i = 0; i < 1000000; i++) {
tmp += data[i % len];
}
*out = tmp;
}
";
#[test]
fn test_par_launch() -> Result<(), DriverError> {
let ptx = compile_ptx_with_opts(SLOW_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["slow_worker"]).unwrap();
let slice = dev.alloc_zeros::<f32>(1000)?;
let mut a = dev.alloc_zeros::<f32>(1)?;
let mut b = dev.alloc_zeros::<f32>(1)?;
let cfg = LaunchConfig::for_num_elems(1);
let start = Instant::now();
{
// launch two kernels on the default stream
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch(cfg, (&slice, slice.len(), &mut a))? };
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch(cfg, (&slice, slice.len(), &mut b))? };
dev.synchronize()?;
}
let double_launch_s = start.elapsed().as_secs_f64();
let start = Instant::now();
{
// create a new stream & launch them concurrently
let stream = dev.fork_default_stream()?;
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch(cfg, (&slice, slice.len(), &mut a))? };
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch_on_stream(&stream, cfg, (&slice, slice.len(), &mut b))? };
dev.wait_for(&stream)?;
dev.synchronize()?;
}
let par_launch_s = start.elapsed().as_secs_f64();
assert!(
(double_launch_s - 2.0 * par_launch_s).abs() < 20.0 / 100.0,
"par={:?} dbl={:?}",
par_launch_s,
double_launch_s
);
Ok(())
}
}
//! Safe abstractions over [crate::driver::result] provided by [CudaSlice], [CudaDevice], [CudaStream], and more.
pub(crate) mod alloc;
pub(crate) mod core;
pub(crate) mod device_ptr;
pub(crate) mod external_memory;
pub(crate) mod launch;
pub(crate) mod profile;
pub(crate) mod ptx;
pub(crate) mod threading;
pub use self::alloc::{DeviceRepr, ValidAsZeroBits};
pub use self::core::{CudaDevice, CudaFunction, CudaSlice, CudaStream, CudaView, CudaViewMut};
pub use self::device_ptr::{DevicePtr, DevicePtrMut, DeviceSlice};
pub use self::external_memory::{ExternalMemory, MappedBuffer};
pub use self::launch::{LaunchAsync, LaunchConfig};
pub use self::profile::{profiler_start, profiler_stop, Profiler};
pub use crate::driver::result::DriverError;
use crate::driver::{result, sys};
/// Calls [profiler_start()] in [Profiler::new()], and [profiler_stop()] in [Drop].
#[derive(Default)]
pub struct Profiler {}
impl Profiler {
/// Enables profile collection by the active profiling tool for the current context. If profiling is already enabled, then Profiler::new() has no effect.
/// More info in [Cuda docs](https://docs.nvidia.com/cuda/profiler-users-guide/)
/// ```no_run
/// use cudarc::driver::{Profiler};
/// # use cudarc::driver::result;
///
/// # fn run() -> Result<(), result::DriverError>{
/// {
/// let profiler = Profiler::new()?;
/// // Hotpath
/// // Profiler stops on drop
/// }
/// # Ok(())
/// # }
/// // Now check your results
/// // nsys profile -c cudaProfilerApi /path/to/bin
/// // And this will profile only the hotpath.
/// ```
///
pub fn new() -> Result<Self, result::DriverError> {
profiler_start()?;
Ok(Self {})
}
}
impl Drop for Profiler {
fn drop(&mut self) {
// We don't want to panic on drop.
profiler_stop().ok();
}
}
/// Enables profile collection by the active profiling tool for the current context. If profiling is already enabled, then profiler_start() has no effect.
/// More info in [Cuda docs](https://docs.nvidia.com/cuda/profiler-users-guide/)
/// For RAII version see [`Profiler::new`].
/// ```no_run
/// use cudarc::driver::{profiler_start, profiler_stop};
/// # use cudarc::driver::result;
///
/// # fn run() -> Result<(), result::DriverError>{
/// profiler_start()?;
/// // Hotpath
/// profiler_stop()?;
/// # Ok(())
/// # }
/// // Now check your results
/// // nsys profile -c cudaProfilerApi /path/to/bin
/// // And this will profile only the hotpath.
/// ```
///
pub fn profiler_start() -> Result<(), result::DriverError> {
unsafe { sys::cuProfilerStart() }.result()
}
/// Disables profile collection by the active profiling tool for the current context. If profiling is already disabled, then profiler_stop() has no effect.
pub fn profiler_stop() -> Result<(), result::DriverError> {
unsafe { sys::cuProfilerStop() }.result()
}
use crate::{
driver::result,
nvrtc::{Ptx, PtxKind},
};
use super::core::{CudaDevice, CudaModule};
use std::ffi::CString;
use std::{collections::BTreeMap, sync::Arc};
impl CudaDevice {
/// Dynamically load a set of [crate::driver::CudaFunction]s from a jit-compiled ptx.
///
/// - `ptx` contains the compiled ptx
/// - `module_name` is a unique identifier used to access the module later on with [CudaDevice::get_func()]
/// - `func_names` is a slice of function names to load into the module during build.
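///
/// A minimal usage sketch (module and kernel names below are illustrative, and the
/// example assumes the `nvrtc` feature for `crate::nvrtc::compile_ptx`):
/// ```no_run
/// # use cudarc::{driver::CudaDevice, nvrtc::compile_ptx};
/// let dev = CudaDevice::new(0).unwrap();
/// let ptx = compile_ptx("extern \"C\" __global__ void my_kernel() { }").unwrap();
/// dev.load_ptx(ptx, "my_module", &["my_kernel"]).unwrap();
/// let _f = dev.get_func("my_module", "my_kernel").unwrap();
/// ```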
pub fn load_ptx(
self: &Arc<Self>,
ptx: Ptx,
module_name: &str,
func_names: &[&'static str],
) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
let cu_module = match ptx.0 {
PtxKind::Image(image) => unsafe {
result::module::load_data(image.as_ptr() as *const _)
},
PtxKind::Src(src) => {
let c_src = CString::new(src).unwrap();
unsafe { result::module::load_data(c_src.as_ptr() as *const _) }
}
PtxKind::File(path) => {
let name_c = CString::new(path.to_str().unwrap()).unwrap();
result::module::load(name_c)
}
}?;
let mut functions = BTreeMap::new();
for &fn_name in func_names.iter() {
let fn_name_c = CString::new(fn_name).unwrap();
let cu_function = unsafe { result::module::get_function(cu_module, fn_name_c) }?;
functions.insert(fn_name, cu_function);
}
let module = CudaModule {
cu_module,
functions,
};
#[allow(unused_mut)]
{
let mut modules = self.modules.write();
#[cfg(not(feature = "no-std"))]
let mut modules = modules.unwrap();
modules.insert(module_name.into(), module);
}
Ok(())
}
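/// Like [CudaDevice::load_ptx()], but takes an already-compiled module image
/// (e.g. ptx/cubin bytes) and hands it directly to the driver's module-loading call
/// instead of going through a [Ptx] value.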
pub fn load_ptx_bin(
self: &Arc<Self>,
ptx: &[u8],
module_name: &str,
func_names: &[&'static str],
) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
let cu_module = unsafe {
result::module::load_data(ptx.as_ptr() as *const _)
}?;
let mut functions = BTreeMap::new();
for &fn_name in func_names.iter() {
let fn_name_c = CString::new(fn_name).unwrap();
let cu_function = unsafe { result::module::get_function(cu_module, fn_name_c) }?;
functions.insert(fn_name, cu_function);
}
let module = CudaModule {
cu_module,
functions,
};
#[allow(unused_mut)]
{
let mut modules = self.modules.write();
#[cfg(not(feature = "no-std"))]
let mut modules = modules.unwrap();
modules.insert(module_name.into(), module);
}
Ok(())
}
}
use super::{CudaDevice, DriverError};
use crate::driver::result;
impl CudaDevice {
/// Binds the device to the calling thread. You must call this before
/// using the device on a separate thread!
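///
/// A minimal sketch of the intended pattern (mirroring the test below):
/// ```no_run
/// # use cudarc::driver::CudaDevice;
/// let dev = CudaDevice::new(0).unwrap();
/// let dev2 = dev.clone();
/// std::thread::spawn(move || {
///     dev2.bind_to_thread().unwrap();
///     // the device can now be used from this thread
///     dev2.alloc_zeros::<f32>(10).unwrap();
/// });
/// ```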
pub fn bind_to_thread(&self) -> Result<(), DriverError> {
unsafe { result::ctx::set_current(self.cu_primary_ctx) }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::thread;
#[test]
fn test_threading() {
let dev1 = CudaDevice::new(0).unwrap();
let dev2 = dev1.clone();
let thread1 = thread::spawn(move || {
dev1.bind_to_thread()?;
dev1.alloc_zeros::<f32>(10)
});
let thread2 = thread::spawn(move || {
dev2.bind_to_thread()?;
dev2.alloc_zeros::<f32>(10)
});
let _: crate::driver::CudaSlice<f32> = thread1.join().unwrap().unwrap();
let _: crate::driver::CudaSlice<f32> = thread2.join().unwrap().unwrap();
}
}
#include "cuda.h"
#include "cudaProfiler.h"
//! Safe abstractions over:
//! 1. [CUDA driver API](https://docs.nvidia.com/cuda/cuda-driver-api/index.html)
//! 2. [NVRTC API](https://docs.nvidia.com/cuda/nvrtc/index.html)
//! 3. [cuRAND API](https://docs.nvidia.com/cuda/curand/index.html)
//! 4. [cuBLAS API](https://docs.nvidia.com/cuda/cublas/index.html)
//!
//! # crate organization
//!
//! Each of the modules for the above is organized into three levels:
//! 1. A `safe` module which provides safe abstractions over the `result` module
//! 2. A `result` which is a thin wrapper around the `sys` module to ensure all functions return [Result]
//! 3. A `sys` module which contains the raw FFI bindings
//!
//! | API | Safe | Result | Sys |
//! | --- | --- | --- | --- |
//! | driver | [driver::safe] | [driver::result] | [driver::sys] |
//! | cublas | [cublas::safe] | [cublas::result] | [cublas::sys] |
//! | cublaslt | [cublaslt::safe] | [cublaslt::result] | [cublaslt::sys] |
//! | nvrtc | [nvrtc::safe] | [nvrtc::result] | [nvrtc::sys] |
//! | curand | [curand::safe] | [curand::result] | [curand::sys] |
//! | cudnn | - | [cudnn::result] | [cudnn::sys] |
//!
//! # Core Concepts
//!
//! At the core is the [driver] API, which exposes a bunch of structs, but the main ones are:
//!
//! 1. [`driver::CudaDevice`] is a handle to a specific device ordinal (e.g. 0, 1, 2, ...)
//! 2. [`driver::CudaSlice<T>`], which represents a [`Vec<T>`] on the device, can be allocated
//! using the aforementioned CudaDevice.
//!
//! Here is a table of similar concepts between CPU and Cuda:
//!
//! | Concept | CPU | Cuda |
//! | --- | --- | --- |
//! | Memory allocator | [`std::alloc::GlobalAlloc`] | [`driver::CudaDevice`] |
//! | List of values on heap | [`Vec<T>`] | [`driver::CudaSlice<T>`] |
//! | Slice | `&[T]` | [`driver::CudaView<T>`] |
//! | Mutable Slice | `&mut [T]` | [`driver::CudaViewMut<T>`] |
//! | Function | [`Fn`] | [`driver::CudaFunction`] |
//! | Calling a function | `my_function(a, b, c)` | [`driver::LaunchAsync::launch()`] |
//! | Thread | [`std::thread::Thread`] | [`driver::CudaStream`] |
//!
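//! A minimal sketch tying these together (device ordinal and sizes are arbitrary):
//!
//! ```no_run
//! # use cudarc::driver::CudaDevice;
//! let dev = CudaDevice::new(0).unwrap();
//! // "Vec<T> on the device": allocate zeros, or copy a host Vec over
//! let _zeros = dev.alloc_zeros::<f32>(4).unwrap();
//! let data = dev.htod_copy(vec![1.0f32, 2.0, 3.0, 4.0]).unwrap();
//! // copy back to the host to inspect the results
//! let host: Vec<f32> = dev.dtoh_sync_copy(&data).unwrap();
//! assert_eq!(host, vec![1.0, 2.0, 3.0, 4.0]);
//! ```
//!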
//! # Combining the different APIs
//!
//! All the highest level apis have been designed to work together.
//!
//! ## nvrtc
//!
//! [`nvrtc::compile_ptx()`] outputs a [`nvrtc::Ptx`], which can
//! be loaded into a device with [`driver::CudaDevice::load_ptx()`].
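//!
//! A minimal sketch of that handoff (module and kernel names are illustrative):
//!
//! ```no_run
//! # use cudarc::{driver::CudaDevice, nvrtc::compile_ptx};
//! let ptx = compile_ptx("extern \"C\" __global__ void noop() { }").unwrap();
//! let dev = CudaDevice::new(0).unwrap();
//! dev.load_ptx(ptx, "kernels", &["noop"]).unwrap();
//! ```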
//!
//! ## cublas
//!
//! [cublas::CudaBlas] can perform gemm operations using [`cublas::Gemm<T>`],
//! and [`cublas::Gemv<T>`]. Both of these traits can generically accept memory
//! allocated by the driver in the form of: [`driver::CudaSlice<T>`],
//! [`driver::CudaView<T>`], and [`driver::CudaViewMut<T>`].
//!
//! ## curand
//!
//! [curand::CudaRng] can fill a [`driver::CudaSlice<T>`] with random data, based on
//! one of its available distributions.
//!
//! # Combining safe/result/sys
//!
//! The result and sys levels are largely interchangeable for each API. However,
//! the safe apis don't necessarily allow you to mix in the result level. This
//! is to encourage going through the safe API when possible.
//!
//! **If you need some functionality that isn't present in the safe api, please
//! open a ticket.**
#![cfg_attr(feature = "no-std", no_std)]
#[cfg(feature = "no-std")]
extern crate alloc;
#[cfg(feature = "no-std")]
extern crate no_std_compat as std;
#[cfg(feature = "cublas")]
pub mod cublas;
#[cfg(feature = "cublaslt")]
pub mod cublaslt;
#[cfg(feature = "cudnn")]
pub mod cudnn;
#[cfg(feature = "curand")]
pub mod curand;
#[cfg(feature = "driver")]
pub mod driver;
#[cfg(feature = "nccl")]
pub mod nccl;
#[cfg(feature = "nvrtc")]
pub mod nvrtc;
pub mod types;
#!/bin/bash
set -exu
BINDGEN_EXTRA_CLANG_ARGS="-D__CUDA_BF16_TYPES_EXIST__" \
bindgen \
--allowlist-type="^nccl.*" \
--allowlist-var="^nccl.*" \
--allowlist-function="^nccl.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--use-core \
wrapper.h -- -I/usr/local/cuda/include -I/usr/lib/gcc/x86_64-linux-gnu/12/include/ \
> sys.rs
//! Wrappers around the [NCCL API](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html)
//! in three levels. See crate documentation for description of each.
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;
//! A thin wrapper around [sys] providing [Result]s with [NcclError].
use super::sys::{self, ncclCommSplit, ncclGetVersion, ncclRedOpCreatePreMulSum, ncclRedOpDestroy};
use std::mem::MaybeUninit;
/// Wrapper around [sys::ncclResult_t].
/// See [NCCL docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html?ncclresult-t)
#[derive(Clone, PartialEq, Eq)]
pub struct NcclError(pub sys::ncclResult_t);
impl std::fmt::Debug for NcclError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "NcclError")
}
}
#[derive(Clone, PartialEq, Eq)]
pub enum NcclStatus {
Success,
InProgress,
NumResults,
}
impl sys::ncclResult_t {
/// Transforms into a [Result] of [NcclError]
pub fn result(self) -> Result<NcclStatus, NcclError> {
match self {
sys::ncclResult_t::ncclSuccess => Ok(NcclStatus::Success),
sys::ncclResult_t::ncclInProgress => Ok(NcclStatus::InProgress),
sys::ncclResult_t::ncclNumResults => Ok(NcclStatus::NumResults),
_ => Err(NcclError(self)),
}
}
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?c.ncclCommFinalize)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_finalize(comm: sys::ncclComm_t) -> Result<NcclStatus, NcclError> {
sys::ncclCommFinalize(comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommdestroy)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_destroy(comm: sys::ncclComm_t) -> Result<NcclStatus, NcclError> {
sys::ncclCommDestroy(comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommabort)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_abort(comm: sys::ncclComm_t) -> Result<NcclStatus, NcclError> {
sys::ncclCommAbort(comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommcount)
pub fn get_nccl_version() -> Result<::core::ffi::c_int, NcclError> {
let mut version: ::core::ffi::c_int = 0;
unsafe {
ncclGetVersion(&mut version).result()?;
}
Ok(version)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclgetuniqueid)
pub fn get_uniqueid() -> Result<sys::ncclUniqueId, NcclError> {
let mut uniqueid = MaybeUninit::uninit();
Ok(unsafe {
sys::ncclGetUniqueId(uniqueid.as_mut_ptr()).result()?;
uniqueid.assume_init()
})
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcomminitrankconfig)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_init_rank_config(
comm: *mut sys::ncclComm_t,
nranks: ::core::ffi::c_int,
comm_id: sys::ncclUniqueId,
rank: ::core::ffi::c_int,
config: *mut sys::ncclConfig_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclCommInitRankConfig(comm, nranks, comm_id, rank, config).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcomminitrank)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_init_rank(
comm: *mut sys::ncclComm_t,
nranks: ::core::ffi::c_int,
comm_id: sys::ncclUniqueId,
rank: ::core::ffi::c_int,
) -> Result<NcclStatus, NcclError> {
sys::ncclCommInitRank(comm, nranks, comm_id, rank).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcomminitall)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_init_all(
comm: *mut sys::ncclComm_t,
ndev: ::core::ffi::c_int,
devlist: *const ::core::ffi::c_int,
) -> Result<NcclStatus, NcclError> {
sys::ncclCommInitAll(comm, ndev, devlist).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommsplit)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_split(
comm: sys::ncclComm_t,
color: ::core::ffi::c_int,
key: ::core::ffi::c_int,
newcomm: *mut sys::ncclComm_t,
config: *mut sys::ncclConfig_t,
) -> Result<NcclStatus, NcclError> {
ncclCommSplit(comm, color, key, newcomm, config).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommcount)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_count(comm: sys::ncclComm_t) -> Result<::core::ffi::c_int, NcclError> {
let mut count = 0;
sys::ncclCommCount(comm, &mut count).result()?;
Ok(count)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommcudevice)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_cu_device(comm: sys::ncclComm_t) -> Result<::core::ffi::c_int, NcclError> {
let mut device = 0;
sys::ncclCommCuDevice(comm, &mut device).result()?;
Ok(device)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommuserrank)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_user_rank(comm: sys::ncclComm_t) -> Result<::core::ffi::c_int, NcclError> {
let mut rank = 0;
sys::ncclCommUserRank(comm, &mut rank).result()?;
Ok(rank)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html?c.ncclRedOpCreatePreMulSum)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn reduce_op_create_pre_mul_sum(
op: *mut sys::ncclRedOp_t,
scalar: *mut ::core::ffi::c_void,
datatype: sys::ncclDataType_t,
residence: sys::ncclScalarResidence_t,
comm: sys::ncclComm_t,
) -> Result<NcclStatus, NcclError> {
ncclRedOpCreatePreMulSum(op, scalar, datatype, residence, comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html?ncclredopdestroy)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn reduce_op_destroy(
op: sys::ncclRedOp_t,
comm: sys::ncclComm_t,
) -> Result<NcclStatus, NcclError> {
ncclRedOpDestroy(op, comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclreduce)
/// # Safety
/// User is in charge of sending valid pointers.
#[allow(clippy::too_many_arguments)]
pub unsafe fn reduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
op: sys::ncclRedOp_t,
root: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclbroadcast)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn broadcast(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
root: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclBroadcast(sendbuff, recvbuff, count, datatype, root, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclallreduce)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn all_reduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
op: sys::ncclRedOp_t,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclreducescatter)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn reduce_scatter(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
recvcount: usize,
datatype: sys::ncclDataType_t,
op: sys::ncclRedOp_t,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclallgather)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn all_gather(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
sendcount: usize,
datatype: sys::ncclDataType_t,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclAllGather(sendbuff, recvbuff, sendcount, datatype, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html?ncclsend)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn send(
sendbuff: *const ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
peer: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclSend(sendbuff, count, datatype, peer, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html?ncclrecv)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn recv(
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
peer: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclRecv(recvbuff, count, datatype, peer, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/group.html?c.ncclGroupEnd)
pub fn group_end() -> Result<NcclStatus, NcclError> {
unsafe { sys::ncclGroupEnd().result() }
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/group.html?ncclgroupstart)
pub fn group_start() -> Result<NcclStatus, NcclError> {
unsafe { sys::ncclGroupStart().result() }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::driver::CudaDevice;
use std::ffi::c_void;
#[test]
fn single_thread() {
let n_devices = CudaDevice::count().unwrap() as usize;
let n = 2;
let mut devs = vec![];
let mut sendslices = vec![];
let mut recvslices = vec![];
for i in 0..n_devices {
let dev = CudaDevice::new(i).unwrap();
let slice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
sendslices.push(slice);
let slice = dev.alloc_zeros::<f32>(n).unwrap();
recvslices.push(slice);
devs.push(dev);
}
let mut comms = vec![std::ptr::null_mut(); n_devices];
let ordinals: Vec<_> = devs.iter().map(|d| d.ordinal as i32).collect();
unsafe {
comm_init_all(comms.as_mut_ptr(), n_devices as i32, ordinals.as_ptr()).unwrap();
group_start().unwrap();
for i in 0..n_devices {
// Very important to set the cuda context to this device.
let dev = CudaDevice::new(i).unwrap();
all_reduce(
sendslices[i].cu_device_ptr as *const c_void,
recvslices[i].cu_device_ptr as *mut c_void,
n,
sys::ncclDataType_t::ncclFloat32,
sys::ncclRedOp_t::ncclSum,
comms[i],
dev.stream as sys::cudaStream_t,
)
.unwrap();
}
group_end().unwrap();
}
for (i, recv) in recvslices.iter().enumerate() {
// Get the current device context
let dev = CudaDevice::new(i).unwrap();
let out = dev.dtoh_sync_copy(recv).unwrap();
assert_eq!(out, vec![(n_devices * (n_devices + 1)) as f32 / 2.0; n]);
}
}
#[test]
fn multi_thread() {
let n_devices = CudaDevice::count().unwrap() as usize;
let n = 2;
let comm_id = get_uniqueid().unwrap();
let threads: Vec<_> = (0..n_devices)
.map(|i| {
let n_devices = n_devices.clone();
std::thread::spawn(move || {
let dev = CudaDevice::new(i).unwrap();
let sendslice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
let recvslice = dev.alloc_zeros::<f32>(n).unwrap();
let mut comm = MaybeUninit::uninit();
unsafe {
comm_init_rank(comm.as_mut_ptr(), n_devices as i32, comm_id, i as i32)
.unwrap();
let comm = comm.assume_init();
use std::ffi::c_void;
all_reduce(
sendslice.cu_device_ptr as *const c_void,
recvslice.cu_device_ptr as *mut c_void,
n,
sys::ncclDataType_t::ncclFloat32,
sys::ncclRedOp_t::ncclSum,
comm,
dev.stream as sys::cudaStream_t,
)
.unwrap();
}
})
})
.collect();
for t in threads {
t.join().unwrap();
}
}
}
use super::{result, sys};
use crate::driver::{CudaDevice, CudaSlice};
use std::mem::MaybeUninit;
use std::ptr;
use std::{sync::Arc, vec, vec::Vec};
pub use result::{group_end, group_start};
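/// Safe handle around an `ncclComm_t` communicator, bound to a single [CudaDevice]
/// and carrying this rank's position (`rank`) within a communicator of `world_size` ranks.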
#[derive(Debug)]
pub struct Comm {
comm: sys::ncclComm_t,
device: Arc<CudaDevice>,
rank: usize,
world_size: usize,
}
#[derive(Debug, Clone, Copy)]
pub struct Id {
id: sys::ncclUniqueId,
}
impl Id {
pub fn new() -> Result<Self, result::NcclError> {
let id = result::get_uniqueid()?;
Ok(Self { id })
}
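/// Reconstructs an [Id] from the raw bytes of another rank's id (e.g. received over
/// the network), as obtained from [Id::internal()].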
pub fn uninit(internal: [::core::ffi::c_char; 128usize]) -> Self {
let id = sys::ncclUniqueId { internal };
Self { id }
}
pub fn internal(&self) -> &[::core::ffi::c_char; 128usize] {
&self.id.internal
}
}
pub enum ReduceOp {
Sum,
Prod,
Max,
Min,
Avg,
}
fn convert_to_nccl_reduce_op(op: &ReduceOp) -> sys::ncclRedOp_t {
match op {
ReduceOp::Sum => sys::ncclRedOp_t::ncclSum,
ReduceOp::Prod => sys::ncclRedOp_t::ncclProd,
ReduceOp::Max => sys::ncclRedOp_t::ncclMax,
ReduceOp::Min => sys::ncclRedOp_t::ncclMin,
ReduceOp::Avg => sys::ncclRedOp_t::ncclAvg,
}
}
impl Drop for Comm {
fn drop(&mut self) {
// TODO(thenerdstation): Should we instead do finalize then destroy?
unsafe {
result::comm_abort(self.comm).expect("Error when aborting Comm.");
}
}
}
pub trait NcclType {
fn as_nccl_type() -> sys::ncclDataType_t;
}
macro_rules! define_nccl_type {
($t:ty, $nccl_type:expr) => {
impl NcclType for $t {
fn as_nccl_type() -> sys::ncclDataType_t {
$nccl_type
}
}
};
}
define_nccl_type!(f32, sys::ncclDataType_t::ncclFloat32);
define_nccl_type!(f64, sys::ncclDataType_t::ncclFloat64);
define_nccl_type!(i8, sys::ncclDataType_t::ncclInt8);
define_nccl_type!(i32, sys::ncclDataType_t::ncclInt32);
define_nccl_type!(i64, sys::ncclDataType_t::ncclInt64);
define_nccl_type!(u8, sys::ncclDataType_t::ncclUint8);
define_nccl_type!(u32, sys::ncclDataType_t::ncclUint32);
define_nccl_type!(u64, sys::ncclDataType_t::ncclUint64);
define_nccl_type!(char, sys::ncclDataType_t::ncclUint8);
#[cfg(feature = "f16")]
define_nccl_type!(half::f16, sys::ncclDataType_t::ncclFloat16);
#[cfg(feature = "f16")]
define_nccl_type!(half::bf16, sys::ncclDataType_t::ncclBfloat16);
impl Comm {
/// Primitive to create new communication link on a single thread.
/// WARNING: You are likely to get limited throughput using a single core
/// to control multiple GPUs.
/// ```
/// # use cudarc::driver::safe::{CudaDevice};
/// # use cudarc::nccl::safe::{Comm, ReduceOp, group_start, group_end};
/// let n = 2;
/// let n_devices = CudaDevice::count().unwrap() as usize;
/// let devices : Vec<_> = (0..n_devices).flat_map(CudaDevice::new).collect();
/// let comms = Comm::from_devices(devices).unwrap();
/// group_start().unwrap();
/// (0..n_devices).for_each(|i| {
/// let comm = &comms[i];
/// let dev = comm.device();
/// let slice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
/// let mut slice_receive = dev.alloc_zeros::<f32>(n).unwrap();
/// comm.all_reduce(&slice, &mut slice_receive, &ReduceOp::Sum)
/// .unwrap();
/// });
/// group_end().unwrap();
/// ```
pub fn from_devices(devices: Vec<Arc<CudaDevice>>) -> Result<Vec<Self>, result::NcclError> {
let n_devices = devices.len();
let mut comms = vec![std::ptr::null_mut(); n_devices];
let ordinals: Vec<_> = devices.iter().map(|d| d.ordinal as i32).collect();
unsafe {
result::comm_init_all(comms.as_mut_ptr(), n_devices as i32, ordinals.as_ptr())?;
}
let comms: Vec<Self> = comms
.into_iter()
.zip(devices.iter().cloned())
.enumerate()
.map(|(rank, (comm, device))| Self {
comm,
device,
rank,
world_size: n_devices,
})
.collect();
Ok(comms)
}
pub fn device(&self) -> Arc<CudaDevice> {
self.device.clone()
}
pub fn rank(&self) -> usize {
self.rank
}
pub fn world_size(&self) -> usize {
self.world_size
}
/// Primitive to create new communication link on each process (threads are possible but not
/// recommended).
///
/// WARNING: If using threads, you are likely to get limited throughput using a single core
/// to control multiple GPUs. Cuda drivers effectively use a global mutex, thrashing
/// performance on multi-threaded, multi-GPU setups.
/// ```
/// # use cudarc::driver::safe::{CudaDevice};
/// # use cudarc::nccl::safe::{Comm, Id, ReduceOp};
/// let n = 2;
/// let n_devices = 1; // This is to simplify this example.
/// // Spawn this only on rank 0
/// let id = Id::new().unwrap();
/// // Send id.internal() to other ranks
/// // let id = Id::uninit(id.internal().clone()); on other ranks
///
/// let rank = 0;
/// let dev = CudaDevice::new(rank).unwrap();
/// let comm = Comm::from_rank(dev.clone(), rank, n_devices, id).unwrap();
/// let slice = dev.htod_copy(vec![(rank + 1) as f32 * 1.0; n]).unwrap();
/// let mut slice_receive = dev.alloc_zeros::<f32>(n).unwrap();
/// comm.all_reduce(&slice, &mut slice_receive, &ReduceOp::Sum)
/// .unwrap();
/// let out = dev.dtoh_sync_copy(&slice_receive).unwrap();
/// assert_eq!(out, vec![(n_devices * (n_devices + 1)) as f32 / 2.0; n]);
/// ```
pub fn from_rank(
device: Arc<CudaDevice>,
rank: usize,
world_size: usize,
id: Id,
) -> Result<Self, result::NcclError> {
let mut comm = MaybeUninit::uninit();
let comm = unsafe {
result::comm_init_rank(
comm.as_mut_ptr(),
world_size
.try_into()
.expect("World_size cannot be casted to i32"),
id.id,
rank.try_into().expect("Rank cannot be cast to i32"),
)?;
comm.assume_init()
};
Ok(Self {
comm,
device,
rank,
world_size,
})
}
}
impl Comm {
pub fn send<T: NcclType>(
&self,
data: &CudaSlice<T>,
peer: i32,
) -> Result<(), result::NcclError> {
unsafe {
result::send(
data.cu_device_ptr as *mut _,
data.len,
T::as_nccl_type(),
peer,
self.comm,
self.device.stream as *mut _,
)?;
}
Ok(())
}
pub fn recv<T: NcclType>(
&self,
buff: &mut CudaSlice<T>,
peer: i32,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::recv(
buff.cu_device_ptr as *mut _,
buff.len,
T::as_nccl_type(),
peer,
self.comm,
self.device.stream as *mut _,
)
}
}
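/// Broadcasts `sendbuff` on the `root` rank into `recvbuff` on every rank.
/// Non-root ranks may pass `None` for `sendbuff`; a null send pointer is used in that case.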
pub fn broadcast<T: NcclType>(
&self,
sendbuff: &Option<CudaSlice<T>>,
recvbuff: &mut CudaSlice<T>,
root: i32,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
let send_ptr = match sendbuff {
Some(buffer) => buffer.cu_device_ptr as *mut _,
None => ptr::null(),
};
result::broadcast(
send_ptr,
recvbuff.cu_device_ptr as *mut _,
recvbuff.len,
T::as_nccl_type(),
root,
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn all_gather<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::all_gather(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
sendbuff.len,
T::as_nccl_type(),
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn all_reduce<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
reduce_op: &ReduceOp,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::all_reduce(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
sendbuff.len,
T::as_nccl_type(),
convert_to_nccl_reduce_op(reduce_op),
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn reduce<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
reduce_op: &ReduceOp,
root: i32,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::reduce(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
sendbuff.len,
T::as_nccl_type(),
convert_to_nccl_reduce_op(reduce_op),
root,
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn reduce_scatter<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
reduce_op: &ReduceOp,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::reduce_scatter(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
recvbuff.len,
T::as_nccl_type(),
convert_to_nccl_reduce_op(reduce_op),
self.comm,
self.device.stream as *mut _,
)
}
}
}
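/// Wraps a block in `result::group_start()` / `result::group_end()`, so collective calls
/// issued inside it are submitted as a single NCCL group. Note the macro body refers to
/// `result` unqualified, so `cudarc::nccl::result` must be in scope at the call site.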
#[macro_export]
macro_rules! group {
($x:block) => {
unsafe {
result::group_start().unwrap();
}
$x
unsafe {
result::group_end().unwrap();
}
};
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_all_reduce() {
let n = 2;
let n_devices = CudaDevice::count().unwrap() as usize;
let id = Id::new().unwrap();
let threads: Vec<_> = (0..n_devices)
.map(|i| {
println!("III {i}");
std::thread::spawn(move || {
println!("Within thread {i}");
let dev = CudaDevice::new(i).unwrap();
let comm = Comm::from_rank(dev.clone(), i, n_devices, id).unwrap();
let slice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
let mut slice_receive = dev.alloc_zeros::<f32>(n).unwrap();
comm.all_reduce(&slice, &mut slice_receive, &ReduceOp::Sum)
.unwrap();
let out = dev.dtoh_sync_copy(&slice_receive).unwrap();
assert_eq!(out, vec![(n_devices * (n_devices + 1)) as f32 / 2.0; n]);
})
})
.collect();
for t in threads {
t.join().unwrap()
}
}
}
/* automatically generated by rust-bindgen 0.66.1 */
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CUstream_st {
_unused: [u8; 0],
}
pub type cudaStream_t = *mut CUstream_st;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ncclComm {
_unused: [u8; 0],
}
pub type ncclComm_t = *mut ncclComm;
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct ncclUniqueId {
pub internal: [::core::ffi::c_char; 128usize],
}
#[test]
fn bindgen_test_layout_ncclUniqueId() {
const UNINIT: ::core::mem::MaybeUninit<ncclUniqueId> = ::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<ncclUniqueId>(),
128usize,
concat!("Size of: ", stringify!(ncclUniqueId))
);
assert_eq!(
::core::mem::align_of::<ncclUniqueId>(),
1usize,
concat!("Alignment of ", stringify!(ncclUniqueId))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).internal) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(ncclUniqueId),
"::",
stringify!(internal)
)
);
}
impl Default for ncclUniqueId {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclResult_t {
ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6,
ncclInProgress = 7,
ncclNumResults = 8,
}
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct ncclConfig_v21700 {
pub size: usize,
pub magic: ::core::ffi::c_uint,
pub version: ::core::ffi::c_uint,
pub blocking: ::core::ffi::c_int,
pub cgaClusterSize: ::core::ffi::c_int,
pub minCTAs: ::core::ffi::c_int,
pub maxCTAs: ::core::ffi::c_int,
pub netName: *const ::core::ffi::c_char,
pub splitShare: ::core::ffi::c_int,
}
#[test]
fn bindgen_test_layout_ncclConfig_v21700() {
const UNINIT: ::core::mem::MaybeUninit<ncclConfig_v21700> = ::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<ncclConfig_v21700>(),
48usize,
concat!("Size of: ", stringify!(ncclConfig_v21700))
);
assert_eq!(
::core::mem::align_of::<ncclConfig_v21700>(),
8usize,
concat!("Alignment of ", stringify!(ncclConfig_v21700))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(size)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).magic) as usize - ptr as usize },
8usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(magic)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).version) as usize - ptr as usize },
12usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(version)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).blocking) as usize - ptr as usize },
16usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(blocking)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).cgaClusterSize) as usize - ptr as usize },
20usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(cgaClusterSize)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).minCTAs) as usize - ptr as usize },
24usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(minCTAs)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).maxCTAs) as usize - ptr as usize },
28usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(maxCTAs)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).netName) as usize - ptr as usize },
32usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(netName)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).splitShare) as usize - ptr as usize },
40usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(splitShare)
)
);
}
impl Default for ncclConfig_v21700 {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type ncclConfig_t = ncclConfig_v21700;
extern "C" {
pub fn ncclGetVersion(version: *mut ::core::ffi::c_int) -> ncclResult_t;
}
extern "C" {
pub fn ncclGetUniqueId(uniqueId: *mut ncclUniqueId) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommInitRankConfig(
comm: *mut ncclComm_t,
nranks: ::core::ffi::c_int,
commId: ncclUniqueId,
rank: ::core::ffi::c_int,
config: *mut ncclConfig_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommInitRank(
comm: *mut ncclComm_t,
nranks: ::core::ffi::c_int,
commId: ncclUniqueId,
rank: ::core::ffi::c_int,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommInitAll(
comm: *mut ncclComm_t,
ndev: ::core::ffi::c_int,
devlist: *const ::core::ffi::c_int,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommFinalize(comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommDestroy(comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommAbort(comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommSplit(
comm: ncclComm_t,
color: ::core::ffi::c_int,
key: ::core::ffi::c_int,
newcomm: *mut ncclComm_t,
config: *mut ncclConfig_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclGetErrorString(result: ncclResult_t) -> *const ::core::ffi::c_char;
}
extern "C" {
pub fn ncclGetLastError(comm: ncclComm_t) -> *const ::core::ffi::c_char;
}
extern "C" {
pub fn ncclCommGetAsyncError(comm: ncclComm_t, asyncError: *mut ncclResult_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommCount(comm: ncclComm_t, count: *mut ::core::ffi::c_int) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommCuDevice(comm: ncclComm_t, device: *mut ::core::ffi::c_int) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommUserRank(comm: ncclComm_t, rank: *mut ::core::ffi::c_int) -> ncclResult_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclRedOp_dummy_t {
ncclNumOps_dummy = 5,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclRedOp_t {
ncclSum = 0,
ncclProd = 1,
ncclMax = 2,
ncclMin = 3,
ncclAvg = 4,
ncclNumOps = 5,
ncclMaxRedOp = 2147483647,
}
impl ncclDataType_t {
pub const ncclChar: ncclDataType_t = ncclDataType_t::ncclInt8;
}
impl ncclDataType_t {
pub const ncclInt: ncclDataType_t = ncclDataType_t::ncclInt32;
}
impl ncclDataType_t {
pub const ncclHalf: ncclDataType_t = ncclDataType_t::ncclFloat16;
}
impl ncclDataType_t {
pub const ncclFloat: ncclDataType_t = ncclDataType_t::ncclFloat32;
}
impl ncclDataType_t {
pub const ncclDouble: ncclDataType_t = ncclDataType_t::ncclFloat64;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclDataType_t {
ncclInt8 = 0,
ncclUint8 = 1,
ncclInt32 = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6,
ncclFloat32 = 7,
ncclFloat64 = 8,
ncclBfloat16 = 9,
ncclNumTypes = 10,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclScalarResidence_t {
ncclScalarDevice = 0,
ncclScalarHostImmediate = 1,
}
extern "C" {
pub fn ncclRedOpCreatePreMulSum(
op: *mut ncclRedOp_t,
scalar: *mut ::core::ffi::c_void,
datatype: ncclDataType_t,
residence: ncclScalarResidence_t,
comm: ncclComm_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclRedOpDestroy(op: ncclRedOp_t, comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclReduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
op: ncclRedOp_t,
root: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclBcast(
buff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
root: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclBroadcast(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
root: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclAllReduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
op: ncclRedOp_t,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclReduceScatter(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
recvcount: usize,
datatype: ncclDataType_t,
op: ncclRedOp_t,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclAllGather(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
sendcount: usize,
datatype: ncclDataType_t,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclSend(
sendbuff: *const ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
peer: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclRecv(
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
peer: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclGroupStart() -> ncclResult_t;
}
extern "C" {
pub fn ncclGroupEnd() -> ncclResult_t;
}
#include "nccl.h"
#!/bin/bash
set -exu
bindgen \
--whitelist-type="^nvrtc.*" \
--whitelist-function="^nvrtc.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs
//! Wrappers around the [Nvidia Runtime Compilation (nvrtc) API](https://docs.nvidia.com/cuda/nvrtc/index.html),
//! in three levels. See crate documentation for description of each.
//!
//! Call [compile_ptx()] or [compile_ptx_with_opts()].
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;