"vscode:/vscode.git/clone" did not exist on "b75e3bb963bbf3571bad45001b4dee693c93c0e2"
Commit 3ec8c534 authored by yongshk's avatar yongshk
Browse files

Initial commit

parents
use crate::driver::{result, sys};
use super::core::{CudaDevice, CudaSlice, CudaView, CudaViewMut};
use super::device_ptr::{DevicePtr, DevicePtrMut, DeviceSlice};
use std::{marker::Unpin, pin::Pin, sync::Arc, vec::Vec};
/// Something that can be copied to device memory and
/// turned into a parameter for [result::launch_kernel].
///
/// # Safety
///
/// This is unsafe to implement because the type should likely
/// be `#[repr(C)]` to have a well-defined layout in cuda memory,
/// and not all types are valid.
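///
/// A hypothetical sketch of opting in a custom `#[repr(C)]` struct
/// (not a type provided by this crate):
/// ```ignore
/// #[repr(C)]
/// struct MyParams { scale: f32, offset: f32 }
/// unsafe impl DeviceRepr for MyParams {}
/// ```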
pub unsafe trait DeviceRepr {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
self as *const Self as *mut _
}
}
unsafe impl DeviceRepr for bool {}
unsafe impl DeviceRepr for i8 {}
unsafe impl DeviceRepr for i16 {}
unsafe impl DeviceRepr for i32 {}
unsafe impl DeviceRepr for i64 {}
unsafe impl DeviceRepr for i128 {}
unsafe impl DeviceRepr for isize {}
unsafe impl DeviceRepr for u8 {}
unsafe impl DeviceRepr for u16 {}
unsafe impl DeviceRepr for u32 {}
unsafe impl DeviceRepr for u64 {}
unsafe impl DeviceRepr for u128 {}
unsafe impl DeviceRepr for usize {}
unsafe impl DeviceRepr for f32 {}
unsafe impl DeviceRepr for f64 {}
#[cfg(feature = "f16")]
unsafe impl DeviceRepr for half::f16 {}
#[cfg(feature = "f16")]
unsafe impl DeviceRepr for half::bf16 {}
unsafe impl<T: DeviceRepr> DeviceRepr for &mut CudaSlice<T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.cu_device_ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
unsafe impl<T: DeviceRepr> DeviceRepr for &CudaSlice<T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.cu_device_ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
unsafe impl<'a, T: DeviceRepr> DeviceRepr for &CudaView<'a, T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
unsafe impl<'a, T: DeviceRepr> DeviceRepr for &mut CudaViewMut<'a, T> {
#[inline(always)]
fn as_kernel_param(&self) -> *mut std::ffi::c_void {
(&self.ptr) as *const sys::CUdeviceptr as *mut std::ffi::c_void
}
}
impl<T> CudaSlice<T> {
/// Takes ownership of the underlying [sys::CUdeviceptr]. **It is up
/// to the owner to free this value**.
///
/// Drops the underlying host_buf if there is one.
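///
/// A minimal sketch of the leak/upgrade round trip (see
/// [CudaDevice::upgrade_device_ptr()]):
/// ```ignore
/// let a = dev.htod_copy(vec![1.0f32, 2.0, 3.0])?;
/// let ptr = a.leak(); // raw CUdeviceptr; no longer freed automatically
/// let b = unsafe { dev.upgrade_device_ptr::<f32>(ptr, 3) }; // re-owned, freed on drop
/// ```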
pub fn leak(mut self) -> sys::CUdeviceptr {
if let Some(host_buf) = std::mem::take(&mut self.host_buf) {
drop(host_buf);
}
let ptr = self.cu_device_ptr;
std::mem::forget(self);
ptr
}
}
impl CudaDevice {
/// Creates a [CudaSlice] from a [sys::CUdeviceptr]. Useful in conjunction with
/// [`CudaSlice::leak()`].
///
/// # Safety
/// - `cu_device_ptr` must be a valid allocation
/// - `cu_device_ptr` must have space for `len * std::mem::size_of::<T>()` bytes
/// - The memory may not be valid for type `T`, so some sort of memset operation
/// should be called on the memory.
pub unsafe fn upgrade_device_ptr<T>(
self: &Arc<Self>,
cu_device_ptr: sys::CUdeviceptr,
len: usize,
) -> CudaSlice<T> {
CudaSlice {
cu_device_ptr,
len,
device: self.clone(),
host_buf: None,
}
}
}
impl CudaDevice {
/// Allocates an empty [CudaSlice] with 0 length.
pub fn null<T>(self: &Arc<Self>) -> Result<CudaSlice<T>, result::DriverError> {
self.bind_to_thread()?;
let cu_device_ptr = unsafe {
if self.is_async {
result::malloc_async(self.stream, 0)?
} else {
result::malloc_sync(0)?
}
};
Ok(CudaSlice {
cu_device_ptr,
len: 0,
device: self.clone(),
host_buf: None,
})
}
/// Allocates device memory and increments the reference counter of [CudaDevice].
///
/// # Safety
/// This is unsafe because the device memory is uninitialized after this call.
pub unsafe fn alloc<T: DeviceRepr>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, result::DriverError> {
self.bind_to_thread()?;
let cu_device_ptr = if self.is_async {
result::malloc_async(self.stream, len * std::mem::size_of::<T>())?
} else {
result::malloc_sync(len * std::mem::size_of::<T>())?
};
Ok(CudaSlice {
cu_device_ptr,
len,
device: self.clone(),
host_buf: None,
})
}
/// Allocates device memory with no associated host memory, and memsets
/// the device memory to all 0s.
///
/// # Safety
/// 1. `T` is marked as [ValidAsZeroBits], so the device memory is valid to use
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn alloc_zeros<T: ValidAsZeroBits + DeviceRepr>(
self: &Arc<Self>,
len: usize,
) -> Result<CudaSlice<T>, result::DriverError> {
let mut dst = unsafe { self.alloc(len) }?;
self.memset_zeros(&mut dst)?;
Ok(dst)
}
/// Sets all memory to 0 asynchronously.
///
/// # Safety
/// 1. `T` is marked as [ValidAsZeroBits], so the device memory is valid to use
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn memset_zeros<T: ValidAsZeroBits + DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
dst: &mut Dst,
) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
if self.is_async {
unsafe {
result::memset_d8_async(*dst.device_ptr_mut(), 0, dst.num_bytes(), self.stream)
}
} else {
unsafe { result::memset_d8_sync(*dst.device_ptr_mut(), 0, dst.num_bytes()) }
}
}
/// Device to device copy (safe version of [result::memcpy_dtod_async]).
///
/// # Panics
///
/// If the length of the two values are different
///
/// # Safety
/// 1. We are guaranteed that `src` and `dst` are pointers to the same underlying
/// type `T`
/// 2. Since they are both references, they can't have been freed
/// 3. Self is [`Arc<Self>`], and this method increments the rc for self
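///
/// A minimal usage sketch (assuming both buffers already exist and have equal length):
/// ```ignore
/// let src = dev.htod_copy(vec![1.0f32, 2.0])?;
/// let mut dst = dev.alloc_zeros::<f32>(2)?;
/// dev.dtod_copy(&src, &mut dst)?;
/// ```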
pub fn dtod_copy<T: DeviceRepr, Src: DevicePtr<T>, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut Dst,
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
self.bind_to_thread()?;
if self.is_async {
unsafe {
result::memcpy_dtod_async(
*dst.device_ptr_mut(),
*src.device_ptr(),
src.len() * std::mem::size_of::<T>(),
self.stream,
)
}
} else {
unsafe {
result::memcpy_dtod_sync(
*dst.device_ptr_mut(),
*src.device_ptr(),
src.len() * std::mem::size_of::<T>(),
)
}
}
}
/// Takes ownership of the host data and copies it to device data asynchronously.
///
/// # Safety
///
/// 1. Since `src` is owned by this function, it is safe to copy data. Any actions executed
/// after this will take place after the data has been successfully copied.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_copy<T: Unpin + DeviceRepr>(
self: &Arc<Self>,
src: Vec<T>,
) -> Result<CudaSlice<T>, result::DriverError> {
let mut dst = unsafe { self.alloc(src.len()) }?;
self.htod_copy_into(src, &mut dst)?;
Ok(dst)
}
/// Takes ownership of the host data and copies it to device data asynchronously.
///
/// # Safety
///
/// 1. Since `src` is owned by this function, it is safe to copy data. Any actions executed
/// after this will take place after the data has been successfully copied.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_copy_into<T: DeviceRepr + Unpin>(
self: &Arc<Self>,
src: Vec<T>,
dst: &mut CudaSlice<T>,
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
dst.host_buf = Some(Pin::new(src));
self.bind_to_thread()?;
if self.is_async {
unsafe {
result::memcpy_htod_async(
dst.cu_device_ptr,
dst.host_buf.as_ref().unwrap(),
self.stream,
)
}?
} else {
unsafe { result::memcpy_htod_sync(dst.cu_device_ptr, dst.host_buf.as_ref().unwrap()) }?
}
Ok(())
}
/// Allocates new device memory and synchronously copies data from `src` into the new allocation.
///
/// If you want an asynchronous copy, see [CudaDevice::htod_copy()].
///
/// # Safety
///
/// 1. Since this function doesn't own `src`, it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_sync_copy<T: DeviceRepr>(
self: &Arc<Self>,
src: &[T],
) -> Result<CudaSlice<T>, result::DriverError> {
let mut dst = unsafe { self.alloc(src.len()) }?;
self.htod_sync_copy_into(src, &mut dst)?;
Ok(dst)
}
/// Synchronously copies data from `src` into the new allocation.
///
/// If you want an asynchronous copy, see [CudaDevice::htod_copy()].
///
/// # Panics
///
/// If the lengths of slices are not equal, this method panics.
///
/// # Safety
/// 1. Since this function doesn't own `src`, it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn htod_sync_copy_into<T: DeviceRepr, Dst: DevicePtrMut<T>>(
self: &Arc<Self>,
src: &[T],
dst: &mut Dst,
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
self.bind_to_thread()?;
if self.is_async {
unsafe { result::memcpy_htod_async(*dst.device_ptr_mut(), src, self.stream) }?;
} else {
unsafe { result::memcpy_htod_sync(*dst.device_ptr_mut(), src) }?;
}
self.synchronize()
}
/// Synchronously copies device memory into host memory.
/// Unlike [`CudaDevice::dtoh_sync_copy_into`], this returns a [`Vec<T>`].
///
/// # Safety
/// 1. Since this function doesn't own `dst` (after returning), it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
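///
/// A minimal host/device round-trip sketch:
/// ```ignore
/// let host = vec![1.0f32, 2.0, 3.0];
/// let dev_buf = dev.htod_sync_copy(&host)?; // host -> device
/// let back = dev.dtoh_sync_copy(&dev_buf)?; // device -> host, Vec<f32>
/// assert_eq!(back, host);
/// ```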
#[allow(clippy::uninit_vec)]
pub fn dtoh_sync_copy<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
) -> Result<Vec<T>, result::DriverError> {
let mut dst = Vec::with_capacity(src.len());
unsafe { dst.set_len(src.len()) };
self.dtoh_sync_copy_into(src, &mut dst)?;
Ok(dst)
}
/// Synchronously copies device memory into host memory
///
/// Use [`CudaDevice::dtoh_sync_copy`] if you need [`Vec<T>`] and can't provide
/// a correctly sized slice.
///
/// # Panics
///
/// If the lengths of slices are not equal, this method panics.
///
/// # Safety
/// 1. Since this function doesn't own `dst`, it is executed synchronously.
/// 2. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn dtoh_sync_copy_into<T: DeviceRepr, Src: DevicePtr<T>>(
self: &Arc<Self>,
src: &Src,
dst: &mut [T],
) -> Result<(), result::DriverError> {
assert_eq!(src.len(), dst.len());
self.bind_to_thread()?;
if self.is_async {
unsafe { result::memcpy_dtoh_async(dst, *src.device_ptr(), self.stream) }?;
} else {
unsafe { result::memcpy_dtoh_sync(dst, *src.device_ptr()) }?;
}
self.synchronize()
}
/// Synchronously de-allocates `src` and converts it into its host value.
/// You can just [drop] the slice if you don't need the host data.
///
/// # Safety
/// 1. Self is [`Arc<Self>`], and this method increments the rc for self
pub fn sync_reclaim<T: Clone + Default + DeviceRepr + Unpin>(
self: &Arc<Self>,
mut src: CudaSlice<T>,
) -> Result<Vec<T>, result::DriverError> {
let buf = src.host_buf.take();
let mut buf = buf.unwrap_or_else(|| {
let mut b = Vec::with_capacity(src.len);
b.resize(src.len, Default::default());
Pin::new(b)
});
self.dtoh_sync_copy_into(&src, &mut buf)?;
Ok(Pin::into_inner(buf))
}
/// Synchronizes the stream.
pub fn synchronize(self: &Arc<Self>) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
unsafe { result::stream::synchronize(self.stream) }
}
}
/// Marker trait to indicate that the type is valid
/// when all of its bits are set to 0.
///
/// # Safety
/// Not all types are valid when all bits are set to 0.
/// Be very sure when implementing this trait!
pub unsafe trait ValidAsZeroBits {}
unsafe impl ValidAsZeroBits for bool {}
unsafe impl ValidAsZeroBits for i8 {}
unsafe impl ValidAsZeroBits for i16 {}
unsafe impl ValidAsZeroBits for i32 {}
unsafe impl ValidAsZeroBits for i64 {}
unsafe impl ValidAsZeroBits for i128 {}
unsafe impl ValidAsZeroBits for isize {}
unsafe impl ValidAsZeroBits for u8 {}
unsafe impl ValidAsZeroBits for u16 {}
unsafe impl ValidAsZeroBits for u32 {}
unsafe impl ValidAsZeroBits for u64 {}
unsafe impl ValidAsZeroBits for u128 {}
unsafe impl ValidAsZeroBits for usize {}
unsafe impl ValidAsZeroBits for f32 {}
unsafe impl ValidAsZeroBits for f64 {}
#[cfg(feature = "f16")]
unsafe impl ValidAsZeroBits for half::f16 {}
#[cfg(feature = "f16")]
unsafe impl ValidAsZeroBits for half::bf16 {}
unsafe impl<T: ValidAsZeroBits, const M: usize> ValidAsZeroBits for [T; M] {}
/// Implement `ValidAsZeroBits` for tuples if all elements are `ValidAsZeroBits`,
///
/// # Note
/// This will also implement `ValidAsZeroBits` for a tuple with one element
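///
/// For example, `impl_tuples!(A, B)` recursively expands to (a sketch of the expansion):
/// ```ignore
/// unsafe impl<A: ValidAsZeroBits, B: ValidAsZeroBits> ValidAsZeroBits for (A, B) {}
/// unsafe impl<B: ValidAsZeroBits> ValidAsZeroBits for (B,) {}
/// ```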
macro_rules! impl_tuples {
($t:tt) => {
impl_tuples!(@ $t);
};
// the $l is in front of the repetition to prevent parsing ambiguities
($l:tt $(,$t:tt)+) => {
impl_tuples!($($t),+);
impl_tuples!(@ $l $(,$t)+);
};
(@ $($t:tt),+) => {
unsafe impl<$($t: ValidAsZeroBits,)+> ValidAsZeroBits for ($($t,)+) {}
};
}
impl_tuples!(A, B, C, D, E, F, G, H, I, J, K, L);
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_post_build_arc_count() {
let device = CudaDevice::new(0).unwrap();
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_alloc_arc_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.alloc_zeros::<f32>(1).unwrap();
assert!(t.host_buf.is_none());
assert_eq!(Arc::strong_count(&device), 2);
}
#[test]
fn test_post_take_arc_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f32; 5].to_vec()).unwrap();
assert!(t.host_buf.is_some());
assert_eq!(Arc::strong_count(&device), 2);
drop(t);
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_clone_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f64; 10].to_vec()).unwrap();
let r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
drop(t);
assert_eq!(Arc::strong_count(&device), 2);
drop(r);
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_clone_arc_slice_counts() {
let device = CudaDevice::new(0).unwrap();
let t = Arc::new(device.htod_copy::<f64>([0.0; 10].to_vec()).unwrap());
let r = t.clone();
assert_eq!(Arc::strong_count(&device), 2);
drop(t);
assert_eq!(Arc::strong_count(&device), 2);
drop(r);
assert_eq!(Arc::strong_count(&device), 1);
}
#[test]
fn test_post_release_counts() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([1.0f32, 2.0, 3.0].to_vec()).unwrap();
#[allow(clippy::redundant_clone)]
let r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
let r_host = device.sync_reclaim(r).unwrap();
assert_eq!(&r_host, &[1.0, 2.0, 3.0]);
assert_eq!(Arc::strong_count(&device), 2);
drop(r_host);
assert_eq!(Arc::strong_count(&device), 2);
}
#[test]
#[ignore = "must be executed by itself"]
fn test_post_alloc_memory() {
let device = CudaDevice::new(0).unwrap();
let (free1, total1) = result::mem_get_info().unwrap();
let t = device.htod_copy([0.0f32; 5].to_vec()).unwrap();
let (free2, total2) = result::mem_get_info().unwrap();
assert_eq!(total1, total2);
assert!(free2 < free1);
drop(t);
device.synchronize().unwrap();
let (free3, total3) = result::mem_get_info().unwrap();
assert_eq!(total2, total3);
assert!(free3 > free2);
assert_eq!(free3, free1);
}
#[test]
fn test_device_copy_to_views() {
let dev = CudaDevice::new(0).unwrap();
let smalls = [
dev.htod_copy(std::vec![-1.0f32, -0.8]).unwrap(),
dev.htod_copy(std::vec![-0.6, -0.4]).unwrap(),
dev.htod_copy(std::vec![-0.2, 0.0]).unwrap(),
dev.htod_copy(std::vec![0.2, 0.4]).unwrap(),
dev.htod_copy(std::vec![0.6, 0.8]).unwrap(),
];
let mut big = dev.alloc_zeros::<f32>(10).unwrap();
let mut offset = 0;
for small in smalls.iter() {
let mut sub = big.try_slice_mut(offset..offset + small.len()).unwrap();
dev.dtod_copy(small, &mut sub).unwrap();
offset += small.len();
}
assert_eq!(
dev.sync_reclaim(big).unwrap(),
[-1.0, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
);
}
#[test]
fn test_leak_and_upgrade() {
let dev = CudaDevice::new(0).unwrap();
let a = dev
.htod_copy(std::vec![1.0f32, 2.0, 3.0, 4.0, 5.0])
.unwrap();
let ptr = a.leak();
let b = unsafe { dev.upgrade_device_ptr::<f32>(ptr, 3) };
assert_eq!(dev.dtoh_sync_copy(&b).unwrap(), &[1.0, 2.0, 3.0]);
let ptr = b.leak();
let c = unsafe { dev.upgrade_device_ptr::<f32>(ptr, 5) };
assert_eq!(dev.dtoh_sync_copy(&c).unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0]);
}
/// See https://github.com/coreylowman/cudarc/issues/160
#[test]
fn test_slice_is_freed_with_correct_context() {
if CudaDevice::count().unwrap() < 2 {
return;
}
let dev0 = CudaDevice::new(0).unwrap();
let slice = dev0.htod_copy(vec![1.0; 10]).unwrap();
let dev1 = CudaDevice::new(1).unwrap();
drop(dev1);
drop(dev0);
drop(slice);
}
/// See https://github.com/coreylowman/cudarc/issues/161
#[test]
fn test_copy_uses_correct_context() {
if CudaDevice::count().unwrap() < 2 {
return;
}
let dev0 = CudaDevice::new(0).unwrap();
let _dev1 = CudaDevice::new(1).unwrap();
let slice = dev0.htod_copy(vec![1.0; 10]).unwrap();
let _out = dev0.dtoh_sync_copy(&slice).unwrap();
}
}
use crate::driver::{
result,
sys::{self, CUfunction_attribute_enum},
};
use super::{alloc::DeviceRepr, device_ptr::DeviceSlice};
use std::{
marker::PhantomData,
ops::{Bound, RangeBounds},
string::String,
};
#[cfg(feature = "no-std")]
use spin::RwLock;
#[cfg(not(feature = "no-std"))]
use std::sync::RwLock;
use std::{collections::BTreeMap, marker::Unpin, pin::Pin, sync::Arc, vec::Vec};
/// A wrapper around [sys::CUdevice], [sys::CUcontext], [sys::CUstream],
/// and [CudaFunction].
///
/// ```rust
/// # use cudarc::driver::CudaDevice;
/// let dev = CudaDevice::new(0).unwrap();
/// ```
///
/// # Safety
/// 1. impl [Drop] to call all the corresponding resource cleanup methods
/// 2. Doesn't impl clone, so you can't have multiple device pointers
/// hanging around.
/// 3. Any allocations enforce that self is an [Arc], meaning no allocation
/// can outlive the [CudaDevice]
#[derive(Debug)]
pub struct CudaDevice {
pub(crate) cu_device: sys::CUdevice,
pub(crate) cu_primary_ctx: sys::CUcontext,
/// The stream that all work is executed on.
pub(crate) stream: sys::CUstream,
/// Used to synchronize with stream
pub(crate) event: sys::CUevent,
pub(crate) modules: RwLock<BTreeMap<String, CudaModule>>,
pub(crate) ordinal: usize,
pub(crate) is_async: bool,
}
unsafe impl Send for CudaDevice {}
unsafe impl Sync for CudaDevice {}
impl CudaDevice {
/// Creates a new [CudaDevice] on device index `ordinal`.
pub fn new(ordinal: usize) -> Result<Arc<Self>, result::DriverError> {
result::init().unwrap();
let cu_device = result::device::get(ordinal as i32).unwrap();
// primary context initialization, can fail with OOM
let cu_primary_ctx = unsafe { result::primary_ctx::retain(cu_device) }?;
unsafe { result::ctx::set_current(cu_primary_ctx) }.unwrap();
// can fail with OOM
let event = result::event::create(sys::CUevent_flags::CU_EVENT_DISABLE_TIMING)?;
let value = unsafe {
result::device::get_attribute(
cu_device,
sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
)?
};
let is_async = value > 0;
let device = CudaDevice {
cu_device,
cu_primary_ctx,
stream: std::ptr::null_mut(),
event,
modules: RwLock::new(BTreeMap::new()),
ordinal,
is_async,
};
Ok(Arc::new(device))
}
pub fn count() -> Result<i32, result::DriverError> {
result::init().unwrap();
result::device::get_count()
}
/// Get the `ordinal` index of this [CudaDevice].
pub fn ordinal(&self) -> usize {
self.ordinal
}
/// Get the underlying [sys::CUdevice] of this [CudaDevice].
///
/// # Safety
/// While this function is marked as safe, actually using the
/// returned object is unsafe.
///
/// **You must not free/release the device pointer**, as it is still
/// owned by the [CudaDevice].
pub fn cu_device(&self) -> &sys::CUdevice {
&self.cu_device
}
/// Get the underlying [sys::CUcontext] of this [CudaDevice].
///
/// # Safety
/// While this function is marked as safe, actually using the
/// returned object is unsafe.
///
/// **You must not free/release the context pointer**, as it is still
/// owned by the [CudaDevice].
pub fn cu_primary_ctx(&self) -> &sys::CUcontext {
&self.cu_primary_ctx
}
/// Get the underlying [sys::CUstream] that this [CudaDevice] executes
/// all of its work on.
///
/// # Safety
/// While this function is marked as safe, actually using the
/// returned object is unsafe.
///
/// **You must not free/release the stream pointer**, as it is still
/// owned by the [CudaDevice].
pub fn cu_stream(&self) -> &sys::CUstream {
&self.stream
}
/// Get the value of the specified attribute of this [CudaDevice].
pub fn attribute(&self, attrib: sys::CUdevice_attribute) -> Result<i32, result::DriverError> {
unsafe { result::device::get_attribute(self.cu_device, attrib) }
}
}
impl Drop for CudaDevice {
fn drop(&mut self) {
self.bind_to_thread().unwrap();
let modules = RwLock::get_mut(&mut self.modules);
#[cfg(not(feature = "no-std"))]
let modules = modules.unwrap();
for (_, module) in modules.iter() {
unsafe { result::module::unload(module.cu_module) }.unwrap();
}
modules.clear();
let stream = std::mem::replace(&mut self.stream, std::ptr::null_mut());
if !stream.is_null() {
unsafe { result::stream::destroy(stream) }.unwrap();
}
let event = std::mem::replace(&mut self.event, std::ptr::null_mut());
if !event.is_null() {
unsafe { result::event::destroy(event) }.unwrap();
}
let ctx = std::mem::replace(&mut self.cu_primary_ctx, std::ptr::null_mut());
if !ctx.is_null() {
unsafe { result::primary_ctx::release(self.cu_device) }.unwrap();
}
}
}
/// Contains a reference counted pointer to both
/// device and host memory allocated for type `T`.
///
/// # Host data
///
/// *This owns the host data it is associated with*. However
/// it is possible to create device memory without having
/// a corresponding host memory, so the host memory is
/// actually [Option].
///
/// # Reclaiming host data
///
/// To reclaim the host data for this device data,
/// use [CudaDevice::sync_reclaim()]. This will
/// perform necessary synchronization to ensure
/// that the device data finishes copying over.
///
/// # Mutating device data
///
/// This can only be done by launching kernels via
/// [crate::driver::LaunchAsync] which is implemented
/// by [CudaDevice]. Pass `&mut CudaSlice<T>`
/// if you want to mutate the underlying data, and `&CudaSlice<T>` otherwise.
///
/// Unfortunately, `&CudaSlice<T>` can **still be mutated
/// by the [CudaFunction]**.
#[derive(Debug)]
pub struct CudaSlice<T> {
pub(crate) cu_device_ptr: sys::CUdeviceptr,
pub(crate) len: usize,
pub(crate) device: Arc<CudaDevice>,
pub(crate) host_buf: Option<Pin<Vec<T>>>,
}
unsafe impl<T: Send> Send for CudaSlice<T> {}
unsafe impl<T: Sync> Sync for CudaSlice<T> {}
impl<T> Drop for CudaSlice<T> {
fn drop(&mut self) {
self.device.bind_to_thread().unwrap();
unsafe {
if self.device.is_async {
result::free_async(self.cu_device_ptr, self.device.stream).unwrap();
} else {
result::free_sync(self.cu_device_ptr).unwrap();
}
}
}
}
impl<T> CudaSlice<T> {
/// Get a clone of the underlying [CudaDevice].
pub fn device(&self) -> Arc<CudaDevice> {
self.device.clone()
}
}
impl<T: DeviceRepr> CudaSlice<T> {
/// Allocates a copy of self and schedules a device-to-device copy of memory.
pub fn try_clone(&self) -> Result<Self, result::DriverError> {
let mut dst = unsafe { self.device.alloc(self.len) }?;
self.device.dtod_copy(self, &mut dst)?;
Ok(dst)
}
}
impl<T: DeviceRepr> Clone for CudaSlice<T> {
fn clone(&self) -> Self {
self.try_clone().unwrap()
}
}
impl<T: Clone + Default + DeviceRepr + Unpin> TryFrom<CudaSlice<T>> for Vec<T> {
type Error = result::DriverError;
fn try_from(value: CudaSlice<T>) -> Result<Self, Self::Error> {
value.device.clone().sync_reclaim(value)
}
}
/// Wrapper around [sys::CUmodule] that also contains
/// the loaded [CudaFunction] associated with this module.
///
/// See [CudaModule::get_fn()] for retrieving function handles.
#[derive(Debug)]
pub(crate) struct CudaModule {
pub(crate) cu_module: sys::CUmodule,
pub(crate) functions: BTreeMap<&'static str, sys::CUfunction>,
}
unsafe impl Send for CudaModule {}
unsafe impl Sync for CudaModule {}
/// Wrapper around [sys::CUfunction]. Used by [crate::driver::LaunchAsync].
#[derive(Debug, Clone)]
pub struct CudaFunction {
pub(crate) cu_function: sys::CUfunction,
pub(crate) device: Arc<CudaDevice>,
}
impl CudaFunction {
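/// Returns the dynamic shared memory size (in bytes) available per block when
/// launching `num_blocks` blocks of `block_size` threads with this function
/// (wraps [sys::cuOccupancyAvailableDynamicSMemPerBlock]).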
pub fn occupancy_available_dynamic_smem_per_block(
&self,
num_blocks: u32,
block_size: u32,
) -> Result<usize, result::DriverError> {
let mut dynamic_smem_size: usize = 0;
unsafe {
sys::cuOccupancyAvailableDynamicSMemPerBlock(
&mut dynamic_smem_size,
self.cu_function,
num_blocks as std::ffi::c_int,
block_size as std::ffi::c_int,
)
.result()?
};
Ok(dynamic_smem_size)
}
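/// Returns the maximum number of active blocks per multiprocessor for this function
/// at the given `block_size` and `dynamic_smem_size`
/// (wraps [sys::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags]).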
pub fn occupancy_max_active_blocks_per_multiprocessor(
&self,
block_size: u32,
dynamic_smem_size: usize,
flags: Option<sys::CUoccupancy_flags_enum>,
) -> Result<u32, result::DriverError> {
let mut num_blocks: std::ffi::c_int = 0;
let flags = flags.unwrap_or(sys::CUoccupancy_flags_enum::CU_OCCUPANCY_DEFAULT);
unsafe {
sys::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
&mut num_blocks,
self.cu_function,
block_size as std::ffi::c_int,
dynamic_smem_size,
flags as std::ffi::c_uint,
)
.result()?
};
Ok(num_blocks as u32)
}
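/// Returns the maximum number of thread clusters that could co-exist on the device
/// for the given launch `config` and `shared_mem_size`
/// (wraps [sys::cuOccupancyMaxActiveClusters]).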
pub fn occupancy_max_active_clusters(
&self,
config: crate::driver::LaunchConfig,
shared_mem_size: u32,
) -> Result<u32, result::DriverError> {
let mut num_clusters: std::ffi::c_int = 0;
let cfg = sys::CUlaunchConfig {
gridDimX: config.grid_dim.0,
gridDimY: config.grid_dim.1,
gridDimZ: config.grid_dim.2,
blockDimX: config.block_dim.0,
blockDimY: config.block_dim.1,
blockDimZ: config.block_dim.2,
sharedMemBytes: shared_mem_size as std::ffi::c_uint,
hStream: self.device.stream,
attrs: std::ptr::null_mut(),
numAttrs: 0,
};
unsafe {
sys::cuOccupancyMaxActiveClusters(&mut num_clusters, self.cu_function, &cfg).result()?
};
Ok(num_clusters as u32)
}
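/// Suggests a `(min_grid_size, block_size)` pair that maximizes occupancy for this
/// function, using `block_size_to_dynamic_smem_size` to derive the per-block dynamic
/// shared memory from a candidate block size
/// (wraps [sys::cuOccupancyMaxPotentialBlockSizeWithFlags]).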
pub fn occupancy_max_potential_block_size(
&self,
block_size_to_dynamic_smem_size: extern "C" fn(block_size: std::ffi::c_int) -> usize,
dynamic_smem_size: usize,
block_size_limit: u32,
flags: Option<sys::CUoccupancy_flags_enum>,
) -> Result<(u32, u32), result::DriverError> {
let mut min_grid_size: std::ffi::c_int = 0;
let mut block_size: std::ffi::c_int = 0;
let flags = flags.unwrap_or(sys::CUoccupancy_flags_enum::CU_OCCUPANCY_DEFAULT);
unsafe {
sys::cuOccupancyMaxPotentialBlockSizeWithFlags(
&mut min_grid_size,
&mut block_size,
self.cu_function,
Some(block_size_to_dynamic_smem_size),
dynamic_smem_size,
block_size_limit as std::ffi::c_int,
flags as std::ffi::c_uint,
)
.result()?
};
Ok((min_grid_size as u32, block_size as u32))
}
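/// Returns the maximum cluster size that can be launched for this function with the
/// given launch `config` and `shared_mem_size`
/// (wraps [sys::cuOccupancyMaxPotentialClusterSize]).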
pub fn occupancy_max_potential_cluster_size(
&self,
config: crate::driver::LaunchConfig,
shared_mem_size: u32,
) -> Result<u32, result::DriverError> {
let mut cluster_size: std::ffi::c_int = 0;
let cfg = sys::CUlaunchConfig {
gridDimX: config.grid_dim.0,
gridDimY: config.grid_dim.1,
gridDimZ: config.grid_dim.2,
blockDimX: config.block_dim.0,
blockDimY: config.block_dim.1,
blockDimZ: config.block_dim.2,
sharedMemBytes: shared_mem_size as std::ffi::c_uint,
hStream: self.device.stream,
attrs: std::ptr::null_mut(),
numAttrs: 0,
};
unsafe {
sys::cuOccupancyMaxPotentialClusterSize(&mut cluster_size, self.cu_function, &cfg)
.result()?
};
Ok(cluster_size as u32)
}
/// Set the value of a specific attribute of this [CudaFunction].
pub fn set_attribute(
&self,
attribute: CUfunction_attribute_enum,
value: i32,
) -> Result<(), result::DriverError> {
unsafe {
result::function::set_function_attribute(self.cu_function, attribute, value)?;
}
Ok(())
}
}
unsafe impl Send for CudaFunction {}
unsafe impl Sync for CudaFunction {}
/// A wrapper around [sys::CUstream] that safely ensures the null stream is synchronized
/// upon the completion of this stream's work.
///
/// Create with [CudaDevice::fork_default_stream].
///
/// The synchronization happens in **code order**. E.g.
/// ```ignore
/// let stream = dev.fork_default_stream()?; // 0
/// dev.launch(...)?; // 1
/// dev.launch_on_stream(&stream, ...)?; // 2
/// dev.launch(...)?; // 3
/// drop(stream); // 4
/// dev.launch(...) // 5
/// ```
///
/// - 0 will place a streamWaitEvent(default work stream) on the new stream
/// - 1 will launch on the default work stream
/// - 2 will launch concurrently to 1 on `&stream`,
/// - 3 will launch after 1 on the default work stream, but potentially concurrently to 2.
/// - 4 will place a streamWaitEvent(`&stream`) on default work stream
/// - 5 will happen on the default stream **after the default stream waits for 2**
#[derive(Debug)]
pub struct CudaStream {
pub stream: sys::CUstream,
device: Arc<CudaDevice>,
}
impl CudaDevice {
/// Allocates a new stream that can execute kernels concurrently to the default stream.
///
/// The synchronization with default stream happens in **code order**. See [CudaStream] docstring.
///
/// This stream synchronizes in the following way:
/// 1. On creation it adds a wait for any existing work on the default work stream to complete
/// 2. On drop it adds a wait to the default stream for any existing work on Self to complete.
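///
/// A minimal fork/join sketch (assuming a loaded kernel `f`, a `cfg`, and parameter
/// tuples `params_a`/`params_b` are in scope):
/// ```ignore
/// let stream = dev.fork_default_stream()?;                // waits on existing default-stream work
/// unsafe { f.clone().launch(cfg, params_a)? };            // default stream
/// unsafe { f.launch_on_stream(&stream, cfg, params_b)? }; // concurrent stream
/// dev.wait_for(&stream)?;                                 // default stream waits for `stream`
/// dev.synchronize()?;
/// ```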
pub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, result::DriverError> {
self.bind_to_thread()?;
let stream = CudaStream {
stream: result::stream::create(result::stream::StreamKind::NonBlocking)?,
device: self.clone(),
};
stream.wait_for_default()?;
Ok(stream)
}
/// Causes the default work stream to block on the completion of `stream`'s queued work
/// (this is what [CudaStream]'s [Drop] impl calls).
/// **This is asynchronous with respect to the host.**
#[allow(unused_variables)]
pub fn wait_for(self: &Arc<Self>, stream: &CudaStream) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
unsafe {
result::event::record(self.event, stream.stream)?;
result::stream::wait_event(
self.stream,
self.event,
sys::CUevent_wait_flags::CU_EVENT_WAIT_DEFAULT,
)
}
}
}
impl CudaStream {
/// Records the current default stream's workload, and then causes `self`
/// to wait for the default stream to finish that recorded workload.
pub fn wait_for_default(&self) -> Result<(), result::DriverError> {
self.device.bind_to_thread()?;
unsafe {
result::event::record(self.device.event, self.device.stream)?;
result::stream::wait_event(
self.stream,
self.device.event,
sys::CUevent_wait_flags::CU_EVENT_WAIT_DEFAULT,
)
}
}
}
impl Drop for CudaStream {
fn drop(&mut self) {
self.device.wait_for(self).unwrap();
unsafe {
result::stream::destroy(self.stream).unwrap();
}
}
}
/// An immutable sub-view into a [CudaSlice] created by [CudaSlice::try_slice()].
#[derive(Debug)]
pub struct CudaView<'a, T> {
pub(crate) root: &'a sys::CUdeviceptr,
pub(crate) ptr: sys::CUdeviceptr,
pub(crate) len: usize,
marker: PhantomData<T>,
}
impl<T> CudaSlice<T> {
/// Creates a [CudaView] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice(&self, range: impl RangeBounds<usize>) -> CudaView<'_, T> {
self.try_slice(range).unwrap()
}
/// Fallible version of [CudaSlice::slice]
pub fn try_slice(&self, range: impl RangeBounds<usize>) -> Option<CudaView<'_, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaView {
root: &self.cu_device_ptr,
ptr: self.cu_device_ptr + (start * std::mem::size_of::<T>()) as u64,
len: end - start,
marker: PhantomData,
})
}
/// Reinterprets the slice of memory into a different type. `len` is the number
/// of elements of the new type `S` that are expected. If not enough bytes
/// are allocated in `self` for the view, then this returns `None`.
///
/// # Safety
/// This is unsafe because the memory backing the view may not be a valid interpretation
/// for the type `S`.
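///
/// For example, a 100-byte `CudaSlice<u8>` (`bytes` below) can be viewed as at most 25 `f32`s:
/// ```ignore
/// let floats = unsafe { bytes.transmute::<f32>(25) };   // Some(CudaView<f32>)
/// let too_many = unsafe { bytes.transmute::<f32>(26) }; // None
/// ```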
pub unsafe fn transmute<S>(&self, len: usize) -> Option<CudaView<'_, S>> {
(len * std::mem::size_of::<S>() <= self.num_bytes()).then_some(CudaView {
root: &self.cu_device_ptr,
ptr: self.cu_device_ptr,
len,
marker: PhantomData,
})
}
}
impl<'a, T> CudaView<'a, T> {
/// Creates a [CudaView] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice(&self, range: impl RangeBounds<usize>) -> CudaView<'a, T> {
self.try_slice(range).unwrap()
}
/// Fallible version of [CudaView::slice]
pub fn try_slice(&self, range: impl RangeBounds<usize>) -> Option<CudaView<'a, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaView {
root: self.root,
ptr: self.ptr + (start * std::mem::size_of::<T>()) as u64,
len: end - start,
marker: PhantomData,
})
}
}
/// A mutable sub-view into a [CudaSlice] created by [CudaSlice::try_slice_mut()].
#[derive(Debug)]
pub struct CudaViewMut<'a, T> {
pub(crate) root: &'a mut sys::CUdeviceptr,
pub(crate) ptr: sys::CUdeviceptr,
pub(crate) len: usize,
marker: PhantomData<T>,
}
impl<T> CudaSlice<T> {
/// Creates a [CudaViewMut] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice_mut(&mut self, range: impl RangeBounds<usize>) -> CudaViewMut<'_, T> {
self.try_slice_mut(range).unwrap()
}
/// Fallible version of [CudaSlice::slice_mut]
pub fn try_slice_mut(&mut self, range: impl RangeBounds<usize>) -> Option<CudaViewMut<'_, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaViewMut {
ptr: self.cu_device_ptr + (start * std::mem::size_of::<T>()) as u64,
root: &mut self.cu_device_ptr,
len: end - start,
marker: PhantomData,
})
}
/// Reinterprets the slice of memory into a different type. `len` is the number
/// of elements of the new type `S` that are expected. If not enough bytes
/// are allocated in `self` for the view, then this returns `None`.
///
/// # Safety
/// This is unsafe because the memory backing the view may not be a valid interpretation
/// for the type `S`.
pub unsafe fn transmute_mut<S>(&mut self, len: usize) -> Option<CudaViewMut<'_, S>> {
(len * std::mem::size_of::<S>() <= self.num_bytes()).then_some(CudaViewMut {
ptr: self.cu_device_ptr,
root: &mut self.cu_device_ptr,
len,
marker: PhantomData,
})
}
}
impl<'a, T> CudaViewMut<'a, T> {
/// Creates a [CudaView] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice<'b: 'a>(&'b self, range: impl RangeBounds<usize>) -> CudaView<'a, T> {
self.try_slice(range).unwrap()
}
/// Fallible version of [CudaViewMut::slice]
pub fn try_slice<'b: 'a>(&'b self, range: impl RangeBounds<usize>) -> Option<CudaView<'a, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaView {
root: self.root,
ptr: self.ptr + (start * std::mem::size_of::<T>()) as u64,
len: end - start,
marker: PhantomData,
})
}
/// Creates a [CudaViewMut] at the specified offset from the start of `self`.
///
/// Panics if `range` is out of bounds of `self`.
pub fn slice_mut<'b: 'a>(&'b mut self, range: impl RangeBounds<usize>) -> CudaViewMut<'a, T> {
self.try_slice_mut(range).unwrap()
}
/// Fallible version of [CudaViewMut::slice_mut]
pub fn try_slice_mut<'b: 'a>(
&'b mut self,
range: impl RangeBounds<usize>,
) -> Option<CudaViewMut<'a, T>> {
range.bounds(..self.len()).map(|(start, end)| CudaViewMut {
ptr: self.ptr + (start * std::mem::size_of::<T>()) as u64,
root: self.root,
len: end - start,
marker: PhantomData,
})
}
}
trait RangeHelper: RangeBounds<usize> {
fn inclusive_start(&self, valid_start: usize) -> usize;
fn exclusive_end(&self, valid_end: usize) -> usize;
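/// Resolves `self` against the `valid` range, returning the concrete `(start, end)`
/// pair when `self` lies fully inside `valid` and is not reversed; empty exclusive
/// ranges like `2..2` are accepted (see the `test_bounds_helper` test below).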
fn bounds(&self, valid: impl RangeHelper) -> Option<(usize, usize)> {
let vs = valid.inclusive_start(0);
let ve = valid.exclusive_end(usize::MAX);
let s = self.inclusive_start(vs);
let e = self.exclusive_end(ve);
let inside = s >= vs && e <= ve;
let valid = s < e || (s == e && !matches!(self.end_bound(), Bound::Included(_)));
(inside && valid).then_some((s, e))
}
}
impl<R: RangeBounds<usize>> RangeHelper for R {
fn inclusive_start(&self, valid_start: usize) -> usize {
match self.start_bound() {
Bound::Included(n) => *n,
Bound::Excluded(n) => *n + 1,
Bound::Unbounded => valid_start,
}
}
fn exclusive_end(&self, valid_end: usize) -> usize {
match self.end_bound() {
Bound::Included(n) => *n + 1,
Bound::Excluded(n) => *n,
Bound::Unbounded => valid_end,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[allow(clippy::reversed_empty_ranges)]
fn test_bounds_helper() {
assert_eq!((..2usize).bounds(0..usize::MAX), Some((0, 2)));
assert_eq!((1..2usize).bounds(..usize::MAX), Some((1, 2)));
assert_eq!((..).bounds(1..10), Some((1, 10)));
assert_eq!((2..=2usize).bounds(0..usize::MAX), Some((2, 3)));
assert_eq!((2..=2usize).bounds(0..=1), None);
assert_eq!((2..2usize).bounds(0..usize::MAX), Some((2, 2)));
assert_eq!((1..0usize).bounds(0..usize::MAX), None);
assert_eq!((1..=0usize).bounds(0..usize::MAX), None);
}
#[test]
fn test_transmutes() {
let dev = CudaDevice::new(0).unwrap();
let mut slice = dev.alloc_zeros::<u8>(100).unwrap();
assert!(unsafe { slice.transmute::<f32>(25) }.is_some());
assert!(unsafe { slice.transmute::<f32>(26) }.is_none());
assert!(unsafe { slice.transmute_mut::<f32>(25) }.is_some());
assert!(unsafe { slice.transmute_mut::<f32>(26) }.is_none());
}
}
use crate::driver::sys;
use super::core::{CudaSlice, CudaView, CudaViewMut};
pub trait DeviceSlice<T> {
fn len(&self) -> usize;
fn num_bytes(&self) -> usize {
self.len() * std::mem::size_of::<T>()
}
fn is_empty(&self) -> bool {
self.len() == 0
}
}
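// A generic helper can be written against `DeviceSlice` without caring whether the
// argument is a `CudaSlice`, `CudaView`, or `CudaViewMut`. A hypothetical sketch
// (not part of the crate's API):
//
// fn total_bytes<T, S: DeviceSlice<T>>(buffers: &[S]) -> usize {
//     buffers.iter().map(|b| b.num_bytes()).sum()
// }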
impl<T> DeviceSlice<T> for CudaSlice<T> {
fn len(&self) -> usize {
self.len
}
}
impl<'a, T> DeviceSlice<T> for CudaView<'a, T> {
fn len(&self) -> usize {
self.len
}
}
impl<'a, T> DeviceSlice<T> for CudaViewMut<'a, T> {
fn len(&self) -> usize {
self.len
}
}
/// Abstraction over [CudaSlice]/[CudaView]
pub trait DevicePtr<T>: DeviceSlice<T> {
fn device_ptr(&self) -> &sys::CUdeviceptr;
}
impl<T> DevicePtr<T> for CudaSlice<T> {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.cu_device_ptr
}
}
impl<'a, T> DevicePtr<T> for CudaView<'a, T> {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.ptr
}
}
impl<'a, T> DevicePtr<T> for CudaViewMut<'a, T> {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.ptr
}
}
/// Abstraction over [CudaSlice]/[CudaViewMut]
pub trait DevicePtrMut<T>: DeviceSlice<T> {
fn device_ptr_mut(&mut self) -> &mut sys::CUdeviceptr;
}
impl<T> DevicePtrMut<T> for CudaSlice<T> {
fn device_ptr_mut(&mut self) -> &mut sys::CUdeviceptr {
&mut self.cu_device_ptr
}
}
impl<'a, T> DevicePtrMut<T> for CudaViewMut<'a, T> {
fn device_ptr_mut(&mut self) -> &mut sys::CUdeviceptr {
&mut self.ptr
}
}
use core::mem::ManuallyDrop;
use std::fs::File;
use std::ops::Range;
use std::sync::Arc;
use super::{CudaDevice, DevicePtr, DeviceSlice};
use crate::driver::{result, sys, DriverError};
impl CudaDevice {
/// Import external memory from a [`File`].
///
/// # Safety
/// `size` must be the size of the external memory in bytes.
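///
/// A minimal usage sketch (assumes `file` wraps an opaque handle exported by
/// another API, e.g. Vulkan, and `size` is the allocation size in bytes):
/// ```ignore
/// let ext = unsafe { dev.import_external_memory(file, size)? };
/// let buf = ext.map_all()?; // MappedBuffer, usable as a DevicePtr<u8>
/// ```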
#[cfg(any(unix, windows))]
pub unsafe fn import_external_memory(
self: &Arc<Self>,
file: File,
size: u64,
) -> Result<ExternalMemory, DriverError> {
self.bind_to_thread()?;
#[cfg(unix)]
let external_memory = unsafe {
use std::os::fd::AsRawFd;
result::external_memory::import_external_memory_opaque_fd(file.as_raw_fd(), size)
}?;
#[cfg(windows)]
let external_memory = unsafe {
use std::os::windows::io::AsRawHandle;
result::external_memory::import_external_memory_opaque_win32(file.as_raw_handle(), size)
}?;
Ok(ExternalMemory {
external_memory,
size,
device: self.clone(),
_file: ManuallyDrop::new(file),
})
}
}
/// An abstraction for imported external memory.
///
/// This struct can be created via [`CudaDevice::import_external_memory`].
/// The imported external memory will be destroyed when this struct is dropped.
#[derive(Debug)]
pub struct ExternalMemory {
external_memory: sys::CUexternalMemory,
size: u64,
device: Arc<CudaDevice>,
_file: ManuallyDrop<File>,
}
impl Drop for ExternalMemory {
fn drop(&mut self) {
self.device.bind_to_thread().unwrap();
unsafe { result::external_memory::destroy_external_memory(self.external_memory) }.unwrap();
// From [CUDA docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html#group__CUDA__EXTRES__INTEROP_1g52aba3a7f780157d8ba12972b2481735),
// when successfully importing UNIX file descriptor:
//
// > Ownership of the file descriptor is transferred to the CUDA driver when the handle is imported successfully.
// > Performing any operations on the file descriptor after it is imported results in undefined behavior.
//
// On the other hand, on Windows:
//
// > Ownership of this handle is not transferred to CUDA after the import operation,
// > so the application must release the handle using the appropriate system call.
//
// Therefore, we manually drop the file when we are on Windows.
#[cfg(windows)]
unsafe {
ManuallyDrop::<File>::drop(&mut self._file)
};
}
}
impl ExternalMemory {
/// Map the whole external memory to get a mapped buffer.
pub fn map_all(self) -> Result<MappedBuffer, DriverError> {
let size = self.size as usize;
self.map_range(0..size)
}
/// Map a range of the external memory to a mapped buffer.
///
/// Only one mapped buffer is allowed at a time.
/// This is more restrictive than it necessarily needs to be,
/// but it makes enforcing safety easier.
///
/// # Panics
/// This function will panic if the range is invalid,
/// such as when the start or end is larger than the size.
pub fn map_range(self, range: Range<usize>) -> Result<MappedBuffer, DriverError> {
assert!(range.start as u64 <= self.size);
assert!(range.end as u64 <= self.size);
let device_ptr = unsafe {
result::external_memory::get_mapped_buffer(
self.external_memory,
range.start as u64,
range.len() as u64,
)
}?;
Ok(MappedBuffer {
device_ptr,
len: range.len(),
external_memory: self,
})
}
}
/// An abstraction for a mapped buffer for some external memory.
///
/// This struct can be created via [`ExternalMemory::map_range`] or [`ExternalMemory::map_all`].
/// The underlying mapped buffer will be freed when this struct is dropped.
#[derive(Debug)]
pub struct MappedBuffer {
device_ptr: sys::CUdeviceptr,
len: usize,
external_memory: ExternalMemory,
}
impl Drop for MappedBuffer {
fn drop(&mut self) {
self.external_memory.device.bind_to_thread().unwrap();
unsafe { result::memory_free(self.device_ptr) }.unwrap()
}
}
impl DeviceSlice<u8> for MappedBuffer {
fn len(&self) -> usize {
self.len
}
}
impl DevicePtr<u8> for MappedBuffer {
fn device_ptr(&self) -> &sys::CUdeviceptr {
&self.device_ptr
}
}
use crate::driver::{result, sys};
use super::alloc::DeviceRepr;
use super::core::{CudaDevice, CudaFunction, CudaModule, CudaStream};
use std::sync::Arc;
impl CudaDevice {
/// Whether a module and function are currently loaded into the device.
pub fn has_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> bool {
let modules = self.modules.read();
#[cfg(not(feature = "no-std"))]
let modules = modules.unwrap();
modules
.get(module_name)
.map_or(false, |module| module.has_func(func_name))
}
/// Retrieves a [CudaFunction] that was registered under `module_name` and `func_name`.
pub fn get_func(self: &Arc<Self>, module_name: &str, func_name: &str) -> Option<CudaFunction> {
let modules = self.modules.read();
#[cfg(not(feature = "no-std"))]
let modules = modules.unwrap();
modules
.get(module_name)
.and_then(|m| m.get_func(func_name))
.map(|cu_function| CudaFunction {
cu_function,
device: self.clone(),
})
}
}
impl CudaModule {
/// Returns the function with `name`. If the function
/// was not already loaded into the CudaModule, then `None`
/// is returned.
pub(crate) fn get_func(&self, name: &str) -> Option<sys::CUfunction> {
self.functions.get(name).cloned()
}
pub(crate) fn has_func(&self, name: &str) -> bool {
self.functions.contains_key(name)
}
}
impl CudaFunction {
#[inline(always)]
unsafe fn launch_async_impl(
self,
cfg: LaunchConfig,
params: &mut [*mut std::ffi::c_void],
) -> Result<(), result::DriverError> {
self.device.bind_to_thread()?;
result::launch_kernel(
self.cu_function,
cfg.grid_dim,
cfg.block_dim,
cfg.shared_mem_bytes,
self.device.stream,
params,
)
}
#[inline(always)]
unsafe fn par_launch_async_impl(
self,
stream: &CudaStream,
cfg: LaunchConfig,
params: &mut [*mut std::ffi::c_void],
) -> Result<(), result::DriverError> {
self.device.bind_to_thread()?;
result::launch_kernel(
self.cu_function,
cfg.grid_dim,
cfg.block_dim,
cfg.shared_mem_bytes,
stream.stream,
params,
)
}
}
/// Configuration for [result::launch_kernel]
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15)
/// for description of each parameter.
#[derive(Clone, Copy, Debug)]
pub struct LaunchConfig {
/// (width, height, depth) of grid in blocks
pub grid_dim: (u32, u32, u32),
/// (x, y, z) dimension of each thread block
pub block_dim: (u32, u32, u32),
/// Dynamic shared-memory size per thread block in bytes
pub shared_mem_bytes: u32,
}
impl LaunchConfig {
/// Creates a [LaunchConfig] with:
/// - block_dim == `(1024, 1, 1)`
/// - grid_dim == `((n + 1023) / 1024, 1, 1)`
/// - shared_mem_bytes == `0`
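///
/// For example, `n = 2500` yields `grid_dim == (3, 1, 1)` (since `(2500 + 1023) / 1024 == 3`)
/// and `block_dim == (1024, 1, 1)`, covering 3072 threads.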
pub fn for_num_elems(n: u32) -> Self {
const NUM_THREADS: u32 = 1024;
let num_blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
Self {
grid_dim: (num_blocks, 1, 1),
block_dim: (NUM_THREADS, 1, 1),
shared_mem_bytes: 0,
}
}
}
/// Consumes a [CudaFunction] to execute asynchronously on the device with
/// params determined by generic parameter `Params`.
///
/// This is impl'd multiple times for different numbers and types of params. In
/// general, `Params` should impl [DeviceRepr].
///
/// ```ignore
/// # use cudarc::driver::*;
/// # let dev = CudaDevice::new(0).unwrap();
/// let my_kernel: CudaFunction = dev.get_func("my_module", "my_kernel").unwrap();
/// let cfg: LaunchConfig = LaunchConfig {
/// grid_dim: (1, 1, 1),
/// block_dim: (1, 1, 1),
/// shared_mem_bytes: 0,
/// };
/// let params = (1i32, 2u64, 3usize);
/// unsafe { my_kernel.launch(cfg, params) }.unwrap();
/// ```
///
/// # Safety
///
/// This is never really safe, because there's no guarantee that `Params`
/// will work for any [CudaFunction] passed in. Great care should be taken
/// to ensure that [CudaFunction] works with `Params` and that the correct
/// parameters have `&mut` in front of them.
///
/// Additionally, kernels can mutate data that is marked as immutable,
/// such as `&CudaSlice<T>`.
///
/// See [LaunchAsync::launch] for more details
pub unsafe trait LaunchAsync<Params> {
/// Launches the [CudaFunction] with the corresponding `Params`.
///
/// # Safety
///
/// This method is **very** unsafe.
///
/// See cuda documentation notes on this as well:
/// <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#functions>
///
/// 1. `params` can be changed regardless of `&` or `&mut` usage.
/// 2. `params` will be changed at some later point after the
/// function returns because the kernel is executed async.
/// 3. There are no guarantees that the `params`
/// are the correct number/types/order for `func`.
/// 4. Specifying the wrong values for [LaunchConfig] can result
/// in accessing/modifying values past memory limits.
///
/// ## Asynchronous mutation
///
/// Since this library queues kernels to be launched on a single
/// stream, and really the only way to modify [crate::driver::CudaSlice] is through
/// kernels, mutating the same [crate::driver::CudaSlice] with multiple kernels
/// is safe. This is because each kernel is executed sequentially
/// on the stream.
///
/// **Modifying a value on the host that is in use by a
/// kernel is undefined behavior.** But it is hard to do
/// accidentally.
///
/// Also for this reason, do not pass in any values to kernels
/// that can be modified on the host. This is the reason
/// [DeviceRepr] is not implemented for rust primitive
/// references.
///
/// ## Use after free
///
/// Since the drop implementation for [crate::driver::CudaSlice] also occurs
/// on the device's single stream, any kernels launched before
/// the drop will complete before the value is actually freed.
///
/// **If you launch a kernel or drop a value on a different stream
/// this may not hold**
unsafe fn launch(self, cfg: LaunchConfig, params: Params) -> Result<(), result::DriverError>;
/// Launch the function on a stream concurrent to the device's default
/// work stream.
///
/// # Safety
/// This method is even more unsafe than [LaunchAsync::launch], all the same rules apply,
/// except now things are executing in parallel to each other.
///
/// That means that if any of the kernels modify the same memory location, you'll get race
/// conditions or potentially undefined behavior.
unsafe fn launch_on_stream(
self,
stream: &CudaStream,
cfg: LaunchConfig,
params: Params,
) -> Result<(), result::DriverError>;
}
macro_rules! impl_launch {
([$($Vars:tt),*], [$($Idx:tt),*]) => {
unsafe impl<$($Vars: DeviceRepr),*> LaunchAsync<($($Vars, )*)> for CudaFunction {
#[inline(always)]
unsafe fn launch(
self,
cfg: LaunchConfig,
args: ($($Vars, )*)
) -> Result<(), result::DriverError> {
let params = &mut [$(args.$Idx.as_kernel_param(), )*];
self.launch_async_impl(cfg, params)
}
#[inline(always)]
unsafe fn launch_on_stream(
self,
stream: &CudaStream,
cfg: LaunchConfig,
args: ($($Vars, )*)
) -> Result<(), result::DriverError> {
let params = &mut [$(args.$Idx.as_kernel_param(), )*];
self.par_launch_async_impl(stream, cfg, params)
}
}
};
}
impl_launch!([A], [0]);
impl_launch!([A, B], [0, 1]);
impl_launch!([A, B, C], [0, 1, 2]);
impl_launch!([A, B, C, D], [0, 1, 2, 3]);
impl_launch!([A, B, C, D, E], [0, 1, 2, 3, 4]);
impl_launch!([A, B, C, D, E, F], [0, 1, 2, 3, 4, 5]);
impl_launch!([A, B, C, D, E, F, G], [0, 1, 2, 3, 4, 5, 6]);
impl_launch!([A, B, C, D, E, F, G, H], [0, 1, 2, 3, 4, 5, 6, 7]);
impl_launch!([A, B, C, D, E, F, G, H, I], [0, 1, 2, 3, 4, 5, 6, 7, 8]);
impl_launch!(
[A, B, C, D, E, F, G, H, I, J],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
);
impl_launch!(
[A, B, C, D, E, F, G, H, I, J, K],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
);
impl_launch!(
[A, B, C, D, E, F, G, H, I, J, K, L],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
);
#[cfg(test)]
mod tests {
use std::{time::Instant, vec::Vec};
use crate::{
driver::{DeviceSlice, DriverError},
nvrtc::compile_ptx_with_opts,
};
use super::*;
#[test]
fn test_mut_into_kernel_param_no_inc_rc() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f32; 1].to_vec()).unwrap();
let _r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
let _ = (&t).as_kernel_param();
assert_eq!(Arc::strong_count(&device), 3);
}
#[test]
fn test_ref_into_kernel_param_inc_rc() {
let device = CudaDevice::new(0).unwrap();
let t = device.htod_copy([0.0f32; 1].to_vec()).unwrap();
let _r = t.clone();
assert_eq!(Arc::strong_count(&device), 3);
let _ = (&t).as_kernel_param();
assert_eq!(Arc::strong_count(&device), 3);
}
const SIN_CU: &str = "
extern \"C\" __global__ void sin_kernel(float *out, const float *inp, size_t numel) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < numel) {
out[i] = sin(inp[i]);
}
}";
#[test]
fn test_launch_with_mut_and_ref_cudarc() {
let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
let sin_kernel = dev.get_func("sin", "sin_kernel").unwrap();
let a_host = [-1.0f32, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8];
let a_dev = dev.htod_copy(a_host.clone().to_vec()).unwrap();
let mut b_dev = a_dev.clone();
unsafe {
sin_kernel.launch(
LaunchConfig::for_num_elems(10),
(&mut b_dev, &a_dev, 10usize),
)
}
.unwrap();
let b_host = dev.sync_reclaim(b_dev).unwrap();
for (a_i, b_i) in a_host.iter().zip(b_host.iter()) {
let expected = a_i.sin();
assert!((b_i - expected).abs() <= 1e-6);
}
drop(a_dev);
}
#[test]
fn test_large_launches() {
let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
for numel in [256, 512, 1024, 1280, 1536, 2048] {
let mut a = Vec::with_capacity(numel);
a.resize(numel, 1.0f32);
let a = dev.htod_copy(a).unwrap();
let mut b = dev.alloc_zeros::<f32>(numel).unwrap();
let sin_kernel = dev.get_func("sin", "sin_kernel").unwrap();
let cfg = LaunchConfig::for_num_elems(numel as u32);
unsafe { sin_kernel.launch(cfg, (&mut b, &a, numel)) }.unwrap();
let b = dev.sync_reclaim(b).unwrap();
for v in b {
assert_eq!(v, 0.841471);
}
}
}
#[test]
fn test_launch_with_views() {
let ptx = compile_ptx_with_opts(SIN_CU, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "sin", &["sin_kernel"]).unwrap();
let a_host = [-1.0f32, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8];
let a_dev = dev.htod_copy(a_host.clone().to_vec()).unwrap();
let mut b_dev = a_dev.clone();
for i in 0..5 {
let a_sub = a_dev.try_slice(i * 2..).unwrap();
assert_eq!(a_sub.len, 10 - 2 * i);
let mut b_sub = b_dev.try_slice_mut(i * 2..).unwrap();
assert_eq!(b_sub.len, 10 - 2 * i);
let f = dev.get_func("sin", "sin_kernel").unwrap();
unsafe { f.launch(LaunchConfig::for_num_elems(2), (&mut b_sub, &a_sub, 2usize)) }
.unwrap();
}
let b_host = dev.sync_reclaim(b_dev).unwrap();
for (a_i, b_i) in a_host.iter().zip(b_host.iter()) {
let expected = a_i.sin();
assert!((b_i - expected).abs() <= 1e-6);
}
drop(a_dev);
}
const TEST_KERNELS: &str = "
extern \"C\" __global__ void int_8bit(signed char s_min, char s_max, unsigned char u_min, unsigned char u_max) {
assert(s_min == -128);
assert(s_max == 127);
assert(u_min == 0);
assert(u_max == 255);
}
extern \"C\" __global__ void int_16bit(signed short s_min, short s_max, unsigned short u_min, unsigned short u_max) {
assert(s_min == -32768);
assert(s_max == 32767);
assert(u_min == 0);
assert(u_max == 65535);
}
extern \"C\" __global__ void int_32bit(signed int s_min, int s_max, unsigned int u_min, unsigned int u_max) {
assert(s_min == -2147483648);
assert(s_max == 2147483647);
assert(u_min == 0);
assert(u_max == 4294967295);
}
extern \"C\" __global__ void int_64bit(signed long s_min, long s_max, unsigned long u_min, unsigned long u_max) {
assert(s_min == -9223372036854775808);
assert(s_max == 9223372036854775807);
assert(u_min == 0);
assert(u_max == 18446744073709551615);
}
extern \"C\" __global__ void floating(float f, double d) {
assert(fabs(f - 1.2345678) <= 1e-7);
assert(fabs(d - -10.123456789876543) <= 1e-16);
}
";
#[test]
fn test_launch_with_8bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_8bit"]).unwrap();
let f = dev.get_func("tests", "int_8bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i8::MIN, i8::MAX, u8::MIN, u8::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_16bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_16bit"]).unwrap();
let f = dev.get_func("tests", "int_16bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i16::MIN, i16::MAX, u16::MIN, u16::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_32bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_32bit"]).unwrap();
let f = dev.get_func("tests", "int_32bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i32::MIN, i32::MAX, u32::MIN, u32::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_64bit() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["int_64bit"]).unwrap();
let f = dev.get_func("tests", "int_64bit").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(i64::MIN, i64::MAX, u64::MIN, u64::MAX),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[test]
fn test_launch_with_floats() {
let ptx = compile_ptx_with_opts(TEST_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["floating"]).unwrap();
let f = dev.get_func("tests", "floating").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(1.2345678f32, -10.123456789876543f64),
)
}
.unwrap();
dev.synchronize().unwrap();
}
#[cfg(feature = "f16")]
const HALF_KERNELS: &str = "
#include \"cuda_fp16.h\"
extern \"C\" __global__ void halfs(__half h) {
assert(__habs(h - __float2half(1.234)) <= __float2half(1e-4));
}
";
#[cfg(feature = "f16")]
#[test]
fn test_launch_with_half() {
use crate::nvrtc::CompileOptions;
let ptx = compile_ptx_with_opts(
HALF_KERNELS,
CompileOptions {
include_paths: std::vec!["/usr/include".into()],
arch: Some("compute_53"),
..Default::default()
},
)
.unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["halfs"]).unwrap();
let f = dev.get_func("tests", "halfs").unwrap();
unsafe {
f.launch(
LaunchConfig::for_num_elems(1),
(half::f16::from_f32(1.234),),
)
}
.unwrap();
dev.synchronize().unwrap();
}
const SLOW_KERNELS: &str = "
extern \"C\" __global__ void slow_worker(const float *data, const size_t len, float *out) {
float tmp = 0.0;
for(size_t i = 0; i < 1000000; i++) {
tmp += data[i % len];
}
*out = tmp;
}
";
#[test]
fn test_par_launch() -> Result<(), DriverError> {
let ptx = compile_ptx_with_opts(SLOW_KERNELS, Default::default()).unwrap();
let dev = CudaDevice::new(0).unwrap();
dev.load_ptx(ptx, "tests", &["slow_worker"]).unwrap();
let slice = dev.alloc_zeros::<f32>(1000)?;
let mut a = dev.alloc_zeros::<f32>(1)?;
let mut b = dev.alloc_zeros::<f32>(1)?;
let cfg = LaunchConfig::for_num_elems(1);
let start = Instant::now();
{
// launch two kernels on the default stream
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch(cfg, (&slice, slice.len(), &mut a))? };
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch(cfg, (&slice, slice.len(), &mut b))? };
dev.synchronize()?;
}
let double_launch_s = start.elapsed().as_secs_f64();
let start = Instant::now();
{
// create a new stream & launch them concurrently
let stream = dev.fork_default_stream()?;
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch(cfg, (&slice, slice.len(), &mut a))? };
let f = dev.get_func("tests", "slow_worker").unwrap();
unsafe { f.launch_on_stream(&stream, cfg, (&slice, slice.len(), &mut b))? };
dev.wait_for(&stream)?;
dev.synchronize()?;
}
let par_launch_s = start.elapsed().as_secs_f64();
assert!(
(double_launch_s - 2.0 * par_launch_s).abs() < 20.0 / 100.0,
"par={:?} dbl={:?}",
par_launch_s,
double_launch_s
);
Ok(())
}
}
//! Safe abstractions over [crate::driver::result] provided by [CudaSlice], [CudaDevice], [CudaStream], and more.
pub(crate) mod alloc;
pub(crate) mod core;
pub(crate) mod device_ptr;
pub(crate) mod external_memory;
pub(crate) mod launch;
pub(crate) mod profile;
pub(crate) mod ptx;
pub(crate) mod threading;
pub use self::alloc::{DeviceRepr, ValidAsZeroBits};
pub use self::core::{CudaDevice, CudaFunction, CudaSlice, CudaStream, CudaView, CudaViewMut};
pub use self::device_ptr::{DevicePtr, DevicePtrMut, DeviceSlice};
pub use self::external_memory::{ExternalMemory, MappedBuffer};
pub use self::launch::{LaunchAsync, LaunchConfig};
pub use self::profile::{profiler_start, profiler_stop, Profiler};
pub use crate::driver::result::DriverError;
use crate::driver::{result, sys};
/// Calls [profiler_start()] in [Profiler::new()], and [profiler_stop()] in [Drop].
#[derive(Default)]
pub struct Profiler {}
impl Profiler {
/// Enables profile collection by the active profiling tool for the current context. If profiling is already enabled, then Profiler::new() has no effect.
/// More info in [Cuda docs](https://docs.nvidia.com/cuda/profiler-users-guide/)
/// ```no_run
/// use cudarc::driver::{Profiler};
/// # use cudarc::driver::result;
///
/// # fn run() -> Result<(), result::DriverError>{
/// {
/// let profiler = Profiler::new()?;
/// // Hotpath
/// // Profiler stops on drop
/// }
/// # Ok(())
/// # }
/// // Now check your results
/// // nsys profile -c cudaProfilerApi /path/to/bin
/// // And this will profile only the hotpath.
/// ```
///
pub fn new() -> Result<Self, result::DriverError> {
profiler_start()?;
Ok(Self {})
}
}
impl Drop for Profiler {
fn drop(&mut self) {
// We don't want to panic on drop.
profiler_stop().ok();
}
}
/// Enables profile collection by the active profiling tool for the current context. If profiling is already enabled, then profiler_start() has no effect.
/// More info in [Cuda docs](https://docs.nvidia.com/cuda/profiler-users-guide/)
/// For RAII version see [`Profiler::new`].
/// ```no_run
/// use cudarc::driver::{profiler_start, profiler_stop};
/// # use cudarc::driver::result;
///
/// # fn run() -> Result<(), result::DriverError>{
/// profiler_start()?;
/// // Hotpath
/// profiler_stop()?;
/// # Ok(())
/// # }
/// // Now check your results
/// // nsys profile -c cudaProfilerApi /path/to/bin
/// // And this will profile only the hotpath.
/// ```
///
pub fn profiler_start() -> Result<(), result::DriverError> {
unsafe { sys::cuProfilerStart() }.result()
}
/// Disables profile collection by the active profiling tool for the current context. If profiling is already disabled, then profiler_stop() has no effect.
pub fn profiler_stop() -> Result<(), result::DriverError> {
unsafe { sys::cuProfilerStop() }.result()
}
use crate::{
driver::result,
nvrtc::{Ptx, PtxKind},
};
use super::core::{CudaDevice, CudaModule};
use std::ffi::CString;
use std::{collections::BTreeMap, sync::Arc};
impl CudaDevice {
/// Dynamically load a set of [crate::driver::CudaFunction]s from a jit-compiled ptx.
///
/// - `ptx` contains the compiled ptx
/// - `module_name` is a unique identifier used to access the module later on with [CudaDevice::get_func()]
/// - `func_names` is a slice of function names to load into the module during build.
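///
/// A minimal usage sketch (module and kernel names below are illustrative, and the
/// example assumes the `nvrtc` feature for `crate::nvrtc::compile_ptx`):
/// ```no_run
/// # use cudarc::{driver::CudaDevice, nvrtc::compile_ptx};
/// let dev = CudaDevice::new(0).unwrap();
/// let ptx = compile_ptx("extern \"C\" __global__ void my_kernel() { }").unwrap();
/// dev.load_ptx(ptx, "my_module", &["my_kernel"]).unwrap();
/// let _f = dev.get_func("my_module", "my_kernel").unwrap();
/// ```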
pub fn load_ptx(
self: &Arc<Self>,
ptx: Ptx,
module_name: &str,
func_names: &[&'static str],
) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
let cu_module = match ptx.0 {
PtxKind::Image(image) => unsafe {
result::module::load_data(image.as_ptr() as *const _)
},
PtxKind::Src(src) => {
let c_src = CString::new(src).unwrap();
unsafe { result::module::load_data(c_src.as_ptr() as *const _) }
}
PtxKind::File(path) => {
let name_c = CString::new(path.to_str().unwrap()).unwrap();
result::module::load(name_c)
}
}?;
let mut functions = BTreeMap::new();
for &fn_name in func_names.iter() {
let fn_name_c = CString::new(fn_name).unwrap();
let cu_function = unsafe { result::module::get_function(cu_module, fn_name_c) }?;
functions.insert(fn_name, cu_function);
}
let module = CudaModule {
cu_module,
functions,
};
#[allow(unused_mut)]
{
let mut modules = self.modules.write();
#[cfg(not(feature = "no-std"))]
let mut modules = modules.unwrap();
modules.insert(module_name.into(), module);
}
Ok(())
}
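/// Like [CudaDevice::load_ptx()], but takes an already-compiled module image
/// (e.g. ptx/cubin bytes) and hands it directly to the driver's module-loading call
/// instead of going through a [Ptx] value.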
pub fn load_ptx_bin(
self: &Arc<Self>,
ptx: &[u8],
module_name: &str,
func_names: &[&'static str],
) -> Result<(), result::DriverError> {
self.bind_to_thread()?;
let cu_module = unsafe {
result::module::load_data(ptx.as_ptr() as *const _)
}?;
let mut functions = BTreeMap::new();
for &fn_name in func_names.iter() {
let fn_name_c = CString::new(fn_name).unwrap();
let cu_function = unsafe { result::module::get_function(cu_module, fn_name_c) }?;
functions.insert(fn_name, cu_function);
}
let module = CudaModule {
cu_module,
functions,
};
#[allow(unused_mut)]
{
let mut modules = self.modules.write();
#[cfg(not(feature = "no-std"))]
let mut modules = modules.unwrap();
modules.insert(module_name.into(), module);
}
Ok(())
}
}
use super::{CudaDevice, DriverError};
use crate::driver::result;
impl CudaDevice {
/// Binds the device to the calling thread. You must call this before
/// using the device on a separate thread!
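///
/// A minimal sketch of the intended pattern (mirroring the test below):
/// ```no_run
/// # use cudarc::driver::CudaDevice;
/// let dev = CudaDevice::new(0).unwrap();
/// let dev2 = dev.clone();
/// std::thread::spawn(move || {
///     dev2.bind_to_thread().unwrap();
///     // the device can now be used from this thread
///     dev2.alloc_zeros::<f32>(10).unwrap();
/// });
/// ```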
pub fn bind_to_thread(&self) -> Result<(), DriverError> {
unsafe { result::ctx::set_current(self.cu_primary_ctx) }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::thread;
#[test]
fn test_threading() {
let dev1 = CudaDevice::new(0).unwrap();
let dev2 = dev1.clone();
let thread1 = thread::spawn(move || {
dev1.bind_to_thread()?;
dev1.alloc_zeros::<f32>(10)
});
let thread2 = thread::spawn(move || {
dev2.bind_to_thread()?;
dev2.alloc_zeros::<f32>(10)
});
let _: crate::driver::CudaSlice<f32> = thread1.join().unwrap().unwrap();
let _: crate::driver::CudaSlice<f32> = thread2.join().unwrap().unwrap();
}
}
#include "cuda.h"
#include "cudaProfiler.h"
//! Safe abstractions over:
//! 1. [CUDA driver API](https://docs.nvidia.com/cuda/cuda-driver-api/index.html)
//! 2. [NVRTC API](https://docs.nvidia.com/cuda/nvrtc/index.html)
//! 3. [cuRAND API](https://docs.nvidia.com/cuda/curand/index.html)
//! 4. [cuBLAS API](https://docs.nvidia.com/cuda/cublas/index.html)
//!
//! # crate organization
//!
//! Each of the modules for the above is organized into three levels:
//! 1. A `safe` module which provides safe abstractions over the `result` module
//! 2. A `result` which is a thin wrapper around the `sys` module to ensure all functions return [Result]
//! 3. A `sys` module which contains the raw FFI bindings
//!
//! | API | Safe | Result | Sys |
//! | --- | --- | --- | --- |
//! | driver | [driver::safe] | [driver::result] | [driver::sys] |
//! | cublas | [cublas::safe] | [cublas::result] | [cublas::sys] |
//! | cublaslt | [cublaslt::safe] | [cublaslt::result] | [cublaslt::sys] |
//! | nvrtc | [nvrtc::safe] | [nvrtc::result] | [nvrtc::sys] |
//! | curand | [curand::safe] | [curand::result] | [curand::sys] |
//! | cudnn | - | [cudnn::result] | [cudnn::sys] |
//!
//! # Core Concepts
//!
//! At the core is the [driver] API, which exposes a bunch of structs, but the main ones are:
//!
//! 1. [`driver::CudaDevice`] is a handle to a specific device ordinal (e.g. 0, 1, 2, ...)
//! 2. [`driver::CudaSlice<T>`], which represents a [`Vec<T>`] on the device, can be allocated
//! using the aforementioned CudaDevice.
//!
//! Here is a table of similar concepts between CPU and Cuda:
//!
//! | Concept | CPU | Cuda |
//! | --- | --- | --- |
//! | Memory allocator | [`std::alloc::GlobalAlloc`] | [`driver::CudaDevice`] |
//! | List of values on heap | [`Vec<T>`] | [`driver::CudaSlice<T>`] |
//! | Slice | `&[T]` | [`driver::CudaView<T>`] |
//! | Mutable Slice | `&mut [T]` | [`driver::CudaViewMut<T>`] |
//! | Function | [`Fn`] | [`driver::CudaFunction`] |
//! | Calling a function | `my_function(a, b, c)` | [`driver::LaunchAsync::launch()`] |
//! | Thread | [`std::thread::Thread`] | [`driver::CudaStream`] |
//!
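//! A minimal sketch tying these together (device ordinal and sizes are arbitrary):
//!
//! ```no_run
//! # use cudarc::driver::CudaDevice;
//! let dev = CudaDevice::new(0).unwrap();
//! // "Vec<T> on the device": allocate zeros, or copy a host Vec over
//! let _zeros = dev.alloc_zeros::<f32>(4).unwrap();
//! let data = dev.htod_copy(vec![1.0f32, 2.0, 3.0, 4.0]).unwrap();
//! // copy back to the host to inspect the results
//! let host: Vec<f32> = dev.dtoh_sync_copy(&data).unwrap();
//! assert_eq!(host, vec![1.0, 2.0, 3.0, 4.0]);
//! ```
//!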
//! # Combining the different APIs
//!
//! All the highest level apis have been designed to work together.
//!
//! ## nvrtc
//!
//! [`nvrtc::compile_ptx()`] outputs a [`nvrtc::Ptx`], which can
//! be loaded into a device with [`driver::CudaDevice::load_ptx()`].
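//!
//! A minimal sketch of that handoff (module and kernel names are illustrative):
//!
//! ```no_run
//! # use cudarc::{driver::CudaDevice, nvrtc::compile_ptx};
//! let ptx = compile_ptx("extern \"C\" __global__ void noop() { }").unwrap();
//! let dev = CudaDevice::new(0).unwrap();
//! dev.load_ptx(ptx, "kernels", &["noop"]).unwrap();
//! ```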
//!
//! ## cublas
//!
//! [cublas::CudaBlas] can perform gemm operations using [`cublas::Gemm<T>`],
//! and [`cublas::Gemv<T>`]. Both of these traits can generically accept memory
//! allocated by the driver in the form of: [`driver::CudaSlice<T>`],
//! [`driver::CudaView<T>`], and [`driver::CudaViewMut<T>`].
//!
//! ## curand
//!
//! [curand::CudaRng] can fill a [`driver::CudaSlice<T>`] with random data, based on
//! one of its available distributions.
//!
//! # Combining safe/result/sys
//!
//! The result and sys levels are largely interchangeable for each API. However,
//! the safe apis don't necessarily allow you to mix in the result level. This
//! is to encourage going through the safe API when possible.
//!
//! **If you need some functionality that isn't present in the safe api, please
//! open a ticket.**
#![cfg_attr(feature = "no-std", no_std)]
#[cfg(feature = "no-std")]
extern crate alloc;
#[cfg(feature = "no-std")]
extern crate no_std_compat as std;
#[cfg(feature = "cublas")]
pub mod cublas;
#[cfg(feature = "cublaslt")]
pub mod cublaslt;
#[cfg(feature = "cudnn")]
pub mod cudnn;
#[cfg(feature = "curand")]
pub mod curand;
#[cfg(feature = "driver")]
pub mod driver;
#[cfg(feature = "nccl")]
pub mod nccl;
#[cfg(feature = "nvrtc")]
pub mod nvrtc;
pub mod types;
#!/bin/bash
set -exu
BINDGEN_EXTRA_CLANG_ARGS="-D__CUDA_BF16_TYPES_EXIST__" \
bindgen \
--allowlist-type="^nccl.*" \
--allowlist-var="^nccl.*" \
--allowlist-function="^nccl.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--use-core \
wrapper.h -- -I/usr/local/cuda/include -I/usr/lib/gcc/x86_64-linux-gnu/12/include/ \
> sys.rs
//! Wrappers around the [NCCL API](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html)
//! in three levels. See crate documentation for description of each.
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;
//! A thin wrapper around [sys] providing [Result]s with [NcclError].
use super::sys::{self, ncclCommSplit, ncclGetVersion, ncclRedOpCreatePreMulSum, ncclRedOpDestroy};
use std::mem::MaybeUninit;
/// Wrapper around [sys::ncclResult_t].
/// See [NCCL docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html?ncclresult-t)
#[derive(Clone, PartialEq, Eq)]
pub struct NcclError(pub sys::ncclResult_t);
impl std::fmt::Debug for NcclError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "NcclError")
}
}
#[derive(Clone, PartialEq, Eq)]
pub enum NcclStatus {
Success,
InProgress,
NumResults,
}
impl sys::ncclResult_t {
/// Transforms into a [Result] of [NcclError]
pub fn result(self) -> Result<NcclStatus, NcclError> {
match self {
sys::ncclResult_t::ncclSuccess => Ok(NcclStatus::Success),
sys::ncclResult_t::ncclInProgress => Ok(NcclStatus::InProgress),
sys::ncclResult_t::ncclNumResults => Ok(NcclStatus::NumResults),
_ => Err(NcclError(self)),
}
}
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?c.ncclCommFinalize)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_finalize(comm: sys::ncclComm_t) -> Result<NcclStatus, NcclError> {
sys::ncclCommFinalize(comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommdestroy)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_destroy(comm: sys::ncclComm_t) -> Result<NcclStatus, NcclError> {
sys::ncclCommDestroy(comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommabort)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_abort(comm: sys::ncclComm_t) -> Result<NcclStatus, NcclError> {
sys::ncclCommAbort(comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommcount)
pub fn get_nccl_version() -> Result<::core::ffi::c_int, NcclError> {
let mut version: ::core::ffi::c_int = 0;
unsafe {
ncclGetVersion(&mut version).result()?;
}
Ok(version)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclgetuniqueid)
pub fn get_uniqueid() -> Result<sys::ncclUniqueId, NcclError> {
let mut uniqueid = MaybeUninit::uninit();
Ok(unsafe {
sys::ncclGetUniqueId(uniqueid.as_mut_ptr()).result()?;
uniqueid.assume_init()
})
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcomminitrankconfig)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_init_rank_config(
comm: *mut sys::ncclComm_t,
nranks: ::core::ffi::c_int,
comm_id: sys::ncclUniqueId,
rank: ::core::ffi::c_int,
config: *mut sys::ncclConfig_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclCommInitRankConfig(comm, nranks, comm_id, rank, config).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcomminitrank)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_init_rank(
comm: *mut sys::ncclComm_t,
nranks: ::core::ffi::c_int,
comm_id: sys::ncclUniqueId,
rank: ::core::ffi::c_int,
) -> Result<NcclStatus, NcclError> {
sys::ncclCommInitRank(comm, nranks, comm_id, rank).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcomminitall)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_init_all(
comm: *mut sys::ncclComm_t,
ndev: ::core::ffi::c_int,
devlist: *const ::core::ffi::c_int,
) -> Result<NcclStatus, NcclError> {
sys::ncclCommInitAll(comm, ndev, devlist).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommsplit)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_split(
comm: sys::ncclComm_t,
color: ::core::ffi::c_int,
key: ::core::ffi::c_int,
newcomm: *mut sys::ncclComm_t,
config: *mut sys::ncclConfig_t,
) -> Result<NcclStatus, NcclError> {
ncclCommSplit(comm, color, key, newcomm, config).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommcount)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_count(comm: sys::ncclComm_t) -> Result<::core::ffi::c_int, NcclError> {
let mut count = 0;
sys::ncclCommCount(comm, &mut count).result()?;
Ok(count)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommcudevice)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_cu_device(comm: sys::ncclComm_t) -> Result<::core::ffi::c_int, NcclError> {
let mut device = 0;
sys::ncclCommCuDevice(comm, &mut device).result()?;
Ok(device)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html?ncclcommuserrank)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn comm_user_rank(comm: sys::ncclComm_t) -> Result<::core::ffi::c_int, NcclError> {
let mut rank = 0;
sys::ncclCommUserRank(comm, &mut rank).result()?;
Ok(rank)
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html?c.ncclRedOpCreatePreMulSum)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn reduce_op_create_pre_mul_sum(
op: *mut sys::ncclRedOp_t,
scalar: *mut ::core::ffi::c_void,
datatype: sys::ncclDataType_t,
residence: sys::ncclScalarResidence_t,
comm: sys::ncclComm_t,
) -> Result<NcclStatus, NcclError> {
ncclRedOpCreatePreMulSum(op, scalar, datatype, residence, comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/ops.html?ncclredopdestroy)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn reduce_op_destroy(
op: sys::ncclRedOp_t,
comm: sys::ncclComm_t,
) -> Result<NcclStatus, NcclError> {
ncclRedOpDestroy(op, comm).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclreduce)
/// # Safety
/// User is in charge of sending valid pointers.
#[allow(clippy::too_many_arguments)]
pub unsafe fn reduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
op: sys::ncclRedOp_t,
root: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclbroadcast)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn broadcast(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
root: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclBroadcast(sendbuff, recvbuff, count, datatype, root, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclallreduce)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn all_reduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
op: sys::ncclRedOp_t,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclreducescatter)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn reduce_scatter(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
recvcount: usize,
datatype: sys::ncclDataType_t,
op: sys::ncclRedOp_t,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html?ncclallgather)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn all_gather(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
sendcount: usize,
datatype: sys::ncclDataType_t,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclAllGather(sendbuff, recvbuff, sendcount, datatype, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html?ncclsend)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn send(
sendbuff: *const ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
peer: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclSend(sendbuff, count, datatype, peer, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html?ncclrecv)
/// # Safety
/// User is in charge of sending valid pointers.
pub unsafe fn recv(
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: sys::ncclDataType_t,
peer: ::core::ffi::c_int,
comm: sys::ncclComm_t,
stream: sys::cudaStream_t,
) -> Result<NcclStatus, NcclError> {
sys::ncclRecv(recvbuff, count, datatype, peer, comm, stream).result()
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/group.html?c.ncclGroupEnd)
pub fn group_end() -> Result<NcclStatus, NcclError> {
unsafe { sys::ncclGroupEnd().result() }
}
/// See [cuda docs](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/group.html?ncclgroupstart)
pub fn group_start() -> Result<NcclStatus, NcclError> {
unsafe { sys::ncclGroupStart().result() }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::driver::CudaDevice;
use std::ffi::c_void;
#[test]
fn single_thread() {
let n_devices = CudaDevice::count().unwrap() as usize;
let n = 2;
let mut devs = vec![];
let mut sendslices = vec![];
let mut recvslices = vec![];
for i in 0..n_devices {
let dev = CudaDevice::new(i).unwrap();
let slice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
sendslices.push(slice);
let slice = dev.alloc_zeros::<f32>(n).unwrap();
recvslices.push(slice);
devs.push(dev);
}
let mut comms = vec![std::ptr::null_mut(); n_devices];
let ordinals: Vec<_> = devs.iter().map(|d| d.ordinal as i32).collect();
unsafe {
comm_init_all(comms.as_mut_ptr(), n_devices as i32, ordinals.as_ptr()).unwrap();
group_start().unwrap();
for i in 0..n_devices {
// Very important to set the cuda context to this device.
let dev = CudaDevice::new(i).unwrap();
all_reduce(
sendslices[i].cu_device_ptr as *const c_void,
recvslices[i].cu_device_ptr as *mut c_void,
n,
sys::ncclDataType_t::ncclFloat32,
sys::ncclRedOp_t::ncclSum,
comms[i],
dev.stream as sys::cudaStream_t,
)
.unwrap();
}
group_end().unwrap();
}
for (i, recv) in recvslices.iter().enumerate() {
// Get the current device context
let dev = CudaDevice::new(i).unwrap();
let out = dev.dtoh_sync_copy(recv).unwrap();
assert_eq!(out, vec![(n_devices * (n_devices + 1)) as f32 / 2.0; n]);
}
}
#[test]
fn multi_thread() {
let n_devices = CudaDevice::count().unwrap() as usize;
let n = 2;
let comm_id = get_uniqueid().unwrap();
let threads: Vec<_> = (0..n_devices)
.map(|i| {
let n_devices = n_devices.clone();
std::thread::spawn(move || {
let dev = CudaDevice::new(i).unwrap();
let sendslice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
let recvslice = dev.alloc_zeros::<f32>(n).unwrap();
let mut comm = MaybeUninit::uninit();
unsafe {
comm_init_rank(comm.as_mut_ptr(), n_devices as i32, comm_id, i as i32)
.unwrap();
let comm = comm.assume_init();
use std::ffi::c_void;
all_reduce(
sendslice.cu_device_ptr as *const c_void,
recvslice.cu_device_ptr as *mut c_void,
n,
sys::ncclDataType_t::ncclFloat32,
sys::ncclRedOp_t::ncclSum,
comm,
dev.stream as sys::cudaStream_t,
)
.unwrap();
}
})
})
.collect();
for t in threads {
t.join().unwrap();
}
}
}
use super::{result, sys};
use crate::driver::{CudaDevice, CudaSlice};
use std::mem::MaybeUninit;
use std::ptr;
use std::{sync::Arc, vec, vec::Vec};
pub use result::{group_end, group_start};
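/// Safe handle around an `ncclComm_t` communicator, bound to a single [CudaDevice]
/// and carrying this rank's position (`rank`) within a communicator of `world_size` ranks.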
#[derive(Debug)]
pub struct Comm {
comm: sys::ncclComm_t,
device: Arc<CudaDevice>,
rank: usize,
world_size: usize,
}
#[derive(Debug, Clone, Copy)]
pub struct Id {
id: sys::ncclUniqueId,
}
impl Id {
pub fn new() -> Result<Self, result::NcclError> {
let id = result::get_uniqueid()?;
Ok(Self { id })
}
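/// Reconstructs an [Id] from the raw bytes of another rank's id (e.g. received over
/// the network), as obtained from [Id::internal()].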
pub fn uninit(internal: [::core::ffi::c_char; 128usize]) -> Self {
let id = sys::ncclUniqueId { internal };
Self { id }
}
pub fn internal(&self) -> &[::core::ffi::c_char; 128usize] {
&self.id.internal
}
}
pub enum ReduceOp {
Sum,
Prod,
Max,
Min,
Avg,
}
fn convert_to_nccl_reduce_op(op: &ReduceOp) -> sys::ncclRedOp_t {
match op {
ReduceOp::Sum => sys::ncclRedOp_t::ncclSum,
ReduceOp::Prod => sys::ncclRedOp_t::ncclProd,
ReduceOp::Max => sys::ncclRedOp_t::ncclMax,
ReduceOp::Min => sys::ncclRedOp_t::ncclMin,
ReduceOp::Avg => sys::ncclRedOp_t::ncclAvg,
}
}
impl Drop for Comm {
fn drop(&mut self) {
// TODO(thenerdstation): Should we instead do finalize then destroy?
unsafe {
result::comm_abort(self.comm).expect("Error when aborting Comm.");
}
}
}
pub trait NcclType {
fn as_nccl_type() -> sys::ncclDataType_t;
}
macro_rules! define_nccl_type {
($t:ty, $nccl_type:expr) => {
impl NcclType for $t {
fn as_nccl_type() -> sys::ncclDataType_t {
$nccl_type
}
}
};
}
define_nccl_type!(f32, sys::ncclDataType_t::ncclFloat32);
define_nccl_type!(f64, sys::ncclDataType_t::ncclFloat64);
define_nccl_type!(i8, sys::ncclDataType_t::ncclInt8);
define_nccl_type!(i32, sys::ncclDataType_t::ncclInt32);
define_nccl_type!(i64, sys::ncclDataType_t::ncclInt64);
define_nccl_type!(u8, sys::ncclDataType_t::ncclUint8);
define_nccl_type!(u32, sys::ncclDataType_t::ncclUint32);
define_nccl_type!(u64, sys::ncclDataType_t::ncclUint64);
define_nccl_type!(char, sys::ncclDataType_t::ncclUint8);
#[cfg(feature = "f16")]
define_nccl_type!(half::f16, sys::ncclDataType_t::ncclFloat16);
#[cfg(feature = "f16")]
define_nccl_type!(half::bf16, sys::ncclDataType_t::ncclBfloat16);
impl Comm {
/// Primitive to create new communication link on a single thread.
/// WARNING: You are likely to get limited throughput using a single core
/// to control multiple GPUs.
/// ```
/// # use cudarc::driver::safe::{CudaDevice};
/// # use cudarc::nccl::safe::{Comm, ReduceOp, group_start, group_end};
/// let n = 2;
/// let n_devices = CudaDevice::count().unwrap() as usize;
/// let devices : Vec<_> = (0..n_devices).flat_map(CudaDevice::new).collect();
/// let comms = Comm::from_devices(devices).unwrap();
/// group_start().unwrap();
/// (0..n_devices).for_each(|i| {
/// let comm = &comms[i];
/// let dev = comm.device();
/// let slice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
/// let mut slice_receive = dev.alloc_zeros::<f32>(n).unwrap();
/// comm.all_reduce(&slice, &mut slice_receive, &ReduceOp::Sum)
/// .unwrap();
/// });
/// group_end().unwrap();
/// ```
pub fn from_devices(devices: Vec<Arc<CudaDevice>>) -> Result<Vec<Self>, result::NcclError> {
let n_devices = devices.len();
let mut comms = vec![std::ptr::null_mut(); n_devices];
let ordinals: Vec<_> = devices.iter().map(|d| d.ordinal as i32).collect();
unsafe {
result::comm_init_all(comms.as_mut_ptr(), n_devices as i32, ordinals.as_ptr())?;
}
let comms: Vec<Self> = comms
.into_iter()
.zip(devices.iter().cloned())
.enumerate()
.map(|(rank, (comm, device))| Self {
comm,
device,
rank,
world_size: n_devices,
})
.collect();
Ok(comms)
}
pub fn device(&self) -> Arc<CudaDevice> {
self.device.clone()
}
pub fn rank(&self) -> usize {
self.rank
}
pub fn world_size(&self) -> usize {
self.world_size
}
/// Primitive to create new communication link on each process (threads are possible but not
/// recommended).
///
/// WARNING: If using threads, you are likely to get limited throughput using a single core
/// to control multiple GPUs. Cuda drivers effectively use a global mutex, thrashing
/// performance on multi-threaded, multi-GPU setups.
/// ```
/// # use cudarc::driver::safe::{CudaDevice};
/// # use cudarc::nccl::safe::{Comm, Id, ReduceOp};
/// let n = 2;
/// let n_devices = 1; // This is to simplify this example.
/// // Spawn this only on rank 0
/// let id = Id::new().unwrap();
/// // Send id.internal() to other ranks
/// // let id = Id::uninit(id.internal().clone()); on other ranks
///
/// let rank = 0;
/// let dev = CudaDevice::new(rank).unwrap();
/// let comm = Comm::from_rank(dev.clone(), rank, n_devices, id).unwrap();
/// let slice = dev.htod_copy(vec![(rank + 1) as f32 * 1.0; n]).unwrap();
/// let mut slice_receive = dev.alloc_zeros::<f32>(n).unwrap();
/// comm.all_reduce(&slice, &mut slice_receive, &ReduceOp::Sum)
/// .unwrap();
/// let out = dev.dtoh_sync_copy(&slice_receive).unwrap();
/// assert_eq!(out, vec![(n_devices * (n_devices + 1)) as f32 / 2.0; n]);
/// ```
pub fn from_rank(
device: Arc<CudaDevice>,
rank: usize,
world_size: usize,
id: Id,
) -> Result<Self, result::NcclError> {
let mut comm = MaybeUninit::uninit();
let comm = unsafe {
result::comm_init_rank(
comm.as_mut_ptr(),
world_size
.try_into()
.expect("World_size cannot be casted to i32"),
id.id,
rank.try_into().expect("Rank cannot be cast to i32"),
)?;
comm.assume_init()
};
Ok(Self {
comm,
device,
rank,
world_size,
})
}
}
impl Comm {
pub fn send<T: NcclType>(
&self,
data: &CudaSlice<T>,
peer: i32,
) -> Result<(), result::NcclError> {
unsafe {
result::send(
data.cu_device_ptr as *mut _,
data.len,
T::as_nccl_type(),
peer,
self.comm,
self.device.stream as *mut _,
)?;
}
Ok(())
}
pub fn recv<T: NcclType>(
&self,
buff: &mut CudaSlice<T>,
peer: i32,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::recv(
buff.cu_device_ptr as *mut _,
buff.len,
T::as_nccl_type(),
peer,
self.comm,
self.device.stream as *mut _,
)
}
}
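/// Broadcasts `sendbuff` on the `root` rank into `recvbuff` on every rank.
/// Non-root ranks may pass `None` for `sendbuff`; a null send pointer is used in that case.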
pub fn broadcast<T: NcclType>(
&self,
sendbuff: &Option<CudaSlice<T>>,
recvbuff: &mut CudaSlice<T>,
root: i32,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
let send_ptr = match sendbuff {
Some(buffer) => buffer.cu_device_ptr as *mut _,
None => ptr::null(),
};
result::broadcast(
send_ptr,
recvbuff.cu_device_ptr as *mut _,
recvbuff.len,
T::as_nccl_type(),
root,
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn all_gather<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::all_gather(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
sendbuff.len,
T::as_nccl_type(),
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn all_reduce<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
reduce_op: &ReduceOp,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::all_reduce(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
sendbuff.len,
T::as_nccl_type(),
convert_to_nccl_reduce_op(reduce_op),
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn reduce<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
reduce_op: &ReduceOp,
root: i32,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::reduce(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
sendbuff.len,
T::as_nccl_type(),
convert_to_nccl_reduce_op(reduce_op),
root,
self.comm,
self.device.stream as *mut _,
)
}
}
pub fn reduce_scatter<T: NcclType>(
&self,
sendbuff: &CudaSlice<T>,
recvbuff: &mut CudaSlice<T>,
reduce_op: &ReduceOp,
) -> Result<result::NcclStatus, result::NcclError> {
unsafe {
result::reduce_scatter(
sendbuff.cu_device_ptr as *mut _,
recvbuff.cu_device_ptr as *mut _,
recvbuff.len,
T::as_nccl_type(),
convert_to_nccl_reduce_op(reduce_op),
self.comm,
self.device.stream as *mut _,
)
}
}
}
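/// Wraps a block in `result::group_start()` / `result::group_end()`, so collective calls
/// issued inside it are submitted as a single NCCL group. Note the macro body refers to
/// `result` unqualified, so `cudarc::nccl::result` must be in scope at the call site.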
#[macro_export]
macro_rules! group {
($x:block) => {
unsafe {
result::group_start().unwrap();
}
$x
unsafe {
result::group_end().unwrap();
}
};
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_all_reduce() {
let n = 2;
let n_devices = CudaDevice::count().unwrap() as usize;
let id = Id::new().unwrap();
let threads: Vec<_> = (0..n_devices)
.map(|i| {
println!("III {i}");
std::thread::spawn(move || {
println!("Within thread {i}");
let dev = CudaDevice::new(i).unwrap();
let comm = Comm::from_rank(dev.clone(), i, n_devices, id).unwrap();
let slice = dev.htod_copy(vec![(i + 1) as f32 * 1.0; n]).unwrap();
let mut slice_receive = dev.alloc_zeros::<f32>(n).unwrap();
comm.all_reduce(&slice, &mut slice_receive, &ReduceOp::Sum)
.unwrap();
let out = dev.dtoh_sync_copy(&slice_receive).unwrap();
assert_eq!(out, vec![(n_devices * (n_devices + 1)) as f32 / 2.0; n]);
})
})
.collect();
for t in threads {
t.join().unwrap()
}
}
}
/* automatically generated by rust-bindgen 0.66.1 */
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CUstream_st {
_unused: [u8; 0],
}
pub type cudaStream_t = *mut CUstream_st;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct ncclComm {
_unused: [u8; 0],
}
pub type ncclComm_t = *mut ncclComm;
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct ncclUniqueId {
pub internal: [::core::ffi::c_char; 128usize],
}
#[test]
fn bindgen_test_layout_ncclUniqueId() {
const UNINIT: ::core::mem::MaybeUninit<ncclUniqueId> = ::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<ncclUniqueId>(),
128usize,
concat!("Size of: ", stringify!(ncclUniqueId))
);
assert_eq!(
::core::mem::align_of::<ncclUniqueId>(),
1usize,
concat!("Alignment of ", stringify!(ncclUniqueId))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).internal) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(ncclUniqueId),
"::",
stringify!(internal)
)
);
}
impl Default for ncclUniqueId {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclResult_t {
ncclSuccess = 0,
ncclUnhandledCudaError = 1,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6,
ncclInProgress = 7,
ncclNumResults = 8,
}
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct ncclConfig_v21700 {
pub size: usize,
pub magic: ::core::ffi::c_uint,
pub version: ::core::ffi::c_uint,
pub blocking: ::core::ffi::c_int,
pub cgaClusterSize: ::core::ffi::c_int,
pub minCTAs: ::core::ffi::c_int,
pub maxCTAs: ::core::ffi::c_int,
pub netName: *const ::core::ffi::c_char,
pub splitShare: ::core::ffi::c_int,
}
#[test]
fn bindgen_test_layout_ncclConfig_v21700() {
const UNINIT: ::core::mem::MaybeUninit<ncclConfig_v21700> = ::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<ncclConfig_v21700>(),
48usize,
concat!("Size of: ", stringify!(ncclConfig_v21700))
);
assert_eq!(
::core::mem::align_of::<ncclConfig_v21700>(),
8usize,
concat!("Alignment of ", stringify!(ncclConfig_v21700))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).size) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(size)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).magic) as usize - ptr as usize },
8usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(magic)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).version) as usize - ptr as usize },
12usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(version)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).blocking) as usize - ptr as usize },
16usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(blocking)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).cgaClusterSize) as usize - ptr as usize },
20usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(cgaClusterSize)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).minCTAs) as usize - ptr as usize },
24usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(minCTAs)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).maxCTAs) as usize - ptr as usize },
28usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(maxCTAs)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).netName) as usize - ptr as usize },
32usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(netName)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).splitShare) as usize - ptr as usize },
40usize,
concat!(
"Offset of field: ",
stringify!(ncclConfig_v21700),
"::",
stringify!(splitShare)
)
);
}
impl Default for ncclConfig_v21700 {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type ncclConfig_t = ncclConfig_v21700;
extern "C" {
pub fn ncclGetVersion(version: *mut ::core::ffi::c_int) -> ncclResult_t;
}
extern "C" {
pub fn ncclGetUniqueId(uniqueId: *mut ncclUniqueId) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommInitRankConfig(
comm: *mut ncclComm_t,
nranks: ::core::ffi::c_int,
commId: ncclUniqueId,
rank: ::core::ffi::c_int,
config: *mut ncclConfig_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommInitRank(
comm: *mut ncclComm_t,
nranks: ::core::ffi::c_int,
commId: ncclUniqueId,
rank: ::core::ffi::c_int,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommInitAll(
comm: *mut ncclComm_t,
ndev: ::core::ffi::c_int,
devlist: *const ::core::ffi::c_int,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommFinalize(comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommDestroy(comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommAbort(comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommSplit(
comm: ncclComm_t,
color: ::core::ffi::c_int,
key: ::core::ffi::c_int,
newcomm: *mut ncclComm_t,
config: *mut ncclConfig_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclGetErrorString(result: ncclResult_t) -> *const ::core::ffi::c_char;
}
extern "C" {
pub fn ncclGetLastError(comm: ncclComm_t) -> *const ::core::ffi::c_char;
}
extern "C" {
pub fn ncclCommGetAsyncError(comm: ncclComm_t, asyncError: *mut ncclResult_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommCount(comm: ncclComm_t, count: *mut ::core::ffi::c_int) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommCuDevice(comm: ncclComm_t, device: *mut ::core::ffi::c_int) -> ncclResult_t;
}
extern "C" {
pub fn ncclCommUserRank(comm: ncclComm_t, rank: *mut ::core::ffi::c_int) -> ncclResult_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclRedOp_dummy_t {
ncclNumOps_dummy = 5,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclRedOp_t {
ncclSum = 0,
ncclProd = 1,
ncclMax = 2,
ncclMin = 3,
ncclAvg = 4,
ncclNumOps = 5,
ncclMaxRedOp = 2147483647,
}
impl ncclDataType_t {
pub const ncclChar: ncclDataType_t = ncclDataType_t::ncclInt8;
}
impl ncclDataType_t {
pub const ncclInt: ncclDataType_t = ncclDataType_t::ncclInt32;
}
impl ncclDataType_t {
pub const ncclHalf: ncclDataType_t = ncclDataType_t::ncclFloat16;
}
impl ncclDataType_t {
pub const ncclFloat: ncclDataType_t = ncclDataType_t::ncclFloat32;
}
impl ncclDataType_t {
pub const ncclDouble: ncclDataType_t = ncclDataType_t::ncclFloat64;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclDataType_t {
ncclInt8 = 0,
ncclUint8 = 1,
ncclInt32 = 2,
ncclUint32 = 3,
ncclInt64 = 4,
ncclUint64 = 5,
ncclFloat16 = 6,
ncclFloat32 = 7,
ncclFloat64 = 8,
ncclBfloat16 = 9,
ncclNumTypes = 10,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum ncclScalarResidence_t {
ncclScalarDevice = 0,
ncclScalarHostImmediate = 1,
}
extern "C" {
pub fn ncclRedOpCreatePreMulSum(
op: *mut ncclRedOp_t,
scalar: *mut ::core::ffi::c_void,
datatype: ncclDataType_t,
residence: ncclScalarResidence_t,
comm: ncclComm_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclRedOpDestroy(op: ncclRedOp_t, comm: ncclComm_t) -> ncclResult_t;
}
extern "C" {
pub fn ncclReduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
op: ncclRedOp_t,
root: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclBcast(
buff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
root: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclBroadcast(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
root: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclAllReduce(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
op: ncclRedOp_t,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclReduceScatter(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
recvcount: usize,
datatype: ncclDataType_t,
op: ncclRedOp_t,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclAllGather(
sendbuff: *const ::core::ffi::c_void,
recvbuff: *mut ::core::ffi::c_void,
sendcount: usize,
datatype: ncclDataType_t,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclSend(
sendbuff: *const ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
peer: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclRecv(
recvbuff: *mut ::core::ffi::c_void,
count: usize,
datatype: ncclDataType_t,
peer: ::core::ffi::c_int,
comm: ncclComm_t,
stream: cudaStream_t,
) -> ncclResult_t;
}
extern "C" {
pub fn ncclGroupStart() -> ncclResult_t;
}
extern "C" {
pub fn ncclGroupEnd() -> ncclResult_t;
}
#include "nccl.h"
#!/bin/bash
set -exu
bindgen \
--whitelist-type="^nvrtc.*" \
--whitelist-function="^nvrtc.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs
//! Wrappers around the [Nvidia Runtime Compilation (nvrtc) API](https://docs.nvidia.com/cuda/nvrtc/index.html),
//! in three levels. See crate documentation for description of each.
//!
//! Call [compile_ptx()] or [compile_ptx_with_opts()].
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;