"vscode:/vscode.git/clone" did not exist on "27388377866ad23bd3edcec966b8127752c40175"
Commit 3ec8c534 authored by yongshk's avatar yongshk
Browse files

Initial commit

parents
This diff is collapsed.
#include "cublasLt.h"
#!/bin/bash
set -exu
bindgen \
--whitelist-type="^cudnn.*" \
--whitelist-var="^cudnn.*" \
--whitelist-function="^cudnn.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;
use crate::{
cudnn::{result, result::CudnnError, sys},
driver::{CudaDevice, CudaStream},
};
use std::{marker::PhantomData, sync::Arc};
/// A handle to cuDNN.
///
/// This type is not `Send`/`Sync` because of <https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#thread-safety>.
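///
/// # Example
///
/// A minimal sketch of creating a handle (assumes GPU ordinal 0 is available; mirrors the crate's tests):
///
/// ```ignore
/// let cudnn = Cudnn::new(CudaDevice::new(0).unwrap())?;
/// ```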
#[derive(Debug)]
pub struct Cudnn {
pub(crate) handle: sys::cudnnHandle_t,
pub(crate) device: Arc<CudaDevice>,
}
impl Cudnn {
    /// Creates a new cuDNN handle and sets the stream to the `device`'s stream.
pub fn new(device: Arc<CudaDevice>) -> Result<Arc<Self>, CudnnError> {
device.bind_to_thread().unwrap();
let handle = result::create_handle()?;
unsafe { result::set_stream(handle, device.stream as *mut _) }?;
Ok(Arc::new(Self { handle, device }))
}
    /// Sets the handle's current stream to either the given stream, or the device's default
    /// work stream.
///
/// # Safety
/// This is unsafe because you can end up scheduling multiple concurrent kernels that all
/// write to the same memory address.
pub unsafe fn set_stream(&self, opt_stream: Option<&CudaStream>) -> Result<(), CudnnError> {
match opt_stream {
Some(s) => result::set_stream(self.handle, s.stream as *mut _),
None => result::set_stream(self.handle, self.device.stream as *mut _),
}
}
}
impl Drop for Cudnn {
fn drop(&mut self) {
let handle = std::mem::replace(&mut self.handle, std::ptr::null_mut());
if !handle.is_null() {
unsafe { result::destroy_handle(handle) }.unwrap();
}
}
}
/// Maps a Rust type to a [sys::cudnnDataType_t].
pub trait CudnnDataType {
const DATA_TYPE: sys::cudnnDataType_t;
    /// Certain cuDNN data types have a scaling parameter (usually called alpha/beta)
    /// that is a different type. See the [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#scaling-parameters)
    /// for more info; for example, `f16` uses an `f32` scaling parameter.
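    ///
    /// A minimal sketch (assumes the `f16` feature is enabled):
    ///
    /// ```ignore
    /// // For half::f16, Scalar = f32, so `alpha` here is an f32.
    /// let alpha = half::f16::from_f32(1.0).into_scaling_parameter();
    /// ```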
type Scalar;
/// Converts the type into the scaling parameter type. See [Self::Scalar].
fn into_scaling_parameter(self) -> Self::Scalar;
}
macro_rules! cudnn_dtype {
($RustTy:ty, $CudnnTy:tt) => {
impl CudnnDataType for $RustTy {
const DATA_TYPE: sys::cudnnDataType_t = sys::cudnnDataType_t::$CudnnTy;
type Scalar = Self;
fn into_scaling_parameter(self) -> Self::Scalar {
self
}
}
};
}
cudnn_dtype!(f32, CUDNN_DATA_FLOAT);
cudnn_dtype!(f64, CUDNN_DATA_DOUBLE);
cudnn_dtype!(i8, CUDNN_DATA_INT8);
cudnn_dtype!(i32, CUDNN_DATA_INT32);
cudnn_dtype!(i64, CUDNN_DATA_INT64);
cudnn_dtype!(u8, CUDNN_DATA_UINT8);
cudnn_dtype!(bool, CUDNN_DATA_BOOLEAN);
#[cfg(feature = "f16")]
impl CudnnDataType for half::f16 {
const DATA_TYPE: sys::cudnnDataType_t = sys::cudnnDataType_t::CUDNN_DATA_HALF;
type Scalar = f32;
fn into_scaling_parameter(self) -> Self::Scalar {
self.to_f32()
}
}
#[cfg(feature = "f16")]
impl CudnnDataType for half::bf16 {
const DATA_TYPE: sys::cudnnDataType_t = sys::cudnnDataType_t::CUDNN_DATA_BFLOAT16;
type Scalar = f32;
fn into_scaling_parameter(self) -> Self::Scalar {
self.to_f32()
}
}
/// A descriptor of a tensor. Create with:
/// 1. [`Cudnn::create_4d_tensor()`]
/// 2. [`Cudnn::create_4d_tensor_ex()`]
/// 3. [`Cudnn::create_nd_tensor()`]
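///
/// A minimal sketch (assumes an existing `cudnn` handle; the NCHW shape is illustrative):
///
/// ```ignore
/// let x = cudnn.create_4d_tensor::<f32>(
///     sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
///     [1, 3, 224, 224],
/// )?;
/// ```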
#[derive(Debug)]
pub struct TensorDescriptor<T> {
pub(crate) desc: sys::cudnnTensorDescriptor_t,
#[allow(unused)]
pub(crate) handle: Arc<Cudnn>,
pub(crate) marker: PhantomData<T>,
}
impl Cudnn {
/// Creates a 4d tensor descriptor.
pub fn create_4d_tensor<T: CudnnDataType>(
self: &Arc<Cudnn>,
format: sys::cudnnTensorFormat_t,
dims: [std::ffi::c_int; 4],
) -> Result<TensorDescriptor<T>, CudnnError> {
let desc = result::create_tensor_descriptor()?;
let desc = TensorDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe { result::set_tensor4d_descriptor(desc.desc, format, T::DATA_TYPE, dims) }?;
Ok(desc)
}
    /// Creates a 4d tensor descriptor from explicit dimensions and strides.
pub fn create_4d_tensor_ex<T: CudnnDataType>(
self: &Arc<Cudnn>,
dims: [std::ffi::c_int; 4],
strides: [std::ffi::c_int; 4],
) -> Result<TensorDescriptor<T>, CudnnError> {
let desc = result::create_tensor_descriptor()?;
let desc = TensorDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe { result::set_tensor4d_descriptor_ex(desc.desc, T::DATA_TYPE, dims, strides) }?;
Ok(desc)
}
    /// Creates an n-dimensional (at least 4d) tensor descriptor.
pub fn create_nd_tensor<T: CudnnDataType>(
self: &Arc<Cudnn>,
dims: &[std::ffi::c_int],
strides: &[std::ffi::c_int],
) -> Result<TensorDescriptor<T>, CudnnError> {
assert!(dims.len() >= 4);
assert_eq!(dims.len(), strides.len());
let desc = result::create_tensor_descriptor()?;
let desc = TensorDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe {
result::set_tensornd_descriptor(
desc.desc,
T::DATA_TYPE,
dims.len() as std::ffi::c_int,
dims.as_ptr(),
strides.as_ptr(),
)
}?;
Ok(desc)
}
}
impl<T> Drop for TensorDescriptor<T> {
fn drop(&mut self) {
let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
if !desc.is_null() {
unsafe { result::destroy_tensor_descriptor(desc) }.unwrap()
}
}
}
//! Safe wrappers around cuDNN.
//!
//! # Convolutions
//!
//! 1. Allocate tensor descriptors with [`Cudnn::create_4d_tensor()`]
//! 2. Allocate filter descriptors with [`Cudnn::create_4d_filter()`]
//! 3. Allocate conv descriptors with [`Cudnn::create_conv2d()`]
//! 4. Instantiate one of the following algorithms with the descriptors:
//! a. [`Conv2dForward`]
//! b. [`Conv2dBackwardData`] for computing gradient of image
//! c. [`Conv2dBackwardFilter`] for computing gradient of filters
//! 5. Call the `pick_algorithm` method of the struct. Specify the number of options to compare with a const generic.
//! 6. Call the `get_workspace_size` method of the struct.
//! 7. Re-allocate the workspace to the appropriate size.
//! 8. Call the `launch` method of the struct.
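//!
//! A minimal sketch of that flow (the shapes are illustrative; the workspace and
//! launch arguments are elided because they depend on the chosen algorithm):
//!
//! ```ignore
//! let cudnn = Cudnn::new(CudaDevice::new(0).unwrap())?;
//! let conv = cudnn.create_conv2d::<f32>(
//!     [0; 2], [1; 2], [1; 2],
//!     sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
//! )?;
//! let x = cudnn.create_4d_tensor::<f32>(sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, [1, 3, 32, 32])?;
//! let w = cudnn.create_4d_filter::<f32>(sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, [8, 3, 3, 3])?;
//! let y = cudnn.create_4d_tensor::<f32>(sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, [1, 8, 30, 30])?;
//! let op = Conv2dForward { conv: &conv, x: &x, w: &w, y: &y };
//! let algo = op.pick_algorithm()?;
//! // Steps 6-8: query the workspace size for `algo`, allocate a device workspace of
//! // that size, then call `op.launch(..)` with the data buffers.
//! ```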
//!
//! # Reductions
//!
//! 1. Allocate a reduction descriptor with [`Cudnn::create_reduction_no_indices()`]
//!    or [`Cudnn::create_reduction_flat_indices()`]
//! 2. Allocate tensor descriptors for the input and output with [`Cudnn::create_nd_tensor()`]
//! 3. Instantiate a [`ReduceTensor`] with the descriptors
//! 4. Call the `get_workspace_size` method of the struct and allocate a workspace of that size
//! 5. Call the `launch` method of the struct
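//!
//! A minimal sketch (mirrors `test_reduction` below; `dev`, `a`, and `c` are a device
//! handle and device buffers for the input and output):
//!
//! ```ignore
//! let reduce = cudnn.create_reduction_no_indices::<f32>(
//!     sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
//!     sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
//! )?;
//! let a_desc = cudnn.create_nd_tensor::<f32>(&[1, 1, 2, 3], &[0, 6, 3, 1])?;
//! let c_desc = cudnn.create_nd_tensor::<f32>(&[1, 1, 1, 1], &[0, 0, 0, 1])?;
//! let op = ReduceTensor { reduce: &reduce, a: &a_desc, c: &c_desc };
//! let workspace_size = op.get_workspace_size()?;
//! let mut workspace = dev.alloc_zeros::<u8>(workspace_size).unwrap();
//! unsafe { op.launch(&mut workspace, (1.0, 0.0), &a, &mut c) }?;
//! ```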
mod conv;
mod core;
mod reduce;
pub use self::conv::{
Conv2dBackwardData, Conv2dBackwardFilter, Conv2dDescriptor, Conv2dForward, FilterDescriptor,
};
pub use self::core::{Cudnn, CudnnDataType, TensorDescriptor};
pub use self::reduce::{FlatIndices, NoIndices, ReduceTensor, ReductionDescriptor};
pub use super::result::CudnnError;
#[cfg(test)]
mod tests {
use super::*;
use crate::{cudnn, driver::CudaDevice};
#[test]
fn test_create_descriptors() -> Result<(), CudnnError> {
let cudnn = Cudnn::new(CudaDevice::new(0).unwrap())?;
let _ = cudnn.create_4d_tensor_ex::<f32>([1, 2, 3, 4], [24, 12, 4, 1])?;
let _ = cudnn.create_nd_tensor::<f64>(&[1, 2, 3, 4, 5, 6], &[720, 360, 120, 30, 6, 1])?;
let _ = cudnn.create_4d_filter::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[3, 3, 3, 3],
)?;
let _ = cudnn.create_reduction_flat_indices::<f32>(
cudnn::sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
cudnn::sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
)?;
let _ = cudnn.create_reduction_no_indices::<f32>(
cudnn::sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
cudnn::sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
)?;
Ok(())
}
#[test]
fn test_conv_pick_algorithms() -> Result<(), CudnnError> {
let cudnn = Cudnn::new(CudaDevice::new(0).unwrap())?;
let conv = cudnn.create_conv2d::<f32>(
[0; 2],
[1; 2],
[1; 2],
cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
)?;
let x = cudnn.create_4d_tensor::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[100, 128, 224, 224],
)?;
let filter = cudnn.create_4d_filter::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[256, 128, 3, 3],
)?;
let y = cudnn.create_4d_tensor::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[100, 256, 222, 222],
)?;
{
let op = Conv2dForward {
conv: &conv,
x: &x,
w: &filter,
y: &y,
};
let algo = op.pick_algorithm()?;
assert_eq!(
algo,
cudnn::sys::cudnnConvolutionFwdAlgo_t::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
);
}
{
let op = Conv2dBackwardData {
conv: &conv,
dx: &x,
w: &filter,
dy: &y,
};
let algo = op.pick_algorithm()?;
assert_eq!(
algo,
cudnn::sys::cudnnConvolutionBwdDataAlgo_t::CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
);
}
{
let op = Conv2dBackwardFilter {
conv: &conv,
x: &x,
dw: &filter,
dy: &y,
};
let algo = op.pick_algorithm()?;
assert_eq!(
algo,
cudnn::sys::cudnnConvolutionBwdFilterAlgo_t::CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
);
}
Ok(())
}
#[test]
fn test_reduction() {
let dev = CudaDevice::new(0).unwrap();
let a = dev
.htod_copy(std::vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0])
.unwrap();
let mut c = dev.alloc_zeros::<f32>(1).unwrap();
let cudnn = Cudnn::new(dev.clone()).unwrap();
let reduce = cudnn
.create_reduction_no_indices::<f32>(
cudnn::sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
cudnn::sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
)
.unwrap();
let a_desc = cudnn
.create_nd_tensor::<f32>(&[1, 1, 2, 3], &[0, 6, 3, 1])
.unwrap();
let c_desc = cudnn
.create_nd_tensor::<f32>(&[1, 1, 1, 1], &[0, 0, 0, 1])
.unwrap();
let op = ReduceTensor {
reduce: &reduce,
a: &a_desc,
c: &c_desc,
};
let workspace_size = op.get_workspace_size().unwrap();
let mut workspace = dev.alloc_zeros::<u8>(workspace_size).unwrap();
unsafe { op.launch(&mut workspace, (1.0, 0.0), &a, &mut c) }.unwrap();
let c_host = dev.sync_reclaim(c).unwrap();
assert_eq!(c_host.len(), 1);
assert_eq!(c_host[0], 21.0);
}
}
#include "cudnn.h"
#!/bin/bash
set -exu
bindgen \
--whitelist-type="^curand.*" \
--whitelist-var="^curand.*" \
--whitelist-function="^curand.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs