/* automatically generated by rust-bindgen 0.68.1 */
pub type __off_t = ::core::ffi::c_long;
pub type __off64_t = ::core::ffi::c_long;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CUstream_st {
_unused: [u8; 0],
}
pub type cudaStream_t = *mut CUstream_st;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudaDataType_t {
CUDA_R_16F = 2,
CUDA_C_16F = 6,
CUDA_R_16BF = 14,
CUDA_C_16BF = 15,
CUDA_R_32F = 0,
CUDA_C_32F = 4,
CUDA_R_64F = 1,
CUDA_C_64F = 5,
CUDA_R_4I = 16,
CUDA_C_4I = 17,
CUDA_R_4U = 18,
CUDA_C_4U = 19,
CUDA_R_8I = 3,
CUDA_C_8I = 7,
CUDA_R_8U = 8,
CUDA_C_8U = 9,
CUDA_R_16I = 20,
CUDA_C_16I = 21,
CUDA_R_16U = 22,
CUDA_C_16U = 23,
CUDA_R_32I = 10,
CUDA_C_32I = 11,
CUDA_R_32U = 12,
CUDA_C_32U = 13,
CUDA_R_64I = 24,
CUDA_C_64I = 25,
CUDA_R_64U = 26,
CUDA_C_64U = 27,
CUDA_R_8F_E4M3 = 28,
CUDA_R_8F_E5M2 = 29,
}
pub use self::cudaDataType_t as cudaDataType;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum libraryPropertyType_t {
MAJOR_VERSION = 0,
MINOR_VERSION = 1,
PATCH_LEVEL = 2,
}
pub use self::libraryPropertyType_t as libraryPropertyType;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasStatus_t {
CUBLAS_STATUS_SUCCESS = 0,
CUBLAS_STATUS_NOT_INITIALIZED = 1,
CUBLAS_STATUS_ALLOC_FAILED = 3,
CUBLAS_STATUS_INVALID_VALUE = 7,
CUBLAS_STATUS_ARCH_MISMATCH = 8,
CUBLAS_STATUS_MAPPING_ERROR = 11,
CUBLAS_STATUS_EXECUTION_FAILED = 13,
CUBLAS_STATUS_INTERNAL_ERROR = 14,
CUBLAS_STATUS_NOT_SUPPORTED = 15,
CUBLAS_STATUS_LICENSE_ERROR = 16,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasComputeType_t {
CUBLAS_COMPUTE_16F = 64,
CUBLAS_COMPUTE_16F_PEDANTIC = 65,
CUBLAS_COMPUTE_32F = 68,
CUBLAS_COMPUTE_32F_PEDANTIC = 69,
CUBLAS_COMPUTE_32F_FAST_16F = 74,
CUBLAS_COMPUTE_32F_FAST_16BF = 75,
CUBLAS_COMPUTE_32F_FAST_TF32 = 77,
CUBLAS_COMPUTE_64F = 70,
CUBLAS_COMPUTE_64F_PEDANTIC = 71,
CUBLAS_COMPUTE_32I = 72,
CUBLAS_COMPUTE_32I_PEDANTIC = 73,
}
pub type FILE = _IO_FILE;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct _IO_marker {
_unused: [u8; 0],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct _IO_codecvt {
_unused: [u8; 0],
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct _IO_wide_data {
_unused: [u8; 0],
}
pub type _IO_lock_t = ::core::ffi::c_void;
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct _IO_FILE {
pub _flags: ::core::ffi::c_int,
pub _IO_read_ptr: *mut ::core::ffi::c_char,
pub _IO_read_end: *mut ::core::ffi::c_char,
pub _IO_read_base: *mut ::core::ffi::c_char,
pub _IO_write_base: *mut ::core::ffi::c_char,
pub _IO_write_ptr: *mut ::core::ffi::c_char,
pub _IO_write_end: *mut ::core::ffi::c_char,
pub _IO_buf_base: *mut ::core::ffi::c_char,
pub _IO_buf_end: *mut ::core::ffi::c_char,
pub _IO_save_base: *mut ::core::ffi::c_char,
pub _IO_backup_base: *mut ::core::ffi::c_char,
pub _IO_save_end: *mut ::core::ffi::c_char,
pub _markers: *mut _IO_marker,
pub _chain: *mut _IO_FILE,
pub _fileno: ::core::ffi::c_int,
pub _flags2: ::core::ffi::c_int,
pub _old_offset: __off_t,
pub _cur_column: ::core::ffi::c_ushort,
pub _vtable_offset: ::core::ffi::c_schar,
pub _shortbuf: [::core::ffi::c_char; 1usize],
pub _lock: *mut _IO_lock_t,
pub _offset: __off64_t,
pub _codecvt: *mut _IO_codecvt,
pub _wide_data: *mut _IO_wide_data,
pub _freeres_list: *mut _IO_FILE,
pub _freeres_buf: *mut ::core::ffi::c_void,
pub __pad5: usize,
pub _mode: ::core::ffi::c_int,
pub _unused2: [::core::ffi::c_char; 20usize],
}
#[test]
fn bindgen_test_layout__IO_FILE() {
const UNINIT: ::core::mem::MaybeUninit<_IO_FILE> = ::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<_IO_FILE>(),
216usize,
concat!("Size of: ", stringify!(_IO_FILE))
);
assert_eq!(
::core::mem::align_of::<_IO_FILE>(),
8usize,
concat!("Alignment of ", stringify!(_IO_FILE))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._flags) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_flags)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_read_ptr) as usize - ptr as usize },
8usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_read_ptr)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_read_end) as usize - ptr as usize },
16usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_read_end)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_read_base) as usize - ptr as usize },
24usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_read_base)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_write_base) as usize - ptr as usize },
32usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_write_base)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_write_ptr) as usize - ptr as usize },
40usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_write_ptr)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_write_end) as usize - ptr as usize },
48usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_write_end)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_buf_base) as usize - ptr as usize },
56usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_buf_base)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_buf_end) as usize - ptr as usize },
64usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_buf_end)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_save_base) as usize - ptr as usize },
72usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_save_base)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_backup_base) as usize - ptr as usize },
80usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_backup_base)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._IO_save_end) as usize - ptr as usize },
88usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_IO_save_end)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._markers) as usize - ptr as usize },
96usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_markers)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._chain) as usize - ptr as usize },
104usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_chain)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._fileno) as usize - ptr as usize },
112usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_fileno)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._flags2) as usize - ptr as usize },
116usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_flags2)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._old_offset) as usize - ptr as usize },
120usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_old_offset)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._cur_column) as usize - ptr as usize },
128usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_cur_column)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._vtable_offset) as usize - ptr as usize },
130usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_vtable_offset)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._shortbuf) as usize - ptr as usize },
131usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_shortbuf)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._lock) as usize - ptr as usize },
136usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_lock)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._offset) as usize - ptr as usize },
144usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_offset)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._codecvt) as usize - ptr as usize },
152usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_codecvt)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._wide_data) as usize - ptr as usize },
160usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_wide_data)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._freeres_list) as usize - ptr as usize },
168usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_freeres_list)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._freeres_buf) as usize - ptr as usize },
176usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_freeres_buf)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).__pad5) as usize - ptr as usize },
184usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(__pad5)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._mode) as usize - ptr as usize },
192usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_mode)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr)._unused2) as usize - ptr as usize },
196usize,
concat!(
"Offset of field: ",
stringify!(_IO_FILE),
"::",
stringify!(_unused2)
)
);
}
impl Default for _IO_FILE {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cublasLtContext {
_unused: [u8; 0],
}
pub type cublasLtHandle_t = *mut cublasLtContext;
extern "C" {
pub fn cublasLtCreate(lightHandle: *mut cublasLtHandle_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtDestroy(lightHandle: cublasLtHandle_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtGetStatusName(status: cublasStatus_t) -> *const ::core::ffi::c_char;
}
extern "C" {
pub fn cublasLtGetStatusString(status: cublasStatus_t) -> *const ::core::ffi::c_char;
}
extern "C" {
pub fn cublasLtGetVersion() -> usize;
}
extern "C" {
pub fn cublasLtGetCudartVersion() -> usize;
}
extern "C" {
pub fn cublasLtGetProperty(
type_: libraryPropertyType,
value: *mut ::core::ffi::c_int,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtHeuristicsCacheGetCapacity(capacity: *mut usize) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtHeuristicsCacheSetCapacity(capacity: usize) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtDisableCpuInstructionsSetMask(mask: ::core::ffi::c_uint) -> ::core::ffi::c_uint;
}
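// A minimal usage sketch (not part of the generated bindings above): query
// the cuBLASLt library version and the CUDA runtime version it was compiled
// against. Both calls are infallible by signature, but they still require
// the shared library to be loadable at runtime, hence the unsafe.
pub unsafe fn example_lt_versions() -> (usize, usize) {
    (cublasLtGetVersion(), cublasLtGetCudartVersion())
}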
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cublasLtMatrixLayoutOpaque_t {
pub data: [u64; 8usize],
}
#[test]
fn bindgen_test_layout_cublasLtMatrixLayoutOpaque_t() {
const UNINIT: ::core::mem::MaybeUninit<cublasLtMatrixLayoutOpaque_t> =
::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<cublasLtMatrixLayoutOpaque_t>(),
64usize,
concat!("Size of: ", stringify!(cublasLtMatrixLayoutOpaque_t))
);
assert_eq!(
::core::mem::align_of::<cublasLtMatrixLayoutOpaque_t>(),
8usize,
concat!("Alignment of ", stringify!(cublasLtMatrixLayoutOpaque_t))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatrixLayoutOpaque_t),
"::",
stringify!(data)
)
);
}
pub type cublasLtMatrixLayout_t = *mut cublasLtMatrixLayoutOpaque_t;
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cublasLtMatmulAlgo_t {
pub data: [u64; 8usize],
}
#[test]
fn bindgen_test_layout_cublasLtMatmulAlgo_t() {
const UNINIT: ::core::mem::MaybeUninit<cublasLtMatmulAlgo_t> =
::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<cublasLtMatmulAlgo_t>(),
64usize,
concat!("Size of: ", stringify!(cublasLtMatmulAlgo_t))
);
assert_eq!(
::core::mem::align_of::<cublasLtMatmulAlgo_t>(),
8usize,
concat!("Alignment of ", stringify!(cublasLtMatmulAlgo_t))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulAlgo_t),
"::",
stringify!(data)
)
);
}
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cublasLtMatmulDescOpaque_t {
pub data: [u64; 32usize],
}
#[test]
fn bindgen_test_layout_cublasLtMatmulDescOpaque_t() {
const UNINIT: ::core::mem::MaybeUninit<cublasLtMatmulDescOpaque_t> =
::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<cublasLtMatmulDescOpaque_t>(),
256usize,
concat!("Size of: ", stringify!(cublasLtMatmulDescOpaque_t))
);
assert_eq!(
::core::mem::align_of::<cublasLtMatmulDescOpaque_t>(),
8usize,
concat!("Alignment of ", stringify!(cublasLtMatmulDescOpaque_t))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulDescOpaque_t),
"::",
stringify!(data)
)
);
}
pub type cublasLtMatmulDesc_t = *mut cublasLtMatmulDescOpaque_t;
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cublasLtMatrixTransformDescOpaque_t {
pub data: [u64; 8usize],
}
#[test]
fn bindgen_test_layout_cublasLtMatrixTransformDescOpaque_t() {
const UNINIT: ::core::mem::MaybeUninit<cublasLtMatrixTransformDescOpaque_t> =
::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<cublasLtMatrixTransformDescOpaque_t>(),
64usize,
concat!("Size of: ", stringify!(cublasLtMatrixTransformDescOpaque_t))
);
assert_eq!(
::core::mem::align_of::<cublasLtMatrixTransformDescOpaque_t>(),
8usize,
concat!(
"Alignment of ",
stringify!(cublasLtMatrixTransformDescOpaque_t)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatrixTransformDescOpaque_t),
"::",
stringify!(data)
)
);
}
pub type cublasLtMatrixTransformDesc_t = *mut cublasLtMatrixTransformDescOpaque_t;
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cublasLtMatmulPreferenceOpaque_t {
pub data: [u64; 8usize],
}
#[test]
fn bindgen_test_layout_cublasLtMatmulPreferenceOpaque_t() {
const UNINIT: ::core::mem::MaybeUninit<cublasLtMatmulPreferenceOpaque_t> =
::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<cublasLtMatmulPreferenceOpaque_t>(),
64usize,
concat!("Size of: ", stringify!(cublasLtMatmulPreferenceOpaque_t))
);
assert_eq!(
::core::mem::align_of::<cublasLtMatmulPreferenceOpaque_t>(),
8usize,
concat!(
"Alignment of ",
stringify!(cublasLtMatmulPreferenceOpaque_t)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulPreferenceOpaque_t),
"::",
stringify!(data)
)
);
}
pub type cublasLtMatmulPreference_t = *mut cublasLtMatmulPreferenceOpaque_t;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulTile_t {
CUBLASLT_MATMUL_TILE_UNDEFINED = 0,
CUBLASLT_MATMUL_TILE_8x8 = 1,
CUBLASLT_MATMUL_TILE_8x16 = 2,
CUBLASLT_MATMUL_TILE_16x8 = 3,
CUBLASLT_MATMUL_TILE_8x32 = 4,
CUBLASLT_MATMUL_TILE_16x16 = 5,
CUBLASLT_MATMUL_TILE_32x8 = 6,
CUBLASLT_MATMUL_TILE_8x64 = 7,
CUBLASLT_MATMUL_TILE_16x32 = 8,
CUBLASLT_MATMUL_TILE_32x16 = 9,
CUBLASLT_MATMUL_TILE_64x8 = 10,
CUBLASLT_MATMUL_TILE_32x32 = 11,
CUBLASLT_MATMUL_TILE_32x64 = 12,
CUBLASLT_MATMUL_TILE_64x32 = 13,
CUBLASLT_MATMUL_TILE_32x128 = 14,
CUBLASLT_MATMUL_TILE_64x64 = 15,
CUBLASLT_MATMUL_TILE_128x32 = 16,
CUBLASLT_MATMUL_TILE_64x128 = 17,
CUBLASLT_MATMUL_TILE_128x64 = 18,
CUBLASLT_MATMUL_TILE_64x256 = 19,
CUBLASLT_MATMUL_TILE_128x128 = 20,
CUBLASLT_MATMUL_TILE_256x64 = 21,
CUBLASLT_MATMUL_TILE_64x512 = 22,
CUBLASLT_MATMUL_TILE_128x256 = 23,
CUBLASLT_MATMUL_TILE_256x128 = 24,
CUBLASLT_MATMUL_TILE_512x64 = 25,
CUBLASLT_MATMUL_TILE_64x96 = 26,
CUBLASLT_MATMUL_TILE_96x64 = 27,
CUBLASLT_MATMUL_TILE_96x128 = 28,
CUBLASLT_MATMUL_TILE_128x160 = 29,
CUBLASLT_MATMUL_TILE_160x128 = 30,
CUBLASLT_MATMUL_TILE_192x128 = 31,
CUBLASLT_MATMUL_TILE_128x192 = 32,
CUBLASLT_MATMUL_TILE_128x96 = 33,
CUBLASLT_MATMUL_TILE_32x256 = 34,
CUBLASLT_MATMUL_TILE_256x32 = 35,
CUBLASLT_MATMUL_TILE_END = 36,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulStages_t {
CUBLASLT_MATMUL_STAGES_UNDEFINED = 0,
CUBLASLT_MATMUL_STAGES_16x1 = 1,
CUBLASLT_MATMUL_STAGES_16x2 = 2,
CUBLASLT_MATMUL_STAGES_16x3 = 3,
CUBLASLT_MATMUL_STAGES_16x4 = 4,
CUBLASLT_MATMUL_STAGES_16x5 = 5,
CUBLASLT_MATMUL_STAGES_16x6 = 6,
CUBLASLT_MATMUL_STAGES_32x1 = 7,
CUBLASLT_MATMUL_STAGES_32x2 = 8,
CUBLASLT_MATMUL_STAGES_32x3 = 9,
CUBLASLT_MATMUL_STAGES_32x4 = 10,
CUBLASLT_MATMUL_STAGES_32x5 = 11,
CUBLASLT_MATMUL_STAGES_32x6 = 12,
CUBLASLT_MATMUL_STAGES_64x1 = 13,
CUBLASLT_MATMUL_STAGES_64x2 = 14,
CUBLASLT_MATMUL_STAGES_64x3 = 15,
CUBLASLT_MATMUL_STAGES_64x4 = 16,
CUBLASLT_MATMUL_STAGES_64x5 = 17,
CUBLASLT_MATMUL_STAGES_64x6 = 18,
CUBLASLT_MATMUL_STAGES_128x1 = 19,
CUBLASLT_MATMUL_STAGES_128x2 = 20,
CUBLASLT_MATMUL_STAGES_128x3 = 21,
CUBLASLT_MATMUL_STAGES_128x4 = 22,
CUBLASLT_MATMUL_STAGES_128x5 = 23,
CUBLASLT_MATMUL_STAGES_128x6 = 24,
CUBLASLT_MATMUL_STAGES_32x10 = 25,
CUBLASLT_MATMUL_STAGES_8x4 = 26,
CUBLASLT_MATMUL_STAGES_16x10 = 27,
CUBLASLT_MATMUL_STAGES_8x5 = 28,
CUBLASLT_MATMUL_STAGES_8x3 = 31,
CUBLASLT_MATMUL_STAGES_8xAUTO = 32,
CUBLASLT_MATMUL_STAGES_16xAUTO = 33,
CUBLASLT_MATMUL_STAGES_32xAUTO = 34,
CUBLASLT_MATMUL_STAGES_64xAUTO = 35,
CUBLASLT_MATMUL_STAGES_128xAUTO = 36,
CUBLASLT_MATMUL_STAGES_END = 37,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtClusterShape_t {
CUBLASLT_CLUSTER_SHAPE_AUTO = 0,
CUBLASLT_CLUSTER_SHAPE_1x1x1 = 2,
CUBLASLT_CLUSTER_SHAPE_2x1x1 = 3,
CUBLASLT_CLUSTER_SHAPE_4x1x1 = 4,
CUBLASLT_CLUSTER_SHAPE_1x2x1 = 5,
CUBLASLT_CLUSTER_SHAPE_2x2x1 = 6,
CUBLASLT_CLUSTER_SHAPE_4x2x1 = 7,
CUBLASLT_CLUSTER_SHAPE_1x4x1 = 8,
CUBLASLT_CLUSTER_SHAPE_2x4x1 = 9,
CUBLASLT_CLUSTER_SHAPE_4x4x1 = 10,
CUBLASLT_CLUSTER_SHAPE_8x1x1 = 11,
CUBLASLT_CLUSTER_SHAPE_1x8x1 = 12,
CUBLASLT_CLUSTER_SHAPE_8x2x1 = 13,
CUBLASLT_CLUSTER_SHAPE_2x8x1 = 14,
CUBLASLT_CLUSTER_SHAPE_16x1x1 = 15,
CUBLASLT_CLUSTER_SHAPE_1x16x1 = 16,
CUBLASLT_CLUSTER_SHAPE_3x1x1 = 17,
CUBLASLT_CLUSTER_SHAPE_5x1x1 = 18,
CUBLASLT_CLUSTER_SHAPE_6x1x1 = 19,
CUBLASLT_CLUSTER_SHAPE_7x1x1 = 20,
CUBLASLT_CLUSTER_SHAPE_9x1x1 = 21,
CUBLASLT_CLUSTER_SHAPE_10x1x1 = 22,
CUBLASLT_CLUSTER_SHAPE_11x1x1 = 23,
CUBLASLT_CLUSTER_SHAPE_12x1x1 = 24,
CUBLASLT_CLUSTER_SHAPE_13x1x1 = 25,
CUBLASLT_CLUSTER_SHAPE_14x1x1 = 26,
CUBLASLT_CLUSTER_SHAPE_15x1x1 = 27,
CUBLASLT_CLUSTER_SHAPE_3x2x1 = 28,
CUBLASLT_CLUSTER_SHAPE_5x2x1 = 29,
CUBLASLT_CLUSTER_SHAPE_6x2x1 = 30,
CUBLASLT_CLUSTER_SHAPE_7x2x1 = 31,
CUBLASLT_CLUSTER_SHAPE_1x3x1 = 32,
CUBLASLT_CLUSTER_SHAPE_2x3x1 = 33,
CUBLASLT_CLUSTER_SHAPE_3x3x1 = 34,
CUBLASLT_CLUSTER_SHAPE_4x3x1 = 35,
CUBLASLT_CLUSTER_SHAPE_5x3x1 = 36,
CUBLASLT_CLUSTER_SHAPE_3x4x1 = 37,
CUBLASLT_CLUSTER_SHAPE_1x5x1 = 38,
CUBLASLT_CLUSTER_SHAPE_2x5x1 = 39,
CUBLASLT_CLUSTER_SHAPE_3x5x1 = 40,
CUBLASLT_CLUSTER_SHAPE_1x6x1 = 41,
CUBLASLT_CLUSTER_SHAPE_2x6x1 = 42,
CUBLASLT_CLUSTER_SHAPE_1x7x1 = 43,
CUBLASLT_CLUSTER_SHAPE_2x7x1 = 44,
CUBLASLT_CLUSTER_SHAPE_1x9x1 = 45,
CUBLASLT_CLUSTER_SHAPE_1x10x1 = 46,
CUBLASLT_CLUSTER_SHAPE_1x11x1 = 47,
CUBLASLT_CLUSTER_SHAPE_1x12x1 = 48,
CUBLASLT_CLUSTER_SHAPE_1x13x1 = 49,
CUBLASLT_CLUSTER_SHAPE_1x14x1 = 50,
CUBLASLT_CLUSTER_SHAPE_1x15x1 = 51,
CUBLASLT_CLUSTER_SHAPE_END = 52,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulInnerShape_t {
CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED = 0,
CUBLASLT_MATMUL_INNER_SHAPE_MMA884 = 1,
CUBLASLT_MATMUL_INNER_SHAPE_MMA1684 = 2,
CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 = 3,
CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 = 4,
CUBLASLT_MATMUL_INNER_SHAPE_END = 5,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtPointerMode_t {
CUBLASLT_POINTER_MODE_HOST = 0,
CUBLASLT_POINTER_MODE_DEVICE = 1,
CUBLASLT_POINTER_MODE_DEVICE_VECTOR = 2,
CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO = 3,
CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST = 4,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtPointerModeMask_t {
CUBLASLT_POINTER_MODE_MASK_HOST = 1,
CUBLASLT_POINTER_MODE_MASK_DEVICE = 2,
CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR = 4,
CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO = 8,
CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST = 16,
}
pub type cublasLtNumericalImplFlags_t = u64;
extern "C" {
pub fn cublasLtMatmul(
lightHandle: cublasLtHandle_t,
computeDesc: cublasLtMatmulDesc_t,
alpha: *const ::core::ffi::c_void,
A: *const ::core::ffi::c_void,
Adesc: cublasLtMatrixLayout_t,
B: *const ::core::ffi::c_void,
Bdesc: cublasLtMatrixLayout_t,
beta: *const ::core::ffi::c_void,
C: *const ::core::ffi::c_void,
Cdesc: cublasLtMatrixLayout_t,
D: *mut ::core::ffi::c_void,
Ddesc: cublasLtMatrixLayout_t,
algo: *const cublasLtMatmulAlgo_t,
workspace: *mut ::core::ffi::c_void,
workspaceSizeInBytes: usize,
stream: cudaStream_t,
) -> cublasStatus_t;
}
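// A usage sketch (not part of the generated bindings): the typical
// cublasLtMatmul call sequence for a plain FP32 GEMM with column-major
// layouts and D aliasing C. The dimensions and device pointers are
// assumptions supplied by the caller; a null algo pointer with a zero-sized
// workspace lets the library choose an algorithm internally. Error checking
// on the setup calls is elided for brevity.
pub unsafe fn example_sgemm(
    a: *const ::core::ffi::c_void, // m x k device buffer
    b: *const ::core::ffi::c_void, // k x n device buffer
    c: *mut ::core::ffi::c_void,   // m x n device buffer (also the output D)
    m: u64,
    n: u64,
    k: u64,
) -> cublasStatus_t {
    let mut handle: cublasLtHandle_t = ::core::ptr::null_mut();
    cublasLtCreate(&mut handle);
    let mut desc: cublasLtMatmulDesc_t = ::core::ptr::null_mut();
    cublasLtMatmulDescCreate(
        &mut desc,
        cublasComputeType_t::CUBLAS_COMPUTE_32F,
        cudaDataType_t::CUDA_R_32F,
    );
    let mut a_layout: cublasLtMatrixLayout_t = ::core::ptr::null_mut();
    let mut b_layout: cublasLtMatrixLayout_t = ::core::ptr::null_mut();
    let mut c_layout: cublasLtMatrixLayout_t = ::core::ptr::null_mut();
    cublasLtMatrixLayoutCreate(&mut a_layout, cudaDataType_t::CUDA_R_32F, m, k, m as i64);
    cublasLtMatrixLayoutCreate(&mut b_layout, cudaDataType_t::CUDA_R_32F, k, n, k as i64);
    cublasLtMatrixLayoutCreate(&mut c_layout, cudaDataType_t::CUDA_R_32F, m, n, m as i64);
    let (alpha, beta) = (1.0f32, 0.0f32);
    let status = cublasLtMatmul(
        handle,
        desc,
        &alpha as *const f32 as *const _,
        a,
        a_layout,
        b,
        b_layout,
        &beta as *const f32 as *const _,
        c as *const _,
        c_layout,
        c,
        c_layout,
        ::core::ptr::null(),     // algo: let cuBLASLt decide
        ::core::ptr::null_mut(), // workspace
        0,                       // workspaceSizeInBytes
        ::core::ptr::null_mut(), // default stream
    );
    cublasLtMatrixLayoutDestroy(a_layout);
    cublasLtMatrixLayoutDestroy(b_layout);
    cublasLtMatrixLayoutDestroy(c_layout);
    cublasLtMatmulDescDestroy(desc);
    cublasLtDestroy(handle);
    status
}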
extern "C" {
pub fn cublasLtMatrixTransform(
lightHandle: cublasLtHandle_t,
transformDesc: cublasLtMatrixTransformDesc_t,
alpha: *const ::core::ffi::c_void,
A: *const ::core::ffi::c_void,
Adesc: cublasLtMatrixLayout_t,
beta: *const ::core::ffi::c_void,
B: *const ::core::ffi::c_void,
Bdesc: cublasLtMatrixLayout_t,
C: *mut ::core::ffi::c_void,
Cdesc: cublasLtMatrixLayout_t,
stream: cudaStream_t,
) -> cublasStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtOrder_t {
CUBLASLT_ORDER_COL = 0,
CUBLASLT_ORDER_ROW = 1,
CUBLASLT_ORDER_COL32 = 2,
CUBLASLT_ORDER_COL4_4R2_8C = 3,
CUBLASLT_ORDER_COL32_2R_4R4 = 4,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatrixLayoutAttribute_t {
CUBLASLT_MATRIX_LAYOUT_TYPE = 0,
CUBLASLT_MATRIX_LAYOUT_ORDER = 1,
CUBLASLT_MATRIX_LAYOUT_ROWS = 2,
CUBLASLT_MATRIX_LAYOUT_COLS = 3,
CUBLASLT_MATRIX_LAYOUT_LD = 4,
CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT = 5,
CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET = 6,
CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET = 7,
}
extern "C" {
pub fn cublasLtMatrixLayoutInit_internal(
matLayout: cublasLtMatrixLayout_t,
size: usize,
type_: cudaDataType,
rows: u64,
cols: u64,
ld: i64,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixLayoutCreate(
matLayout: *mut cublasLtMatrixLayout_t,
type_: cudaDataType,
rows: u64,
cols: u64,
ld: i64,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixLayoutDestroy(matLayout: cublasLtMatrixLayout_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixLayoutSetAttribute(
matLayout: cublasLtMatrixLayout_t,
attr: cublasLtMatrixLayoutAttribute_t,
buf: *const ::core::ffi::c_void,
sizeInBytes: usize,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixLayoutGetAttribute(
matLayout: cublasLtMatrixLayout_t,
attr: cublasLtMatrixLayoutAttribute_t,
buf: *mut ::core::ffi::c_void,
sizeInBytes: usize,
sizeWritten: *mut usize,
) -> cublasStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulDescAttributes_t {
CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = 0,
CUBLASLT_MATMUL_DESC_SCALE_TYPE = 1,
CUBLASLT_MATMUL_DESC_POINTER_MODE = 2,
CUBLASLT_MATMUL_DESC_TRANSA = 3,
CUBLASLT_MATMUL_DESC_TRANSB = 4,
CUBLASLT_MATMUL_DESC_TRANSC = 5,
CUBLASLT_MATMUL_DESC_FILL_MODE = 6,
CUBLASLT_MATMUL_DESC_EPILOGUE = 7,
CUBLASLT_MATMUL_DESC_BIAS_POINTER = 8,
CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE = 10,
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER = 11,
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD = 12,
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE = 13,
CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE = 14,
CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET = 15,
CUBLASLT_MATMUL_DESC_A_SCALE_POINTER = 17,
CUBLASLT_MATMUL_DESC_B_SCALE_POINTER = 18,
CUBLASLT_MATMUL_DESC_C_SCALE_POINTER = 19,
CUBLASLT_MATMUL_DESC_D_SCALE_POINTER = 20,
CUBLASLT_MATMUL_DESC_AMAX_D_POINTER = 21,
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE = 22,
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER = 23,
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER = 24,
CUBLASLT_MATMUL_DESC_FAST_ACCUM = 25,
CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE = 26,
CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS = 27,
CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS = 28,
CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER = 29,
CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER = 30,
}
extern "C" {
pub fn cublasLtMatmulDescInit_internal(
matmulDesc: cublasLtMatmulDesc_t,
size: usize,
computeType: cublasComputeType_t,
scaleType: cudaDataType_t,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulDescCreate(
matmulDesc: *mut cublasLtMatmulDesc_t,
computeType: cublasComputeType_t,
scaleType: cudaDataType_t,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulDescDestroy(matmulDesc: cublasLtMatmulDesc_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulDescSetAttribute(
matmulDesc: cublasLtMatmulDesc_t,
attr: cublasLtMatmulDescAttributes_t,
buf: *const ::core::ffi::c_void,
sizeInBytes: usize,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulDescGetAttribute(
matmulDesc: cublasLtMatmulDesc_t,
attr: cublasLtMatmulDescAttributes_t,
buf: *mut ::core::ffi::c_void,
sizeInBytes: usize,
sizeWritten: *mut usize,
) -> cublasStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatrixTransformDescAttributes_t {
CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE = 0,
CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE = 1,
CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA = 2,
CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB = 3,
}
extern "C" {
pub fn cublasLtMatrixTransformDescInit_internal(
transformDesc: cublasLtMatrixTransformDesc_t,
size: usize,
scaleType: cudaDataType,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixTransformDescCreate(
transformDesc: *mut cublasLtMatrixTransformDesc_t,
scaleType: cudaDataType,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixTransformDescDestroy(
transformDesc: cublasLtMatrixTransformDesc_t,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixTransformDescSetAttribute(
transformDesc: cublasLtMatrixTransformDesc_t,
attr: cublasLtMatrixTransformDescAttributes_t,
buf: *const ::core::ffi::c_void,
sizeInBytes: usize,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatrixTransformDescGetAttribute(
transformDesc: cublasLtMatrixTransformDesc_t,
attr: cublasLtMatrixTransformDescAttributes_t,
buf: *mut ::core::ffi::c_void,
sizeInBytes: usize,
sizeWritten: *mut usize,
) -> cublasStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtReductionScheme_t {
CUBLASLT_REDUCTION_SCHEME_NONE = 0,
CUBLASLT_REDUCTION_SCHEME_INPLACE = 1,
CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE = 2,
CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE = 4,
CUBLASLT_REDUCTION_SCHEME_MASK = 7,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtEpilogue_t {
CUBLASLT_EPILOGUE_DEFAULT = 1,
CUBLASLT_EPILOGUE_RELU = 2,
CUBLASLT_EPILOGUE_RELU_AUX = 130,
CUBLASLT_EPILOGUE_BIAS = 4,
CUBLASLT_EPILOGUE_RELU_BIAS = 6,
CUBLASLT_EPILOGUE_RELU_AUX_BIAS = 134,
CUBLASLT_EPILOGUE_DRELU = 136,
CUBLASLT_EPILOGUE_DRELU_BGRAD = 152,
CUBLASLT_EPILOGUE_GELU = 32,
CUBLASLT_EPILOGUE_GELU_AUX = 160,
CUBLASLT_EPILOGUE_GELU_BIAS = 36,
CUBLASLT_EPILOGUE_GELU_AUX_BIAS = 164,
CUBLASLT_EPILOGUE_DGELU = 192,
CUBLASLT_EPILOGUE_DGELU_BGRAD = 208,
CUBLASLT_EPILOGUE_BGRADA = 256,
CUBLASLT_EPILOGUE_BGRADB = 512,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulSearch_t {
CUBLASLT_SEARCH_BEST_FIT = 0,
CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID = 1,
CUBLASLT_SEARCH_RESERVED_02 = 2,
CUBLASLT_SEARCH_RESERVED_03 = 3,
CUBLASLT_SEARCH_RESERVED_04 = 4,
CUBLASLT_SEARCH_RESERVED_05 = 5,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulPreferenceAttributes_t {
CUBLASLT_MATMUL_PREF_SEARCH_MODE = 0,
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES = 1,
CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK = 3,
CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES = 5,
CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES = 6,
CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES = 7,
CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES = 8,
CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT = 9,
CUBLASLT_MATMUL_PREF_IMPL_MASK = 12,
}
extern "C" {
pub fn cublasLtMatmulPreferenceInit_internal(
pref: cublasLtMatmulPreference_t,
size: usize,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulPreferenceCreate(pref: *mut cublasLtMatmulPreference_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulPreferenceDestroy(pref: cublasLtMatmulPreference_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulPreferenceSetAttribute(
pref: cublasLtMatmulPreference_t,
attr: cublasLtMatmulPreferenceAttributes_t,
buf: *const ::core::ffi::c_void,
sizeInBytes: usize,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulPreferenceGetAttribute(
pref: cublasLtMatmulPreference_t,
attr: cublasLtMatmulPreferenceAttributes_t,
buf: *mut ::core::ffi::c_void,
sizeInBytes: usize,
sizeWritten: *mut usize,
) -> cublasStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)]
pub struct cublasLtMatmulHeuristicResult_t {
pub algo: cublasLtMatmulAlgo_t,
pub workspaceSize: usize,
pub state: cublasStatus_t,
pub wavesCount: f32,
pub reserved: [::core::ffi::c_int; 4usize],
}
#[test]
fn bindgen_test_layout_cublasLtMatmulHeuristicResult_t() {
const UNINIT: ::core::mem::MaybeUninit<cublasLtMatmulHeuristicResult_t> =
::core::mem::MaybeUninit::uninit();
let ptr = UNINIT.as_ptr();
assert_eq!(
::core::mem::size_of::<cublasLtMatmulHeuristicResult_t>(),
96usize,
concat!("Size of: ", stringify!(cublasLtMatmulHeuristicResult_t))
);
assert_eq!(
::core::mem::align_of::<cublasLtMatmulHeuristicResult_t>(),
8usize,
concat!("Alignment of ", stringify!(cublasLtMatmulHeuristicResult_t))
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).algo) as usize - ptr as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulHeuristicResult_t),
"::",
stringify!(algo)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).workspaceSize) as usize - ptr as usize },
64usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulHeuristicResult_t),
"::",
stringify!(workspaceSize)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).state) as usize - ptr as usize },
72usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulHeuristicResult_t),
"::",
stringify!(state)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).wavesCount) as usize - ptr as usize },
76usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulHeuristicResult_t),
"::",
stringify!(wavesCount)
)
);
assert_eq!(
unsafe { ::core::ptr::addr_of!((*ptr).reserved) as usize - ptr as usize },
80usize,
concat!(
"Offset of field: ",
stringify!(cublasLtMatmulHeuristicResult_t),
"::",
stringify!(reserved)
)
);
}
impl Default for cublasLtMatmulHeuristicResult_t {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
extern "C" {
pub fn cublasLtMatmulAlgoGetHeuristic(
lightHandle: cublasLtHandle_t,
operationDesc: cublasLtMatmulDesc_t,
Adesc: cublasLtMatrixLayout_t,
Bdesc: cublasLtMatrixLayout_t,
Cdesc: cublasLtMatrixLayout_t,
Ddesc: cublasLtMatrixLayout_t,
preference: cublasLtMatmulPreference_t,
requestedAlgoCount: ::core::ffi::c_int,
heuristicResultsArray: *mut cublasLtMatmulHeuristicResult_t,
returnAlgoCount: *mut ::core::ffi::c_int,
) -> cublasStatus_t;
}
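// A sketch of heuristics-driven algorithm selection: cap the workspace via a
// preference object, request the single best candidate, and hand result.algo
// to cublasLtMatmul. The 4 MiB workspace cap is an arbitrary illustrative
// choice; real code would also inspect result.state.
pub unsafe fn example_pick_algo(
    handle: cublasLtHandle_t,
    desc: cublasLtMatmulDesc_t,
    a_layout: cublasLtMatrixLayout_t,
    b_layout: cublasLtMatrixLayout_t,
    c_layout: cublasLtMatrixLayout_t,
) -> Option<cublasLtMatmulHeuristicResult_t> {
    let mut pref: cublasLtMatmulPreference_t = ::core::ptr::null_mut();
    cublasLtMatmulPreferenceCreate(&mut pref);
    let max_workspace: usize = 4 << 20;
    cublasLtMatmulPreferenceSetAttribute(
        pref,
        cublasLtMatmulPreferenceAttributes_t::CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
        &max_workspace as *const usize as *const _,
        ::core::mem::size_of::<usize>(),
    );
    let mut result = cublasLtMatmulHeuristicResult_t::default();
    let mut returned: ::core::ffi::c_int = 0;
    let status = cublasLtMatmulAlgoGetHeuristic(
        handle, desc, a_layout, b_layout, c_layout, c_layout, pref, 1, &mut result, &mut returned,
    );
    cublasLtMatmulPreferenceDestroy(pref);
    if status == cublasStatus_t::CUBLAS_STATUS_SUCCESS && returned > 0 {
        Some(result)
    } else {
        None
    }
}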
extern "C" {
pub fn cublasLtMatmulAlgoGetIds(
lightHandle: cublasLtHandle_t,
computeType: cublasComputeType_t,
scaleType: cudaDataType_t,
Atype: cudaDataType_t,
Btype: cudaDataType_t,
Ctype: cudaDataType_t,
Dtype: cudaDataType_t,
requestedAlgoCount: ::core::ffi::c_int,
algoIdsArray: *mut ::core::ffi::c_int,
returnAlgoCount: *mut ::core::ffi::c_int,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulAlgoInit(
lightHandle: cublasLtHandle_t,
computeType: cublasComputeType_t,
scaleType: cudaDataType_t,
Atype: cudaDataType_t,
Btype: cudaDataType_t,
Ctype: cudaDataType_t,
Dtype: cudaDataType_t,
algoId: ::core::ffi::c_int,
algo: *mut cublasLtMatmulAlgo_t,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulAlgoCheck(
lightHandle: cublasLtHandle_t,
operationDesc: cublasLtMatmulDesc_t,
Adesc: cublasLtMatrixLayout_t,
Bdesc: cublasLtMatrixLayout_t,
Cdesc: cublasLtMatrixLayout_t,
Ddesc: cublasLtMatrixLayout_t,
algo: *const cublasLtMatmulAlgo_t,
result: *mut cublasLtMatmulHeuristicResult_t,
) -> cublasStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulAlgoCapAttributes_t {
CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0,
CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1,
CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT = 2,
CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT = 3,
CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT = 4,
CUBLASLT_ALGO_CAP_UPLO_SUPPORT = 5,
CUBLASLT_ALGO_CAP_TILE_IDS = 6,
CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX = 7,
CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER = 10,
CUBLASLT_ALGO_CAP_POINTER_MODE_MASK = 11,
CUBLASLT_ALGO_CAP_EPILOGUE_MASK = 12,
CUBLASLT_ALGO_CAP_STAGES_IDS = 13,
CUBLASLT_ALGO_CAP_LD_NEGATIVE = 14,
CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS = 15,
CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES = 16,
CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES = 17,
CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES = 18,
CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES = 19,
CUBLASLT_ALGO_CAP_ATOMIC_SYNC = 20,
}
extern "C" {
pub fn cublasLtMatmulAlgoCapGetAttribute(
algo: *const cublasLtMatmulAlgo_t,
attr: cublasLtMatmulAlgoCapAttributes_t,
buf: *mut ::core::ffi::c_void,
sizeInBytes: usize,
sizeWritten: *mut usize,
) -> cublasStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cublasLtMatmulAlgoConfigAttributes_t {
CUBLASLT_ALGO_CONFIG_ID = 0,
CUBLASLT_ALGO_CONFIG_TILE_ID = 1,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM = 2,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME = 3,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING = 4,
CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION = 5,
CUBLASLT_ALGO_CONFIG_STAGES_ID = 6,
CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID = 7,
CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID = 8,
}
extern "C" {
pub fn cublasLtMatmulAlgoConfigSetAttribute(
algo: *mut cublasLtMatmulAlgo_t,
attr: cublasLtMatmulAlgoConfigAttributes_t,
buf: *const ::core::ffi::c_void,
sizeInBytes: usize,
) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtMatmulAlgoConfigGetAttribute(
algo: *const cublasLtMatmulAlgo_t,
attr: cublasLtMatmulAlgoConfigAttributes_t,
buf: *mut ::core::ffi::c_void,
sizeInBytes: usize,
sizeWritten: *mut usize,
) -> cublasStatus_t;
}
pub type cublasLtLoggerCallback_t = ::core::option::Option<
unsafe extern "C" fn(
logLevel: ::core::ffi::c_int,
functionName: *const ::core::ffi::c_char,
message: *const ::core::ffi::c_char,
),
>;
extern "C" {
pub fn cublasLtLoggerSetCallback(callback: cublasLtLoggerCallback_t) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtLoggerSetFile(file: *mut FILE) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtLoggerOpenFile(logFile: *const ::core::ffi::c_char) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtLoggerSetLevel(level: ::core::ffi::c_int) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtLoggerSetMask(mask: ::core::ffi::c_int) -> cublasStatus_t;
}
extern "C" {
pub fn cublasLtLoggerForceDisable() -> cublasStatus_t;
}
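// A sketch of a logger callback matching cublasLtLoggerCallback_t: it must be
// an `unsafe extern "C"` fn, and both strings arrive as NUL-terminated C
// strings (convert with CStr before use). It would be registered via
// cublasLtLoggerSetCallback(Some(example_logger)).
pub unsafe extern "C" fn example_logger(
    log_level: ::core::ffi::c_int,
    _function_name: *const ::core::ffi::c_char,
    _message: *const ::core::ffi::c_char,
) {
    // A real callback would forward these to a logging facility; printing is
    // left out here to keep the sketch no_std-friendly.
    let _ = log_level;
}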
#include "cublasLt.h"
#!/bin/bash
set -exu
bindgen \
--whitelist-type="^cudnn.*" \
--whitelist-var="^cudnn.*" \
--whitelist-function="^cudnn.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;
//! A thin wrapper around [sys] providing [Result]s with [CudnnError].
use std::mem::MaybeUninit;
use super::sys;
pub type CudnnResult<T> = Result<T, CudnnError>;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct CudnnError(pub sys::cudnnStatus_t);
impl sys::cudnnStatus_t {
/// Transforms into a [Result], mapping any non-success status to a [CudnnError].
pub fn result(self) -> Result<(), CudnnError> {
match self {
sys::cudnnStatus_t::CUDNN_STATUS_SUCCESS => Ok(()),
_ => Err(CudnnError(self)),
}
}
}
#[cfg(feature = "std")]
impl std::fmt::Display for CudnnError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self:?}")
}
}
#[cfg(feature = "std")]
impl std::error::Error for CudnnError {}
/// This function returns the version number of the cuDNN library. It returns the CUDNN_VERSION define present in the cudnn.h header file.
///
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetVersion)
pub fn get_version() -> usize {
unsafe { sys::cudnnGetVersion() }
}
/// The same version of a given cuDNN library can be compiled against different CUDA toolkit versions.
/// This routine returns the CUDA toolkit version that the currently used cuDNN library has been compiled against.
///
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetCudartVersion)
pub fn get_cudart_version() -> usize {
unsafe { sys::cudnnGetCudartVersion() }
}
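// A usage sketch: log which cuDNN build is linked and the CUDA toolkit it was
// compiled against; handy when diagnosing version-mismatch failures. Gated on
// the `std` feature like the Display impl above.
#[cfg(feature = "std")]
pub fn example_log_versions() {
    println!(
        "cuDNN {} (compiled against cudart {})",
        get_version(),
        get_cudart_version()
    );
}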
/// Runs all *VersionCheck functions.
pub fn version_check() -> Result<(), CudnnError> {
unsafe {
sys::cudnnAdvInferVersionCheck().result()?;
sys::cudnnAdvTrainVersionCheck().result()?;
sys::cudnnCnnInferVersionCheck().result()?;
sys::cudnnCnnTrainVersionCheck().result()?;
sys::cudnnOpsInferVersionCheck().result()?;
sys::cudnnOpsTrainVersionCheck().result()?;
}
Ok(())
}
/// Creates a handle to the cuDNN library. See
/// [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnCreate)
pub fn create_handle() -> Result<sys::cudnnHandle_t, CudnnError> {
let mut handle = MaybeUninit::uninit();
unsafe {
sys::cudnnCreate(handle.as_mut_ptr()).result()?;
Ok(handle.assume_init())
}
}
/// Destroys a handle previously created with [create_handle()]. See
/// [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDestroy)
///
/// # Safety
///
/// `handle` must not have been freed already.
pub unsafe fn destroy_handle(handle: sys::cudnnHandle_t) -> Result<(), CudnnError> {
sys::cudnnDestroy(handle).result()
}
/// Sets the stream cuDNN will use. See
/// [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetStream)
///
/// # Safety
///
/// `handle` and `stream` must be valid.
pub unsafe fn set_stream(
handle: sys::cudnnHandle_t,
stream: sys::cudaStream_t,
) -> Result<(), CudnnError> {
sys::cudnnSetStream(handle, stream).result()
}
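// A minimal sketch tying the handle helpers together: create a handle, bind
// it to the default (null) stream, and destroy it again. Assumes the cuDNN
// shared library is present at runtime.
pub fn example_handle_roundtrip() -> Result<(), CudnnError> {
    let handle = create_handle()?;
    unsafe {
        set_stream(handle, std::ptr::null_mut())?;
        destroy_handle(handle)
    }
}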
/// Allocates a new tensor descriptor.
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnCreateTensorDescriptor)
pub fn create_tensor_descriptor() -> Result<sys::cudnnTensorDescriptor_t, CudnnError> {
let mut desc = MaybeUninit::uninit();
unsafe {
sys::cudnnCreateTensorDescriptor(desc.as_mut_ptr()).result()?;
Ok(desc.assume_init())
}
}
/// Sets data on a tensor descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetTensor4dDescriptor)
///
/// # Safety
/// `tensor_desc` must have been created with [create_tensor_descriptor], and
/// NOT freed by [destroy_tensor_descriptor]
pub unsafe fn set_tensor4d_descriptor(
tensor_desc: sys::cudnnTensorDescriptor_t,
format: sys::cudnnTensorFormat_t,
data_type: sys::cudnnDataType_t,
[n, c, h, w]: [std::ffi::c_int; 4],
) -> Result<(), CudnnError> {
sys::cudnnSetTensor4dDescriptor(tensor_desc, format, data_type, n, c, h, w).result()
}
/// Sets data on a tensor descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetTensor4dDescriptorEx)
///
/// # Safety
/// `tensor_desc` must have been created with [create_tensor_descriptor], and
/// NOT freed by [destroy_tensor_descriptor]
pub unsafe fn set_tensor4d_descriptor_ex(
tensor_desc: sys::cudnnTensorDescriptor_t,
data_type: sys::cudnnDataType_t,
[n, c, h, w]: [std::ffi::c_int; 4],
[n_stride, c_stride, h_stride, w_stride]: [std::ffi::c_int; 4],
) -> Result<(), CudnnError> {
sys::cudnnSetTensor4dDescriptorEx(
tensor_desc,
data_type,
n,
c,
h,
w,
n_stride,
c_stride,
h_stride,
w_stride,
)
.result()
}
/// Sets data on a tensor descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetTensorNdDescriptor)
///
/// # Safety
/// `tensor_desc` must have been created with [create_tensor_descriptor], and
/// NOT freed by [destroy_tensor_descriptor]
pub unsafe fn set_tensornd_descriptor(
tensor_desc: sys::cudnnTensorDescriptor_t,
data_type: sys::cudnnDataType_t,
num_dims: ::std::os::raw::c_int,
dims: *const ::std::os::raw::c_int,
strides: *const ::std::os::raw::c_int,
) -> Result<(), CudnnError> {
sys::cudnnSetTensorNdDescriptor(tensor_desc, data_type, num_dims, dims, strides).result()
}
/// Destroys a tensor descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDestroyTensorDescriptor)
///
/// # Safety
/// `desc` must NOT have been freed already.
pub unsafe fn destroy_tensor_descriptor(
desc: sys::cudnnTensorDescriptor_t,
) -> Result<(), CudnnError> {
sys::cudnnDestroyTensorDescriptor(desc).result()
}
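// A sketch of the full lifecycle of a tensor descriptor, here for a
// 1x3x224x224 NCHW f32 tensor (the shape is chosen for illustration only).
pub fn example_tensor_descriptor() -> Result<(), CudnnError> {
    let desc = create_tensor_descriptor()?;
    unsafe {
        set_tensor4d_descriptor(
            desc,
            sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
            sys::cudnnDataType_t::CUDNN_DATA_FLOAT,
            [1, 3, 224, 224],
        )?;
        destroy_tensor_descriptor(desc)
    }
}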
/// Creates a filter descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnCreateFilterDescriptor)
pub fn create_filter_descriptor() -> Result<sys::cudnnFilterDescriptor_t, CudnnError> {
let mut desc = MaybeUninit::uninit();
unsafe {
sys::cudnnCreateFilterDescriptor(desc.as_mut_ptr()).result()?;
Ok(desc.assume_init())
}
}
/// Sets data on a pre-allocated filter descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetFilter4dDescriptor)
///
/// # Safety
/// `filter_desc` must have been allocated with [create_filter_descriptor]
/// and NOT already freed by [destroy_filter_descriptor].
pub unsafe fn set_filter4d_descriptor(
filter_desc: sys::cudnnFilterDescriptor_t,
data_type: sys::cudnnDataType_t,
format: sys::cudnnTensorFormat_t,
[k, c, h, w]: [::std::os::raw::c_int; 4],
) -> Result<(), CudnnError> {
sys::cudnnSetFilter4dDescriptor(filter_desc, data_type, format, k, c, h, w).result()
}
/// Destroys a filter descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDestroyFilterDescriptor)
///
/// # Safety
/// `desc` must NOT have already been freed.
pub unsafe fn destroy_filter_descriptor(
desc: sys::cudnnFilterDescriptor_t,
) -> Result<(), CudnnError> {
sys::cudnnDestroyFilterDescriptor(desc).result()
}
/// Allocates a convolution descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnCreateConvolutionDescriptor).
pub fn create_convolution_descriptor() -> Result<sys::cudnnConvolutionDescriptor_t, CudnnError> {
let mut desc = MaybeUninit::uninit();
unsafe {
sys::cudnnCreateConvolutionDescriptor(desc.as_mut_ptr()).result()?;
Ok(desc.assume_init())
}
}
/// Sets data on a conv descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolution2dDescriptor)
///
/// # Safety
/// `conv_desc` must have been allocated by [create_convolution_descriptor]
/// and NOT freed by [destroy_convolution_descriptor].
#[allow(clippy::too_many_arguments)]
pub unsafe fn set_convolution2d_descriptor(
conv_desc: sys::cudnnConvolutionDescriptor_t,
pad_h: std::ffi::c_int,
pad_w: std::ffi::c_int,
u: std::ffi::c_int,
v: std::ffi::c_int,
dilation_h: std::ffi::c_int,
dilation_w: std::ffi::c_int,
mode: sys::cudnnConvolutionMode_t,
compute_type: sys::cudnnDataType_t,
) -> Result<(), CudnnError> {
sys::cudnnSetConvolution2dDescriptor(
conv_desc,
pad_h,
pad_w,
u,
v,
dilation_h,
dilation_w,
mode,
compute_type,
)
.result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolutionMathType).
/// # Safety
/// `desc` must NOT have been freed already
pub unsafe fn set_convolution_math_type(
desc: sys::cudnnConvolutionDescriptor_t,
math_type: sys::cudnnMathType_t,
) -> Result<(), CudnnError> {
sys::cudnnSetConvolutionMathType(desc, math_type).result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolutionGroupCount)
/// # Safety
/// `desc` must NOT have been freed already
pub unsafe fn set_convolution_group_count(
desc: sys::cudnnConvolutionDescriptor_t,
group_count: i32,
) -> Result<(), CudnnError> {
sys::cudnnSetConvolutionGroupCount(desc, group_count).result()
}
/// Destroys a descriptor. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDestroyConvolutionDescriptor).
/// # Safety
/// `desc` must NOT have been already freed.
pub unsafe fn destroy_convolution_descriptor(
desc: sys::cudnnConvolutionDescriptor_t,
) -> Result<(), CudnnError> {
sys::cudnnDestroyConvolutionDescriptor(desc).result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionForwardAlgorithm_v7)
///
/// # Safety
/// - All handles & descriptors must still be allocated.
/// - The pointers must point to valid memory.
#[allow(clippy::too_many_arguments)]
pub unsafe fn get_convolution_forward_algorithm(
handle: sys::cudnnHandle_t,
src: sys::cudnnTensorDescriptor_t,
filter: sys::cudnnFilterDescriptor_t,
conv: sys::cudnnConvolutionDescriptor_t,
dest: sys::cudnnTensorDescriptor_t,
requested_algo_count: std::ffi::c_int,
returned_algo_count: *mut std::ffi::c_int,
perf_results: *mut sys::cudnnConvolutionFwdAlgoPerf_t,
) -> Result<(), CudnnError> {
sys::cudnnGetConvolutionForwardAlgorithm_v7(
handle,
src,
filter,
conv,
dest,
requested_algo_count,
returned_algo_count,
perf_results,
)
.result()
}
/// Returns size in **bytes**. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionForwardWorkspaceSize)
/// # Safety
/// - All handles & descriptors must still be allocated.
/// - The pointers must point to valid memory.
pub unsafe fn get_convolution_forward_workspace_size(
handle: sys::cudnnHandle_t,
x: sys::cudnnTensorDescriptor_t,
w: sys::cudnnFilterDescriptor_t,
conv: sys::cudnnConvolutionDescriptor_t,
y: sys::cudnnTensorDescriptor_t,
algo: sys::cudnnConvolutionFwdAlgo_t,
) -> Result<usize, CudnnError> {
let mut size_in_bytes = [0];
sys::cudnnGetConvolutionForwardWorkspaceSize(
handle,
x,
w,
conv,
y,
algo,
size_in_bytes.as_mut_ptr(),
)
.result()?;
Ok(size_in_bytes[0])
}
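// A sketch chaining the two queries above: ask the v7 heuristics for the
// single best forward algorithm, then query its workspace requirement.
// Assumes the perf struct exposes an `algo` field, as the generated
// cudnnConvolutionFwdAlgoPerf_t does.
pub unsafe fn example_pick_fwd_algo(
    handle: sys::cudnnHandle_t,
    x: sys::cudnnTensorDescriptor_t,
    w: sys::cudnnFilterDescriptor_t,
    conv: sys::cudnnConvolutionDescriptor_t,
    y: sys::cudnnTensorDescriptor_t,
) -> Result<(sys::cudnnConvolutionFwdAlgo_t, usize), CudnnError> {
    let mut returned: std::ffi::c_int = 0;
    let mut perf = MaybeUninit::<sys::cudnnConvolutionFwdAlgoPerf_t>::uninit();
    get_convolution_forward_algorithm(handle, x, w, conv, y, 1, &mut returned, perf.as_mut_ptr())?;
    let algo = perf.assume_init().algo;
    let workspace = get_convolution_forward_workspace_size(handle, x, w, conv, y, algo)?;
    Ok((algo, workspace))
}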
/// Launch the conv forward kernel.
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnConvolutionForward).
///
/// # Safety
/// - handles and descriptors must still be allocated
/// - all pointers must be valid data pointers
/// - the format of descriptors should match the data allocated
/// in the pointers.
#[allow(clippy::too_many_arguments)]
pub unsafe fn convolution_forward(
handle: sys::cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
x_desc: sys::cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
w_desc: sys::cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
conv_desc: sys::cudnnConvolutionDescriptor_t,
algo: sys::cudnnConvolutionFwdAlgo_t,
work_space: *mut ::core::ffi::c_void,
work_space_size_in_bytes: usize,
beta: *const ::core::ffi::c_void,
y_desc: sys::cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> Result<(), CudnnError> {
sys::cudnnConvolutionForward(
handle,
alpha,
x_desc,
x,
w_desc,
w,
conv_desc,
algo,
work_space,
work_space_size_in_bytes,
beta,
y_desc,
y,
)
.result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionBackwardDataAlgorithm_v7)
///
/// # Safety
/// - All handles & descriptors must still be allocated.
/// - The pointers must point to valid memory.
#[allow(clippy::too_many_arguments)]
pub unsafe fn get_convolution_backward_data_algorithm(
handle: sys::cudnnHandle_t,
w_desc: sys::cudnnFilterDescriptor_t,
dy_desc: sys::cudnnTensorDescriptor_t,
conv_desc: sys::cudnnConvolutionDescriptor_t,
dx_desc: sys::cudnnTensorDescriptor_t,
requested_algo_count: ::std::os::raw::c_int,
returned_algo_count: *mut ::std::os::raw::c_int,
perf_results: *mut sys::cudnnConvolutionBwdDataAlgoPerf_t,
) -> Result<(), CudnnError> {
sys::cudnnGetConvolutionBackwardDataAlgorithm_v7(
handle,
w_desc,
dy_desc,
conv_desc,
dx_desc,
requested_algo_count,
returned_algo_count,
perf_results,
)
.result()
}
/// Returns size in **bytes**. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionBackwardDataWorkspaceSize)
/// # Safety
/// - All handles & descriptors must still be allocated.
/// - The pointers must point to valid memory.
pub unsafe fn get_convolution_backward_data_workspace_size(
handle: sys::cudnnHandle_t,
w_desc: sys::cudnnFilterDescriptor_t,
dy_desc: sys::cudnnTensorDescriptor_t,
conv_desc: sys::cudnnConvolutionDescriptor_t,
dx_desc: sys::cudnnTensorDescriptor_t,
algo: sys::cudnnConvolutionBwdDataAlgo_t,
) -> Result<usize, CudnnError> {
let mut size_in_bytes = [0];
sys::cudnnGetConvolutionBackwardDataWorkspaceSize(
handle,
w_desc,
dy_desc,
conv_desc,
dx_desc,
algo,
size_in_bytes.as_mut_ptr(),
)
.result()?;
Ok(size_in_bytes[0])
}
/// Launch the backward data kernel.
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnConvolutionBackwardData).
///
/// # Safety
/// - handles and descriptors must still be allocated
/// - all pointers must be valid data pointers
/// - the format of descriptors should match the data allocated
/// in the pointers.
#[allow(clippy::too_many_arguments)]
pub unsafe fn convolution_backward_data(
handle: sys::cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
w_desc: sys::cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
dy_desc: sys::cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
conv_desc: sys::cudnnConvolutionDescriptor_t,
algo: sys::cudnnConvolutionBwdDataAlgo_t,
work_space: *mut ::core::ffi::c_void,
work_space_size_in_bytes: usize,
beta: *const ::core::ffi::c_void,
dx_desc: sys::cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
) -> Result<(), CudnnError> {
sys::cudnnConvolutionBackwardData(
handle,
alpha,
w_desc,
w,
dy_desc,
dy,
conv_desc,
algo,
work_space,
work_space_size_in_bytes,
beta,
dx_desc,
dx,
)
.result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionBackwardFilterAlgorithm_v7)
///
/// # Safety
/// - All handles & descriptors must still be allocated.
/// - The pointers must point to valid memory.
#[allow(clippy::too_many_arguments)]
pub unsafe fn get_convolution_backward_filter_algorithm(
handle: sys::cudnnHandle_t,
src_desc: sys::cudnnTensorDescriptor_t,
diff_desc: sys::cudnnTensorDescriptor_t,
conv_desc: sys::cudnnConvolutionDescriptor_t,
grad_desc: sys::cudnnFilterDescriptor_t,
requested_algo_count: ::std::os::raw::c_int,
returned_algo_count: *mut ::std::os::raw::c_int,
perf_results: *mut sys::cudnnConvolutionBwdFilterAlgoPerf_t,
) -> Result<(), CudnnError> {
sys::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
handle,
src_desc,
diff_desc,
conv_desc,
grad_desc,
requested_algo_count,
returned_algo_count,
perf_results,
)
.result()
}
/// Returns size in **bytes**.
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetConvolutionBackwardFilterWorkspaceSize)
/// # Safety
/// - All handles & descriptors must still be allocated.
/// - The pointers must point to valid memory.
pub unsafe fn get_convolution_backward_filter_workspace_size(
handle: sys::cudnnHandle_t,
x_desc: sys::cudnnTensorDescriptor_t,
dy_desc: sys::cudnnTensorDescriptor_t,
conv_desc: sys::cudnnConvolutionDescriptor_t,
grad_desc: sys::cudnnFilterDescriptor_t,
algo: sys::cudnnConvolutionBwdFilterAlgo_t,
) -> Result<usize, CudnnError> {
let mut size_in_bytes = [0];
sys::cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle,
x_desc,
dy_desc,
conv_desc,
grad_desc,
algo,
size_in_bytes.as_mut_ptr(),
)
.result()?;
Ok(size_in_bytes[0])
}
/// Launch the backward filter kernel.
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnConvolutionBackwardFilter).
///
/// # Safety
/// - handles and descriptors must still be allocated
/// - all pointers must be valid data pointers
/// - the format of descriptors should match the data allocated
/// in the pointers.
#[allow(clippy::too_many_arguments)]
pub unsafe fn convolution_backward_filter(
handle: sys::cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
x_desc: sys::cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
dy_desc: sys::cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
conv_desc: sys::cudnnConvolutionDescriptor_t,
algo: sys::cudnnConvolutionBwdFilterAlgo_t,
work_space: *mut ::core::ffi::c_void,
work_space_size_in_bytes: usize,
beta: *const ::core::ffi::c_void,
dw_desc: sys::cudnnFilterDescriptor_t,
dw: *mut ::core::ffi::c_void,
) -> Result<(), CudnnError> {
sys::cudnnConvolutionBackwardFilter(
handle,
alpha,
x_desc,
x,
dy_desc,
dy,
conv_desc,
algo,
work_space,
work_space_size_in_bytes,
beta,
dw_desc,
dw,
)
.result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnCreateReduceTensorDescriptor).
pub fn create_reduce_tensor_descriptor() -> Result<sys::cudnnReduceTensorDescriptor_t, CudnnError> {
let mut desc = MaybeUninit::uninit();
unsafe {
sys::cudnnCreateReduceTensorDescriptor(desc.as_mut_ptr()).result()?;
Ok(desc.assume_init())
}
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetReduceTensorDescriptor)
///
/// # Safety
/// All the descriptors must be allocated properly and not have been destroyed.
pub unsafe fn set_reduce_tensor_descriptor(
tensor_desc: sys::cudnnReduceTensorDescriptor_t,
tensor_op: sys::cudnnReduceTensorOp_t,
tensor_comp_type: sys::cudnnDataType_t,
tensor_nan_opt: sys::cudnnNanPropagation_t,
tensor_indices: sys::cudnnReduceTensorIndices_t,
tensor_indices_type: sys::cudnnIndicesType_t,
) -> Result<(), CudnnError> {
sys::cudnnSetReduceTensorDescriptor(
tensor_desc,
tensor_op,
tensor_comp_type,
tensor_nan_opt,
tensor_indices,
tensor_indices_type,
)
.result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnDestroyReduceTensorDescriptor).
///
/// # Safety
/// Descriptor must not have been freed already.
pub unsafe fn destroy_reduce_tensor_descriptor(
tensor_desc: sys::cudnnReduceTensorDescriptor_t,
) -> Result<(), CudnnError> {
sys::cudnnDestroyReduceTensorDescriptor(tensor_desc).result()
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetReductionIndicesSize)
///
/// # Safety
/// Handle and descriptor must be valid (properly allocated and not freed already).
pub unsafe fn get_reduction_indices_size(
handle: sys::cudnnHandle_t,
reduce_tensor_desc: sys::cudnnReduceTensorDescriptor_t,
a_desc: sys::cudnnTensorDescriptor_t,
c_desc: sys::cudnnTensorDescriptor_t,
) -> Result<usize, CudnnError> {
let mut size_in_bytes = [0];
sys::cudnnGetReductionIndicesSize(
handle,
reduce_tensor_desc,
a_desc,
c_desc,
size_in_bytes.as_mut_ptr(),
)
.result()?;
Ok(size_in_bytes[0])
}
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetReductionWorkspaceSize)
///
/// # Safety
/// Handle and descriptors must be properly allocated and not freed already.
pub unsafe fn get_reduction_workspace_size(
handle: sys::cudnnHandle_t,
reduce_tensor_desc: sys::cudnnReduceTensorDescriptor_t,
a_desc: sys::cudnnTensorDescriptor_t,
c_desc: sys::cudnnTensorDescriptor_t,
) -> Result<usize, CudnnError> {
let mut size_in_bytes = [0];
sys::cudnnGetReductionWorkspaceSize(
handle,
reduce_tensor_desc,
a_desc,
c_desc,
size_in_bytes.as_mut_ptr(),
)
.result()?;
Ok(size_in_bytes[0])
}
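// A sketch chaining the two size queries a cudnnReduceTensor launch needs;
// a caller would allocate device buffers of these sizes before reducing.
pub unsafe fn example_reduction_sizes(
    handle: sys::cudnnHandle_t,
    reduce_desc: sys::cudnnReduceTensorDescriptor_t,
    a_desc: sys::cudnnTensorDescriptor_t,
    c_desc: sys::cudnnTensorDescriptor_t,
) -> Result<(usize, usize), CudnnError> {
    let indices = get_reduction_indices_size(handle, reduce_desc, a_desc, c_desc)?;
    let workspace = get_reduction_workspace_size(handle, reduce_desc, a_desc, c_desc)?;
    Ok((indices, workspace))
}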
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnReduceTensor)
///
/// # Safety
/// - All data must be properly allocated and not freed.
/// - The descriptors must be the same data type as the pointers
/// - Misuse of this function could result in out-of-bounds memory accesses.
#[allow(clippy::too_many_arguments)]
pub unsafe fn reduce_tensor(
handle: sys::cudnnHandle_t,
reduce_tensor_desc: sys::cudnnReduceTensorDescriptor_t,
indices: *mut std::ffi::c_void,
indices_size_in_bytes: usize,
workspace: *mut std::ffi::c_void,
workspace_size_in_bytes: usize,
alpha: *const std::ffi::c_void,
a_desc: sys::cudnnTensorDescriptor_t,
a: *const std::ffi::c_void,
beta: *const std::ffi::c_void,
c_desc: sys::cudnnTensorDescriptor_t,
c: *mut std::ffi::c_void,
) -> Result<(), CudnnError> {
sys::cudnnReduceTensor(
handle,
reduce_tensor_desc,
indices,
indices_size_in_bytes,
workspace,
workspace_size_in_bytes,
alpha,
a_desc,
a,
beta,
c_desc,
c,
)
.result()
}
use super::core::*;
use crate::{
cudnn::{result, result::CudnnError, sys},
driver::{DevicePtr, DevicePtrMut},
};
use std::{marker::PhantomData, sync::Arc};
/// A descriptor of the filters for a conv2d operation. Create with [`Cudnn::create_4d_filter()`].
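///
/// A minimal sketch, assuming an existing `cudnn` handle (mirroring the tests):
/// ```ignore
/// // [num_filters, in_channels, kernel_h, kernel_w]
/// let filter = cudnn.create_4d_filter::<f32>(
///     sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
///     [256, 128, 3, 3],
/// )?;
/// ```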
#[derive(Debug)]
pub struct FilterDescriptor<T> {
pub(crate) desc: sys::cudnnFilterDescriptor_t,
#[allow(unused)]
pub(crate) handle: Arc<Cudnn>,
pub(crate) marker: PhantomData<T>,
}
impl Cudnn {
/// Creates a 4d filter descriptor.
pub fn create_4d_filter<T: CudnnDataType>(
self: &Arc<Cudnn>,
format: sys::cudnnTensorFormat_t,
dims: [std::ffi::c_int; 4],
) -> Result<FilterDescriptor<T>, CudnnError> {
let desc = result::create_filter_descriptor()?;
let desc = FilterDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe { result::set_filter4d_descriptor(desc.desc, T::DATA_TYPE, format, dims) }?;
Ok(desc)
}
}
impl<T> Drop for FilterDescriptor<T> {
fn drop(&mut self) {
let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
if !desc.is_null() {
unsafe { result::destroy_filter_descriptor(desc) }.unwrap()
}
}
}
/// A descriptor for a conv2d operation holding stride, padding, and dilation.
#[derive(Debug)]
pub struct Conv2dDescriptor<T> {
pub(crate) desc: sys::cudnnConvolutionDescriptor_t,
pub(crate) handle: Arc<Cudnn>,
pub(crate) marker: PhantomData<T>,
}
impl Cudnn {
/// Creates a conv2d descriptor.
/// - `pad` is the padding to apply to height and width of image
/// - `stride` is the kernel strides
/// - `dilation` is the kernel dilation
/// - `mode` - CROSS_CORRELATION is standard convolution
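///
/// A minimal sketch, assuming an existing `cudnn` handle (mirroring the tests):
/// ```ignore
/// let conv = cudnn.create_conv2d::<f32>(
///     [0; 2], // no padding
///     [1; 2], // unit stride
///     [1; 2], // no dilation
///     sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
/// )?;
/// ```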
pub fn create_conv2d<T: CudnnDataType>(
self: &Arc<Cudnn>,
pad: [std::ffi::c_int; 2],
stride: [std::ffi::c_int; 2],
dilation: [std::ffi::c_int; 2],
mode: sys::cudnnConvolutionMode_t,
) -> Result<Conv2dDescriptor<T>, CudnnError> {
let [pad_h, pad_w] = pad;
let [stride_h, stride_w] = stride;
let [dilation_h, dilation_w] = dilation;
let desc = result::create_convolution_descriptor()?;
let desc = Conv2dDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe {
result::set_convolution2d_descriptor(
desc.desc,
pad_h,
pad_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
mode,
T::DATA_TYPE,
)
}?;
Ok(desc)
}
}
impl<T> Conv2dDescriptor<T> {
/// Sets the math type for this convolution. Refer to [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolutionMathType)
/// for more information.
pub fn set_math_type(&mut self, math_type: sys::cudnnMathType_t) -> Result<(), CudnnError> {
unsafe { result::set_convolution_math_type(self.desc, math_type) }
}
/// Sets the group count for this convolution. Refer to [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolutionGroupCount)
/// for more information.
pub fn set_group_count(&mut self, group_count: i32) -> Result<(), CudnnError> {
unsafe { result::set_convolution_group_count(self.desc, group_count) }
}
}
impl<T> Drop for Conv2dDescriptor<T> {
fn drop(&mut self) {
let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
if !desc.is_null() {
unsafe { result::destroy_convolution_descriptor(desc) }.unwrap()
}
}
}
/// The convolution 2d forward operation. Pass in references to descriptors
/// directly, and then call:
/// 1. [`Conv2dForward::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`Conv2dForward::get_workspace_size()`] to get the required workspace size.
/// 3. [`Conv2dForward::launch()`] to execute it
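///
/// A minimal sketch of those three steps, assuming the `conv`/`x`/`filter`/`y`
/// descriptors plus matching device buffers `img`, `kernels`, and `out` (and the
/// owning `dev`) already exist:
/// ```ignore
/// let op = Conv2dForward { conv: &conv, x: &x, w: &filter, y: &y };
/// let algo = op.pick_algorithm()?;
/// let mut workspace = dev.alloc_zeros::<u8>(op.get_workspace_size(algo)?).unwrap();
/// unsafe { op.launch(algo, Some(&mut workspace), (1.0, 0.0), &img, &kernels, &mut out) }?;
/// ```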
#[derive(Debug)]
pub struct Conv2dForward<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> {
/// Conv parameters
pub conv: &'a Conv2dDescriptor<C>,
/// Input image descriptor
pub x: &'a TensorDescriptor<X>,
/// Filter descriptor
pub w: &'a FilterDescriptor<X>,
/// Output image descriptor
pub y: &'a TensorDescriptor<Y>,
}
impl<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> Conv2dForward<'a, X, C, Y> {
/// Picks the fastest of the available cuDNN algorithms using cuDNN's heuristics.
pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionFwdAlgo_t, CudnnError> {
const NUM_ALGOS: usize = 8;
debug_assert_eq!(
sys::cudnnConvolutionFwdAlgo_t::CUDNN_CONVOLUTION_FWD_ALGO_COUNT as u32,
NUM_ALGOS as u32
);
let mut returned_count = [0; 1];
let mut perf_results = [Default::default(); NUM_ALGOS];
unsafe {
result::get_convolution_forward_algorithm(
self.conv.handle.handle,
self.x.desc,
self.w.desc,
self.conv.desc,
self.y.desc,
NUM_ALGOS as std::ffi::c_int,
returned_count.as_mut_ptr(),
perf_results.as_mut_ptr(),
)
}?;
assert!(returned_count[0] > 0);
perf_results[0].status.result()?;
Ok(perf_results[0].algo)
}
/// Returns the workspace size in **bytes** required to execute the selected algorithm.
pub fn get_workspace_size(
&self,
algo: sys::cudnnConvolutionFwdAlgo_t,
) -> Result<usize, CudnnError> {
unsafe {
result::get_convolution_forward_workspace_size(
self.conv.handle.handle,
self.x.desc,
self.w.desc,
self.conv.desc,
self.y.desc,
algo,
)
}
}
/// Launches the operation.
///
/// - `img` is the input image
/// - `filter` is the convolution kernels
/// - `y` is the output
///
/// # Safety
/// The img/filter/y arguments must match the data type/layout specified in the
/// descriptors in `self`.
pub unsafe fn launch<Workspace, Img, Filter, Dst>(
&self,
algo: sys::cudnnConvolutionFwdAlgo_t,
workspace: Option<&mut Workspace>,
(alpha, beta): (Y, Y),
img: &Img,
filter: &Filter,
y: &mut Dst,
) -> Result<(), CudnnError>
where
Workspace: DevicePtrMut<u8>,
Img: DevicePtr<X>,
Filter: DevicePtr<X>,
Dst: DevicePtrMut<Y>,
{
let (num_bytes, workspace_ptr) = match workspace {
Some(w) => (
w.num_bytes(),
*w.device_ptr_mut() as *mut u8 as *mut std::ffi::c_void,
),
None => (0, std::ptr::null_mut()),
};
let alpha = alpha.into_scaling_parameter();
let beta = beta.into_scaling_parameter();
result::convolution_forward(
self.conv.handle.handle,
(&alpha) as *const Y::Scalar as *const std::ffi::c_void,
self.x.desc,
*img.device_ptr() as *const X as *const std::ffi::c_void,
self.w.desc,
*filter.device_ptr() as *const X as *const std::ffi::c_void,
self.conv.desc,
algo,
workspace_ptr,
num_bytes,
(&beta) as *const Y::Scalar as *const std::ffi::c_void,
self.y.desc,
*y.device_ptr_mut() as *mut Y as *mut std::ffi::c_void,
)
}
}
/// The convolution 2d backward operation for the input image. Pass in references to descriptors
/// directly, and then call:
/// 1. [`Conv2dBackwardData::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`Conv2dBackwardData::get_workspace_size()`] to get the required workspace size.
/// 3. [`Conv2dBackwardData::launch()`] to execute it
#[derive(Debug)]
pub struct Conv2dBackwardData<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> {
/// Conv descriptor
pub conv: &'a Conv2dDescriptor<C>,
/// Input image descriptor
pub dx: &'a TensorDescriptor<X>,
/// Filter descriptor
pub w: &'a FilterDescriptor<X>,
/// Output image descriptor
pub dy: &'a TensorDescriptor<Y>,
}
impl<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> Conv2dBackwardData<'a, X, C, Y> {
/// Picks the fastest of the available cuDNN algorithms using cuDNN's heuristics.
pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionBwdDataAlgo_t, CudnnError> {
const NUM_ALGOS: usize = 6;
debug_assert_eq!(
sys::cudnnConvolutionBwdDataAlgo_t::CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT as u32,
NUM_ALGOS as u32
);
let mut returned_count = [0; 1];
let mut perf_results = [Default::default(); NUM_ALGOS];
unsafe {
result::get_convolution_backward_data_algorithm(
self.conv.handle.handle,
self.w.desc,
self.dy.desc,
self.conv.desc,
self.dx.desc,
NUM_ALGOS as std::ffi::c_int,
returned_count.as_mut_ptr(),
perf_results.as_mut_ptr(),
)
}?;
assert!(returned_count[0] > 0);
perf_results[0].status.result()?;
Ok(perf_results[0].algo)
}
/// Returns the workspace size in **bytes** required to execute the selected algorithm.
pub fn get_workspace_size(
&self,
algo: sys::cudnnConvolutionBwdDataAlgo_t,
) -> Result<usize, CudnnError> {
unsafe {
result::get_convolution_backward_data_workspace_size(
self.conv.handle.handle,
self.w.desc,
self.dy.desc,
self.conv.desc,
self.dx.desc,
algo,
)
}
}
/// Launches the operation.
///
/// - `dx` is the gradient of the input image to populate
/// - `filter` is the convolution kernels
/// - `dy` is the gradient of the output image
///
/// # Safety
/// The arguments must match the data type/layout specified in the
/// descriptors in `self`.
pub unsafe fn launch<Workspace, Img, Filter, Dst>(
&self,
algo: sys::cudnnConvolutionBwdDataAlgo_t,
workspace: Option<&mut Workspace>,
(alpha, beta): (Y, Y),
dx: &mut Img,
filter: &Filter,
dy: &Dst,
) -> Result<(), CudnnError>
where
Workspace: DevicePtrMut<u8>,
Img: DevicePtrMut<X>,
Filter: DevicePtr<X>,
Dst: DevicePtr<Y>,
{
let (num_bytes, workspace_ptr) = match workspace {
Some(w) => (
w.num_bytes(),
*w.device_ptr_mut() as *mut u8 as *mut std::ffi::c_void,
),
None => (0, std::ptr::null_mut()),
};
let alpha = alpha.into_scaling_parameter();
let beta = beta.into_scaling_parameter();
result::convolution_backward_data(
self.conv.handle.handle,
(&alpha) as *const Y::Scalar as *const std::ffi::c_void,
self.w.desc,
*filter.device_ptr() as *const X as *const std::ffi::c_void,
self.dy.desc,
*dy.device_ptr() as *const Y as *const std::ffi::c_void,
self.conv.desc,
algo,
workspace_ptr,
num_bytes,
(&beta) as *const Y::Scalar as *const std::ffi::c_void,
self.dx.desc,
*dx.device_ptr_mut() as *mut X as *mut std::ffi::c_void,
)
}
}
/// The convolution 2d backward operation for the filters. Pass in references to descriptors
/// directly, and then call:
/// 1. [`Conv2dBackwardFilter::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`Conv2dBackwardFilter::get_workspace_size()`] to get the required workspace size.
/// 3. [`Conv2dBackwardFilter::launch()`] to execute it
#[derive(Debug)]
pub struct Conv2dBackwardFilter<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> {
/// Conv descriptor
pub conv: &'a Conv2dDescriptor<C>,
/// Input image descriptor
pub x: &'a TensorDescriptor<X>,
/// Filter descriptor
pub dw: &'a FilterDescriptor<X>,
/// Output image descriptor
pub dy: &'a TensorDescriptor<Y>,
}
impl<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> Conv2dBackwardFilter<'a, X, C, Y> {
/// Picks the fastest of the available cuDNN algorithms using cuDNN's heuristics.
pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionBwdFilterAlgo_t, CudnnError> {
const NUM_ALGOS: usize = 7;
debug_assert_eq!(
sys::cudnnConvolutionBwdFilterAlgo_t::CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT as u32,
NUM_ALGOS as u32
);
let mut returned_count = [0; 1];
let mut perf_results = [Default::default(); NUM_ALGOS];
unsafe {
result::get_convolution_backward_filter_algorithm(
self.conv.handle.handle,
self.x.desc,
self.dy.desc,
self.conv.desc,
self.dw.desc,
NUM_ALGOS as std::ffi::c_int,
returned_count.as_mut_ptr(),
perf_results.as_mut_ptr(),
)
}?;
assert!(returned_count[0] > 0);
perf_results[0].status.result()?;
Ok(perf_results[0].algo)
}
/// Returns the workspace size in **bytes** required to execute the selected algorithm.
pub fn get_workspace_size(
&self,
algo: sys::cudnnConvolutionBwdFilterAlgo_t,
) -> Result<usize, CudnnError> {
unsafe {
result::get_convolution_backward_filter_workspace_size(
self.conv.handle.handle,
self.x.desc,
self.dy.desc,
self.conv.desc,
self.dw.desc,
algo,
)
}
}
/// Launches the operation.
///
/// - `x` is the input image
/// - `dfilter` is the gradient of the convolution kernels
/// - `dy` is the gradient of the output image
///
/// # Safety
/// The arguments must match the data type/layout specified in the
/// descriptors in `self`.
pub unsafe fn launch<Workspace, Img, Filter, Dst>(
&self,
algo: sys::cudnnConvolutionBwdFilterAlgo_t,
workspace: Option<&mut Workspace>,
(alpha, beta): (Y, Y),
x: &Img,
dfilter: &mut Filter,
dy: &Dst,
) -> Result<(), CudnnError>
where
Workspace: DevicePtrMut<u8>,
Img: DevicePtr<X>,
Filter: DevicePtrMut<X>,
Dst: DevicePtr<Y>,
{
// Matches the other launch methods and avoids shadowing the `x` parameter.
let (num_bytes, workspace_ptr) = match workspace {
Some(w) => (
w.num_bytes(),
*w.device_ptr_mut() as *mut u8 as *mut std::ffi::c_void,
),
None => (0, std::ptr::null_mut()),
};
let alpha = alpha.into_scaling_parameter();
let beta = beta.into_scaling_parameter();
result::convolution_backward_filter(
self.conv.handle.handle,
(&alpha) as *const Y::Scalar as *const std::ffi::c_void,
self.x.desc,
*x.device_ptr() as *const _,
self.dy.desc,
*dy.device_ptr() as *const _,
self.conv.desc,
algo,
workspace_ptr,
num_bytes,
(&beta) as *const Y::Scalar as *const std::ffi::c_void,
self.dw.desc,
*dfilter.device_ptr_mut() as *mut _,
)
}
}
use crate::{
cudnn::{result, result::CudnnError, sys},
driver::{CudaDevice, CudaStream},
};
use std::{marker::PhantomData, sync::Arc};
/// A handle to cuDNN.
///
/// This type is not send/sync because of <https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#thread-safety>
#[derive(Debug)]
pub struct Cudnn {
pub(crate) handle: sys::cudnnHandle_t,
pub(crate) device: Arc<CudaDevice>,
}
impl Cudnn {
/// Creates a new cudnn handle and sets the stream to the `device`'s stream.
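///
/// A minimal sketch, assuming a visible CUDA device 0 (mirroring the tests):
/// ```ignore
/// let dev = CudaDevice::new(0).unwrap();
/// let cudnn = Cudnn::new(dev)?;
/// ```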
pub fn new(device: Arc<CudaDevice>) -> Result<Arc<Self>, CudnnError> {
device.bind_to_thread().unwrap();
let handle = result::create_handle()?;
unsafe { result::set_stream(handle, device.stream as *mut _) }?;
Ok(Arc::new(Self { handle, device }))
}
/// Sets the handle's current stream to either the specified stream or the device's
/// default work stream.
///
/// # Safety
/// This is unsafe because you can end up scheduling multiple concurrent kernels that all
/// write to the same memory address.
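///
/// A minimal sketch, switching back to the device's default work stream:
/// ```ignore
/// unsafe { cudnn.set_stream(None) }?;
/// ```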
pub unsafe fn set_stream(&self, opt_stream: Option<&CudaStream>) -> Result<(), CudnnError> {
match opt_stream {
Some(s) => result::set_stream(self.handle, s.stream as *mut _),
None => result::set_stream(self.handle, self.device.stream as *mut _),
}
}
}
impl Drop for Cudnn {
fn drop(&mut self) {
let handle = std::mem::replace(&mut self.handle, std::ptr::null_mut());
if !handle.is_null() {
unsafe { result::destroy_handle(handle) }.unwrap();
}
}
}
/// Maps a rust type to a [sys::cudnnDataType_t]
pub trait CudnnDataType {
const DATA_TYPE: sys::cudnnDataType_t;
/// Certain CUDNN data types have a scaling parameter (usually called alpha/beta)
/// that is a different type. See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#scaling-parameters)
/// for more info; for example, `f16` uses an `f32` scaling parameter.
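///
/// For example, with the `f16` feature enabled:
/// ```ignore
/// let alpha = half::f16::from_f32(1.0);
/// let alpha_scalar: f32 = alpha.into_scaling_parameter();
/// ```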
type Scalar;
/// Converts the type into the scaling parameter type. See [Self::Scalar].
fn into_scaling_parameter(self) -> Self::Scalar;
}
macro_rules! cudnn_dtype {
($RustTy:ty, $CudnnTy:tt) => {
impl CudnnDataType for $RustTy {
const DATA_TYPE: sys::cudnnDataType_t = sys::cudnnDataType_t::$CudnnTy;
type Scalar = Self;
fn into_scaling_parameter(self) -> Self::Scalar {
self
}
}
};
}
cudnn_dtype!(f32, CUDNN_DATA_FLOAT);
cudnn_dtype!(f64, CUDNN_DATA_DOUBLE);
cudnn_dtype!(i8, CUDNN_DATA_INT8);
cudnn_dtype!(i32, CUDNN_DATA_INT32);
cudnn_dtype!(i64, CUDNN_DATA_INT64);
cudnn_dtype!(u8, CUDNN_DATA_UINT8);
cudnn_dtype!(bool, CUDNN_DATA_BOOLEAN);
#[cfg(feature = "f16")]
impl CudnnDataType for half::f16 {
const DATA_TYPE: sys::cudnnDataType_t = sys::cudnnDataType_t::CUDNN_DATA_HALF;
type Scalar = f32;
fn into_scaling_parameter(self) -> Self::Scalar {
self.to_f32()
}
}
#[cfg(feature = "f16")]
impl CudnnDataType for half::bf16 {
const DATA_TYPE: sys::cudnnDataType_t = sys::cudnnDataType_t::CUDNN_DATA_BFLOAT16;
type Scalar = f32;
fn into_scaling_parameter(self) -> Self::Scalar {
self.to_f32()
}
}
/// A descriptor of a tensor. Create with:
/// 1. [`Cudnn::create_4d_tensor()`]
/// 2. [`Cudnn::create_4d_tensor_ex()`]
/// 3. [`Cudnn::create_nd_tensor()`]
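///
/// A minimal sketch, assuming an existing `cudnn` handle (mirroring the tests):
/// ```ignore
/// // A contiguous [1, 2, 3, 4] f32 tensor described via explicit strides.
/// let t = cudnn.create_4d_tensor_ex::<f32>([1, 2, 3, 4], [24, 12, 4, 1])?;
/// ```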
#[derive(Debug)]
pub struct TensorDescriptor<T> {
pub(crate) desc: sys::cudnnTensorDescriptor_t,
#[allow(unused)]
pub(crate) handle: Arc<Cudnn>,
pub(crate) marker: PhantomData<T>,
}
impl Cudnn {
/// Creates a 4d tensor descriptor.
pub fn create_4d_tensor<T: CudnnDataType>(
self: &Arc<Cudnn>,
format: sys::cudnnTensorFormat_t,
dims: [std::ffi::c_int; 4],
) -> Result<TensorDescriptor<T>, CudnnError> {
let desc = result::create_tensor_descriptor()?;
let desc = TensorDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe { result::set_tensor4d_descriptor(desc.desc, format, T::DATA_TYPE, dims) }?;
Ok(desc)
}
/// Creates a 4d tensor descriptor.
pub fn create_4d_tensor_ex<T: CudnnDataType>(
self: &Arc<Cudnn>,
dims: [std::ffi::c_int; 4],
strides: [std::ffi::c_int; 4],
) -> Result<TensorDescriptor<T>, CudnnError> {
let desc = result::create_tensor_descriptor()?;
let desc = TensorDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe { result::set_tensor4d_descriptor_ex(desc.desc, T::DATA_TYPE, dims, strides) }?;
Ok(desc)
}
/// Creates an nd (at LEAST 4d) tensor descriptor.
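///
/// For example, a contiguous 6d `f64` tensor (as in the tests):
/// ```ignore
/// let t = cudnn.create_nd_tensor::<f64>(&[1, 2, 3, 4, 5, 6], &[720, 360, 120, 30, 6, 1])?;
/// ```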
pub fn create_nd_tensor<T: CudnnDataType>(
self: &Arc<Cudnn>,
dims: &[std::ffi::c_int],
strides: &[std::ffi::c_int],
) -> Result<TensorDescriptor<T>, CudnnError> {
assert!(dims.len() >= 4);
assert_eq!(dims.len(), strides.len());
let desc = result::create_tensor_descriptor()?;
let desc = TensorDescriptor {
desc,
handle: self.clone(),
marker: PhantomData,
};
unsafe {
result::set_tensornd_descriptor(
desc.desc,
T::DATA_TYPE,
dims.len() as std::ffi::c_int,
dims.as_ptr(),
strides.as_ptr(),
)
}?;
Ok(desc)
}
}
impl<T> Drop for TensorDescriptor<T> {
fn drop(&mut self) {
let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
if !desc.is_null() {
unsafe { result::destroy_tensor_descriptor(desc) }.unwrap()
}
}
}
//! Safe wrappers around cuDNN.
//!
//! # Convolutions
//!
//! 1. Allocate tensor descriptors with [`Cudnn::create_4d_tensor()`]
//! 2. Allocate filter descriptors with [`Cudnn::create_4d_filter()`]
//! 3. Allocate conv descriptors with [`Cudnn::create_conv2d()`]
//! 4. Instantiate one of the following algorithms with the descriptors:
//! a. [`Conv2dForward`]
//! b. [`Conv2dBackwardData`] for computing gradient of image
//! c. [`Conv2dBackwardFilter`] for computing gradient of filters
//! 5. Call the `pick_algorithm` method of the struct to let cuDNN heuristics select the algorithm.
//! 6. Call the `get_workspace_size` method of the struct.
//! 7. Re-allocate the workspace to the appropriate size.
//! 8. Call the `launch` method of the struct.
//!
//! # Reductions
//!
//! 1. Create a reduction descriptor with [`Cudnn::create_reduction_flat_indices()`] or [`Cudnn::create_reduction_no_indices()`]
//! 2. Allocate input/output tensor descriptors with [`Cudnn::create_nd_tensor()`]
//! 3. Instantiate [`ReduceTensor`] with the descriptors
//! 4. Call its `get_workspace_size` method (and `get_indices_size` when flat indices are requested), then allocate the buffers
//! 5. Call its `launch` method
mod conv;
mod core;
mod reduce;
pub use self::conv::{
Conv2dBackwardData, Conv2dBackwardFilter, Conv2dDescriptor, Conv2dForward, FilterDescriptor,
};
pub use self::core::{Cudnn, CudnnDataType, TensorDescriptor};
pub use self::reduce::{FlatIndices, NoIndices, ReduceTensor, ReductionDescriptor};
pub use super::result::CudnnError;
#[cfg(test)]
mod tests {
use super::*;
use crate::{cudnn, driver::CudaDevice};
#[test]
fn test_create_descriptors() -> Result<(), CudnnError> {
let cudnn = Cudnn::new(CudaDevice::new(0).unwrap())?;
let _ = cudnn.create_4d_tensor_ex::<f32>([1, 2, 3, 4], [24, 12, 4, 1])?;
let _ = cudnn.create_nd_tensor::<f64>(&[1, 2, 3, 4, 5, 6], &[720, 360, 120, 30, 6, 1])?;
let _ = cudnn.create_4d_filter::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[3, 3, 3, 3],
)?;
let _ = cudnn.create_reduction_flat_indices::<f32>(
cudnn::sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
cudnn::sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
)?;
let _ = cudnn.create_reduction_no_indices::<f32>(
cudnn::sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
cudnn::sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
)?;
Ok(())
}
#[test]
fn test_conv_pick_algorithms() -> Result<(), CudnnError> {
let cudnn = Cudnn::new(CudaDevice::new(0).unwrap())?;
let conv = cudnn.create_conv2d::<f32>(
[0; 2],
[1; 2],
[1; 2],
cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
)?;
let x = cudnn.create_4d_tensor::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[100, 128, 224, 224],
)?;
let filter = cudnn.create_4d_filter::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[256, 128, 3, 3],
)?;
let y = cudnn.create_4d_tensor::<f32>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
[100, 256, 222, 222],
)?;
{
let op = Conv2dForward {
conv: &conv,
x: &x,
w: &filter,
y: &y,
};
let algo = op.pick_algorithm()?;
assert_eq!(
algo,
cudnn::sys::cudnnConvolutionFwdAlgo_t::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
);
}
{
let op = Conv2dBackwardData {
conv: &conv,
dx: &x,
w: &filter,
dy: &y,
};
let algo = op.pick_algorithm()?;
assert_eq!(
algo,
cudnn::sys::cudnnConvolutionBwdDataAlgo_t::CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
);
}
{
let op = Conv2dBackwardFilter {
conv: &conv,
x: &x,
dw: &filter,
dy: &y,
};
let algo = op.pick_algorithm()?;
assert_eq!(
algo,
cudnn::sys::cudnnConvolutionBwdFilterAlgo_t::CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
);
}
Ok(())
}
#[test]
fn test_reduction() {
let dev = CudaDevice::new(0).unwrap();
let a = dev
.htod_copy(std::vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0])
.unwrap();
let mut c = dev.alloc_zeros::<f32>(1).unwrap();
let cudnn = Cudnn::new(dev.clone()).unwrap();
let reduce = cudnn
.create_reduction_no_indices::<f32>(
cudnn::sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
cudnn::sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
)
.unwrap();
let a_desc = cudnn
.create_nd_tensor::<f32>(&[1, 1, 2, 3], &[0, 6, 3, 1])
.unwrap();
let c_desc = cudnn
.create_nd_tensor::<f32>(&[1, 1, 1, 1], &[0, 0, 0, 1])
.unwrap();
let op = ReduceTensor {
reduce: &reduce,
a: &a_desc,
c: &c_desc,
};
let workspace_size = op.get_workspace_size().unwrap();
let mut workspace = dev.alloc_zeros::<u8>(workspace_size).unwrap();
unsafe { op.launch(&mut workspace, (1.0, 0.0), &a, &mut c) }.unwrap();
let c_host = dev.sync_reclaim(c).unwrap();
assert_eq!(c_host.len(), 1);
assert_eq!(c_host[0], 21.0);
}
}
use super::core::*;
use crate::{
cudnn::{result, result::CudnnError, sys},
driver::{DevicePtr, DevicePtrMut},
};
use std::{marker::PhantomData, sync::Arc};
/// A marker type used with [ReductionDescriptor] to indicate the
/// reduction operation should return flattened indices. Corresponds
/// to [sys::cudnnReduceTensorIndices_t::CUDNN_REDUCE_TENSOR_FLATTENED_INDICES].
#[derive(Debug, Default, Copy, Clone)]
pub struct FlatIndices;
/// A marker type used with [ReductionDescriptor] to indicate the
/// reduction operation should **NOT** return indices. Corresponds
/// to [sys::cudnnReduceTensorIndices_t::CUDNN_REDUCE_TENSOR_NO_INDICES].
#[derive(Debug, Default, Copy, Clone)]
pub struct NoIndices;
/// A reduction descriptor. Create with [`Cudnn::create_reduction_flat_indices()`] if you
/// want the flattened indices returned, or [`Cudnn::create_reduction_no_indices()`] if not.
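///
/// A minimal sketch of a sum-reduction descriptor, assuming an existing `cudnn` handle
/// (mirroring the tests):
/// ```ignore
/// let reduce = cudnn.create_reduction_no_indices::<f32>(
///     sys::cudnnReduceTensorOp_t::CUDNN_REDUCE_TENSOR_ADD,
///     sys::cudnnNanPropagation_t::CUDNN_PROPAGATE_NAN,
/// )?;
/// ```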
#[derive(Debug)]
pub struct ReductionDescriptor<T, Idx> {
pub(crate) desc: sys::cudnnReduceTensorDescriptor_t,
#[allow(unused)]
pub(crate) indices: Idx,
#[allow(unused)]
pub(crate) handle: Arc<Cudnn>,
pub(crate) marker: PhantomData<T>,
}
impl Cudnn {
/// Creates a reduction descriptor that computes flattened indices.
pub fn create_reduction_flat_indices<T: CudnnDataType>(
self: &Arc<Cudnn>,
op: sys::cudnnReduceTensorOp_t,
nan_opt: sys::cudnnNanPropagation_t,
) -> Result<ReductionDescriptor<T, FlatIndices>, CudnnError> {
let desc = result::create_reduce_tensor_descriptor()?;
let desc = ReductionDescriptor {
desc,
indices: FlatIndices,
handle: self.clone(),
marker: PhantomData,
};
unsafe {
result::set_reduce_tensor_descriptor(
desc.desc,
op,
T::DATA_TYPE,
nan_opt,
sys::cudnnReduceTensorIndices_t::CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
sys::cudnnIndicesType_t::CUDNN_32BIT_INDICES,
)
}?;
Ok(desc)
}
/// Creates a reduction descriptor that does NOT compute indices.
pub fn create_reduction_no_indices<T: CudnnDataType>(
self: &Arc<Cudnn>,
op: sys::cudnnReduceTensorOp_t,
nan_opt: sys::cudnnNanPropagation_t,
) -> Result<ReductionDescriptor<T, NoIndices>, CudnnError> {
let desc = result::create_reduce_tensor_descriptor()?;
let desc = ReductionDescriptor {
desc,
indices: NoIndices,
handle: self.clone(),
marker: PhantomData,
};
unsafe {
result::set_reduce_tensor_descriptor(
desc.desc,
op,
T::DATA_TYPE,
nan_opt,
sys::cudnnReduceTensorIndices_t::CUDNN_REDUCE_TENSOR_NO_INDICES,
sys::cudnnIndicesType_t::CUDNN_32BIT_INDICES,
)
}?;
Ok(desc)
}
}
impl<T, Idx> Drop for ReductionDescriptor<T, Idx> {
fn drop(&mut self) {
let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
if !desc.is_null() {
unsafe { result::destroy_reduce_tensor_descriptor(desc) }.unwrap()
}
}
}
/// A reduction operation. Pass in fields directly, and then call `launch`.
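///
/// A minimal sketch of a no-indices reduction, assuming `reduce`, `a_desc`, and `c_desc`
/// descriptors plus device buffers `a` and `c` (and the owning `dev`) already exist, as
/// in the tests:
/// ```ignore
/// let op = ReduceTensor { reduce: &reduce, a: &a_desc, c: &c_desc };
/// let mut workspace = dev.alloc_zeros::<u8>(op.get_workspace_size()?).unwrap();
/// unsafe { op.launch(&mut workspace, (1.0, 0.0), &a, &mut c) }?;
/// ```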
pub struct ReduceTensor<'a, T: CudnnDataType, Idx> {
/// The reduction descriptor.
pub reduce: &'a ReductionDescriptor<T, Idx>,
/// The input tensor
pub a: &'a TensorDescriptor<T>,
/// The output tensor
pub c: &'a TensorDescriptor<T>,
}
impl<'a, T: CudnnDataType> ReduceTensor<'a, T, FlatIndices> {
/// Gets the size in **bytes** of the indices buffer required for this operation.
///
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetReductionIndicesSize).
pub fn get_indices_size(&self) -> Result<usize, CudnnError> {
unsafe {
result::get_reduction_indices_size(
self.reduce.handle.handle,
self.reduce.desc,
self.a.desc,
self.c.desc,
)
}
}
}
impl<'a, T: CudnnDataType, Idx> ReduceTensor<'a, T, Idx> {
/// Gets the size in **bytes** of the workspace for this operation.
///
/// See [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnGetReductionWorkspaceSize)
pub fn get_workspace_size(&self) -> Result<usize, CudnnError> {
unsafe {
result::get_reduction_workspace_size(
self.reduce.handle.handle,
self.reduce.desc,
self.a.desc,
self.c.desc,
)
}
}
}
impl<'a, T: CudnnDataType> ReduceTensor<'a, T, FlatIndices> {
/// Launches the operation with indices.
///
/// # Safety
/// The arguments must match the data type/layout specified in the
/// descriptors in `self`.
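///
/// A minimal sketch, assuming `op` was built from a flat-indices descriptor and `dev`
/// is the owning device; the descriptor requests 32-bit indices, so the `u32` buffer
/// holds `get_indices_size() / 4` elements:
/// ```ignore
/// let mut indices = dev.alloc_zeros::<u32>(op.get_indices_size()? / 4).unwrap();
/// let mut workspace = dev.alloc_zeros::<u8>(op.get_workspace_size()?).unwrap();
/// unsafe { op.launch(&mut indices, &mut workspace, (1.0, 0.0), &a, &mut c) }?;
/// ```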
pub unsafe fn launch<Indices, Workspace, A, C>(
&self,
indices: &mut Indices,
workspace: &mut Workspace,
(alpha, beta): (T, T),
a: &A,
c: &mut C,
) -> Result<(), CudnnError>
where
Indices: DevicePtrMut<u32>,
Workspace: DevicePtrMut<u8>,
A: DevicePtr<T>,
C: DevicePtrMut<T>,
{
result::reduce_tensor(
self.reduce.handle.handle,
self.reduce.desc,
*indices.device_ptr_mut() as *mut std::ffi::c_void,
indices.num_bytes(),
*workspace.device_ptr_mut() as *mut std::ffi::c_void,
workspace.num_bytes(),
(&alpha) as *const T as *const std::ffi::c_void,
self.a.desc,
*a.device_ptr() as *const _,
(&beta) as *const T as *const std::ffi::c_void,
self.c.desc,
*c.device_ptr_mut() as *mut _,
)
}
}
impl<'a, T: CudnnDataType> ReduceTensor<'a, T, NoIndices> {
/// Launches the operation with no indices.
///
/// # Safety
/// The arguments must match the data type/layout specified in the
/// descriptors in `self`.
pub unsafe fn launch<Workspace, A, C>(
&self,
workspace: &mut Workspace,
(alpha, beta): (T, T),
a: &A,
c: &mut C,
) -> Result<(), CudnnError>
where
Workspace: DevicePtrMut<u8>,
A: DevicePtr<T>,
C: DevicePtrMut<T>,
{
result::reduce_tensor(
self.reduce.handle.handle,
self.reduce.desc,
std::ptr::null_mut(),
0,
*workspace.device_ptr_mut() as *mut std::ffi::c_void,
workspace.num_bytes(),
(&alpha) as *const T as *const std::ffi::c_void,
self.a.desc,
*a.device_ptr() as *const _,
(&beta) as *const T as *const std::ffi::c_void,
self.c.desc,
*c.device_ptr_mut() as *mut _,
)
}
}
/* automatically generated by rust-bindgen 0.59.1 */
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CUstream_st {
_unused: [u8; 0],
}
pub type cudaStream_t = *mut CUstream_st;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum libraryPropertyType_t {
MAJOR_VERSION = 0,
MINOR_VERSION = 1,
PATCH_LEVEL = 2,
}
pub use self::libraryPropertyType_t as libraryPropertyType;
pub type __int32_t = ::std::os::raw::c_int;
pub type __uint32_t = ::std::os::raw::c_uint;
pub type __int64_t = ::std::os::raw::c_long;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnContext {
_unused: [u8; 0],
}
pub type cudnnHandle_t = *mut cudnnContext;
extern "C" {
pub fn cudnnGetVersion() -> usize;
}
extern "C" {
pub fn cudnnGetCudartVersion() -> usize;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnStatus_t {
CUDNN_STATUS_SUCCESS = 0,
CUDNN_STATUS_NOT_INITIALIZED = 1,
CUDNN_STATUS_ALLOC_FAILED = 2,
CUDNN_STATUS_BAD_PARAM = 3,
CUDNN_STATUS_INTERNAL_ERROR = 4,
CUDNN_STATUS_INVALID_VALUE = 5,
CUDNN_STATUS_ARCH_MISMATCH = 6,
CUDNN_STATUS_MAPPING_ERROR = 7,
CUDNN_STATUS_EXECUTION_FAILED = 8,
CUDNN_STATUS_NOT_SUPPORTED = 9,
CUDNN_STATUS_LICENSE_ERROR = 10,
CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11,
CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12,
CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13,
CUDNN_STATUS_VERSION_MISMATCH = 14,
}
extern "C" {
pub fn cudnnGetErrorString(status: cudnnStatus_t) -> *const ::std::os::raw::c_char;
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnRuntimeTag_t {
_unused: [u8; 0],
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnErrQueryMode_t {
CUDNN_ERRQUERY_RAWCODE = 0,
CUDNN_ERRQUERY_NONBLOCKING = 1,
CUDNN_ERRQUERY_BLOCKING = 2,
}
extern "C" {
pub fn cudnnQueryRuntimeError(
handle: cudnnHandle_t,
rstatus: *mut cudnnStatus_t,
mode: cudnnErrQueryMode_t,
tag: *mut cudnnRuntimeTag_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetProperty(
type_: libraryPropertyType,
value: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreate(handle: *mut cudnnHandle_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroy(handle: cudnnHandle_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetStream(handle: cudnnHandle_t, streamId: cudaStream_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetStream(handle: cudnnHandle_t, streamId: *mut cudaStream_t) -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnTensorStruct {
_unused: [u8; 0],
}
pub type cudnnTensorDescriptor_t = *mut cudnnTensorStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnPoolingStruct {
_unused: [u8; 0],
}
pub type cudnnPoolingDescriptor_t = *mut cudnnPoolingStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnFilterStruct {
_unused: [u8; 0],
}
pub type cudnnFilterDescriptor_t = *mut cudnnFilterStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnLRNStruct {
_unused: [u8; 0],
}
pub type cudnnLRNDescriptor_t = *mut cudnnLRNStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnActivationStruct {
_unused: [u8; 0],
}
pub type cudnnActivationDescriptor_t = *mut cudnnActivationStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnSpatialTransformerStruct {
_unused: [u8; 0],
}
pub type cudnnSpatialTransformerDescriptor_t = *mut cudnnSpatialTransformerStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnOpTensorStruct {
_unused: [u8; 0],
}
pub type cudnnOpTensorDescriptor_t = *mut cudnnOpTensorStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnReduceTensorStruct {
_unused: [u8; 0],
}
pub type cudnnReduceTensorDescriptor_t = *mut cudnnReduceTensorStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnCTCLossStruct {
_unused: [u8; 0],
}
pub type cudnnCTCLossDescriptor_t = *mut cudnnCTCLossStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnTensorTransformStruct {
_unused: [u8; 0],
}
pub type cudnnTensorTransformDescriptor_t = *mut cudnnTensorTransformStruct;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnDataType_t {
CUDNN_DATA_FLOAT = 0,
CUDNN_DATA_DOUBLE = 1,
CUDNN_DATA_HALF = 2,
CUDNN_DATA_INT8 = 3,
CUDNN_DATA_INT32 = 4,
CUDNN_DATA_INT8x4 = 5,
CUDNN_DATA_UINT8 = 6,
CUDNN_DATA_UINT8x4 = 7,
CUDNN_DATA_INT8x32 = 8,
CUDNN_DATA_BFLOAT16 = 9,
CUDNN_DATA_INT64 = 10,
CUDNN_DATA_BOOLEAN = 11,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnMathType_t {
CUDNN_DEFAULT_MATH = 0,
CUDNN_TENSOR_OP_MATH = 1,
CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
CUDNN_FMA_MATH = 3,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnNanPropagation_t {
CUDNN_NOT_PROPAGATE_NAN = 0,
CUDNN_PROPAGATE_NAN = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnDeterminism_t {
CUDNN_NON_DETERMINISTIC = 0,
CUDNN_DETERMINISTIC = 1,
}
extern "C" {
pub fn cudnnCreateTensorDescriptor(tensorDesc: *mut cudnnTensorDescriptor_t) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnTensorFormat_t {
CUDNN_TENSOR_NCHW = 0,
CUDNN_TENSOR_NHWC = 1,
CUDNN_TENSOR_NCHW_VECT_C = 2,
}
extern "C" {
pub fn cudnnSetTensor4dDescriptor(
tensorDesc: cudnnTensorDescriptor_t,
format: cudnnTensorFormat_t,
dataType: cudnnDataType_t,
n: ::std::os::raw::c_int,
c: ::std::os::raw::c_int,
h: ::std::os::raw::c_int,
w: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetTensor4dDescriptorEx(
tensorDesc: cudnnTensorDescriptor_t,
dataType: cudnnDataType_t,
n: ::std::os::raw::c_int,
c: ::std::os::raw::c_int,
h: ::std::os::raw::c_int,
w: ::std::os::raw::c_int,
nStride: ::std::os::raw::c_int,
cStride: ::std::os::raw::c_int,
hStride: ::std::os::raw::c_int,
wStride: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetTensor4dDescriptor(
tensorDesc: cudnnTensorDescriptor_t,
dataType: *mut cudnnDataType_t,
n: *mut ::std::os::raw::c_int,
c: *mut ::std::os::raw::c_int,
h: *mut ::std::os::raw::c_int,
w: *mut ::std::os::raw::c_int,
nStride: *mut ::std::os::raw::c_int,
cStride: *mut ::std::os::raw::c_int,
hStride: *mut ::std::os::raw::c_int,
wStride: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetTensorNdDescriptor(
tensorDesc: cudnnTensorDescriptor_t,
dataType: cudnnDataType_t,
nbDims: ::std::os::raw::c_int,
dimA: *const ::std::os::raw::c_int,
strideA: *const ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetTensorNdDescriptorEx(
tensorDesc: cudnnTensorDescriptor_t,
format: cudnnTensorFormat_t,
dataType: cudnnDataType_t,
nbDims: ::std::os::raw::c_int,
dimA: *const ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetTensorNdDescriptor(
tensorDesc: cudnnTensorDescriptor_t,
nbDimsRequested: ::std::os::raw::c_int,
dataType: *mut cudnnDataType_t,
nbDims: *mut ::std::os::raw::c_int,
dimA: *mut ::std::os::raw::c_int,
strideA: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetTensorSizeInBytes(
tensorDesc: cudnnTensorDescriptor_t,
size: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyTensorDescriptor(tensorDesc: cudnnTensorDescriptor_t) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnFoldingDirection_t {
CUDNN_TRANSFORM_FOLD = 0,
CUDNN_TRANSFORM_UNFOLD = 1,
}
extern "C" {
pub fn cudnnInitTransformDest(
transformDesc: cudnnTensorTransformDescriptor_t,
srcDesc: cudnnTensorDescriptor_t,
destDesc: cudnnTensorDescriptor_t,
destSizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateTensorTransformDescriptor(
transformDesc: *mut cudnnTensorTransformDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetTensorTransformDescriptor(
transformDesc: cudnnTensorTransformDescriptor_t,
nbDims: u32,
destFormat: cudnnTensorFormat_t,
padBeforeA: *const i32,
padAfterA: *const i32,
foldA: *const u32,
direction: cudnnFoldingDirection_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetTensorTransformDescriptor(
transformDesc: cudnnTensorTransformDescriptor_t,
nbDimsRequested: u32,
destFormat: *mut cudnnTensorFormat_t,
padBeforeA: *mut i32,
padAfterA: *mut i32,
foldA: *mut u32,
direction: *mut cudnnFoldingDirection_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyTensorTransformDescriptor(
transformDesc: cudnnTensorTransformDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnTransformTensor(
handle: cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnTransformTensorEx(
handle: cudnnHandle_t,
transDesc: cudnnTensorTransformDescriptor_t,
alpha: *const ::core::ffi::c_void,
srcDesc: cudnnTensorDescriptor_t,
srcData: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
destDesc: cudnnTensorDescriptor_t,
destData: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnAddTensor(
handle: cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
aDesc: cudnnTensorDescriptor_t,
A: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
cDesc: cudnnTensorDescriptor_t,
C: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnOpTensorOp_t {
CUDNN_OP_TENSOR_ADD = 0,
CUDNN_OP_TENSOR_MUL = 1,
CUDNN_OP_TENSOR_MIN = 2,
CUDNN_OP_TENSOR_MAX = 3,
CUDNN_OP_TENSOR_SQRT = 4,
CUDNN_OP_TENSOR_NOT = 5,
}
extern "C" {
pub fn cudnnCreateOpTensorDescriptor(
opTensorDesc: *mut cudnnOpTensorDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetOpTensorDescriptor(
opTensorDesc: cudnnOpTensorDescriptor_t,
opTensorOp: cudnnOpTensorOp_t,
opTensorCompType: cudnnDataType_t,
opTensorNanOpt: cudnnNanPropagation_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetOpTensorDescriptor(
opTensorDesc: cudnnOpTensorDescriptor_t,
opTensorOp: *mut cudnnOpTensorOp_t,
opTensorCompType: *mut cudnnDataType_t,
opTensorNanOpt: *mut cudnnNanPropagation_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyOpTensorDescriptor(opTensorDesc: cudnnOpTensorDescriptor_t)
-> cudnnStatus_t;
}
extern "C" {
pub fn cudnnOpTensor(
handle: cudnnHandle_t,
opTensorDesc: cudnnOpTensorDescriptor_t,
alpha1: *const ::core::ffi::c_void,
aDesc: cudnnTensorDescriptor_t,
A: *const ::core::ffi::c_void,
alpha2: *const ::core::ffi::c_void,
bDesc: cudnnTensorDescriptor_t,
B: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
cDesc: cudnnTensorDescriptor_t,
C: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnReduceTensorOp_t {
CUDNN_REDUCE_TENSOR_ADD = 0,
CUDNN_REDUCE_TENSOR_MUL = 1,
CUDNN_REDUCE_TENSOR_MIN = 2,
CUDNN_REDUCE_TENSOR_MAX = 3,
CUDNN_REDUCE_TENSOR_AMAX = 4,
CUDNN_REDUCE_TENSOR_AVG = 5,
CUDNN_REDUCE_TENSOR_NORM1 = 6,
CUDNN_REDUCE_TENSOR_NORM2 = 7,
CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnReduceTensorIndices_t {
CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnIndicesType_t {
CUDNN_32BIT_INDICES = 0,
CUDNN_64BIT_INDICES = 1,
CUDNN_16BIT_INDICES = 2,
CUDNN_8BIT_INDICES = 3,
}
extern "C" {
pub fn cudnnCreateReduceTensorDescriptor(
reduceTensorDesc: *mut cudnnReduceTensorDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetReduceTensorDescriptor(
reduceTensorDesc: cudnnReduceTensorDescriptor_t,
reduceTensorOp: cudnnReduceTensorOp_t,
reduceTensorCompType: cudnnDataType_t,
reduceTensorNanOpt: cudnnNanPropagation_t,
reduceTensorIndices: cudnnReduceTensorIndices_t,
reduceTensorIndicesType: cudnnIndicesType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetReduceTensorDescriptor(
reduceTensorDesc: cudnnReduceTensorDescriptor_t,
reduceTensorOp: *mut cudnnReduceTensorOp_t,
reduceTensorCompType: *mut cudnnDataType_t,
reduceTensorNanOpt: *mut cudnnNanPropagation_t,
reduceTensorIndices: *mut cudnnReduceTensorIndices_t,
reduceTensorIndicesType: *mut cudnnIndicesType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyReduceTensorDescriptor(
reduceTensorDesc: cudnnReduceTensorDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetReductionIndicesSize(
handle: cudnnHandle_t,
reduceTensorDesc: cudnnReduceTensorDescriptor_t,
aDesc: cudnnTensorDescriptor_t,
cDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetReductionWorkspaceSize(
handle: cudnnHandle_t,
reduceTensorDesc: cudnnReduceTensorDescriptor_t,
aDesc: cudnnTensorDescriptor_t,
cDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnReduceTensor(
handle: cudnnHandle_t,
reduceTensorDesc: cudnnReduceTensorDescriptor_t,
indices: *mut ::core::ffi::c_void,
indicesSizeInBytes: usize,
workspace: *mut ::core::ffi::c_void,
workspaceSizeInBytes: usize,
alpha: *const ::core::ffi::c_void,
aDesc: cudnnTensorDescriptor_t,
A: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
cDesc: cudnnTensorDescriptor_t,
C: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetTensor(
handle: cudnnHandle_t,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
valuePtr: *const ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnScaleTensor(
handle: cudnnHandle_t,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
alpha: *const ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateFilterDescriptor(filterDesc: *mut cudnnFilterDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetFilter4dDescriptor(
filterDesc: cudnnFilterDescriptor_t,
dataType: cudnnDataType_t,
format: cudnnTensorFormat_t,
k: ::std::os::raw::c_int,
c: ::std::os::raw::c_int,
h: ::std::os::raw::c_int,
w: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetFilter4dDescriptor(
filterDesc: cudnnFilterDescriptor_t,
dataType: *mut cudnnDataType_t,
format: *mut cudnnTensorFormat_t,
k: *mut ::std::os::raw::c_int,
c: *mut ::std::os::raw::c_int,
h: *mut ::std::os::raw::c_int,
w: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetFilterNdDescriptor(
filterDesc: cudnnFilterDescriptor_t,
dataType: cudnnDataType_t,
format: cudnnTensorFormat_t,
nbDims: ::std::os::raw::c_int,
filterDimA: *const ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetFilterNdDescriptor(
filterDesc: cudnnFilterDescriptor_t,
nbDimsRequested: ::std::os::raw::c_int,
dataType: *mut cudnnDataType_t,
format: *mut cudnnTensorFormat_t,
nbDims: *mut ::std::os::raw::c_int,
filterDimA: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetFilterSizeInBytes(
filterDesc: cudnnFilterDescriptor_t,
size: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnTransformFilter(
handle: cudnnHandle_t,
transDesc: cudnnTensorTransformDescriptor_t,
alpha: *const ::core::ffi::c_void,
srcDesc: cudnnFilterDescriptor_t,
srcData: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
destDesc: cudnnFilterDescriptor_t,
destData: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyFilterDescriptor(filterDesc: cudnnFilterDescriptor_t) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnSoftmaxAlgorithm_t {
CUDNN_SOFTMAX_FAST = 0,
CUDNN_SOFTMAX_ACCURATE = 1,
CUDNN_SOFTMAX_LOG = 2,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnSoftmaxMode_t {
CUDNN_SOFTMAX_MODE_INSTANCE = 0,
CUDNN_SOFTMAX_MODE_CHANNEL = 1,
}
extern "C" {
pub fn cudnnSoftmaxForward(
handle: cudnnHandle_t,
algo: cudnnSoftmaxAlgorithm_t,
mode: cudnnSoftmaxMode_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnPoolingMode_t {
CUDNN_POOLING_MAX = 0,
CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1,
CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2,
CUDNN_POOLING_MAX_DETERMINISTIC = 3,
}
extern "C" {
pub fn cudnnCreatePoolingDescriptor(
poolingDesc: *mut cudnnPoolingDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetPooling2dDescriptor(
poolingDesc: cudnnPoolingDescriptor_t,
mode: cudnnPoolingMode_t,
maxpoolingNanOpt: cudnnNanPropagation_t,
windowHeight: ::std::os::raw::c_int,
windowWidth: ::std::os::raw::c_int,
verticalPadding: ::std::os::raw::c_int,
horizontalPadding: ::std::os::raw::c_int,
verticalStride: ::std::os::raw::c_int,
horizontalStride: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetPooling2dDescriptor(
poolingDesc: cudnnPoolingDescriptor_t,
mode: *mut cudnnPoolingMode_t,
maxpoolingNanOpt: *mut cudnnNanPropagation_t,
windowHeight: *mut ::std::os::raw::c_int,
windowWidth: *mut ::std::os::raw::c_int,
verticalPadding: *mut ::std::os::raw::c_int,
horizontalPadding: *mut ::std::os::raw::c_int,
verticalStride: *mut ::std::os::raw::c_int,
horizontalStride: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetPoolingNdDescriptor(
poolingDesc: cudnnPoolingDescriptor_t,
mode: cudnnPoolingMode_t,
maxpoolingNanOpt: cudnnNanPropagation_t,
nbDims: ::std::os::raw::c_int,
windowDimA: *const ::std::os::raw::c_int,
paddingA: *const ::std::os::raw::c_int,
strideA: *const ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetPoolingNdDescriptor(
poolingDesc: cudnnPoolingDescriptor_t,
nbDimsRequested: ::std::os::raw::c_int,
mode: *mut cudnnPoolingMode_t,
maxpoolingNanOpt: *mut cudnnNanPropagation_t,
nbDims: *mut ::std::os::raw::c_int,
windowDimA: *mut ::std::os::raw::c_int,
paddingA: *mut ::std::os::raw::c_int,
strideA: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetPoolingNdForwardOutputDim(
poolingDesc: cudnnPoolingDescriptor_t,
inputTensorDesc: cudnnTensorDescriptor_t,
nbDims: ::std::os::raw::c_int,
outputTensorDimA: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetPooling2dForwardOutputDim(
poolingDesc: cudnnPoolingDescriptor_t,
inputTensorDesc: cudnnTensorDescriptor_t,
n: *mut ::std::os::raw::c_int,
c: *mut ::std::os::raw::c_int,
h: *mut ::std::os::raw::c_int,
w: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyPoolingDescriptor(poolingDesc: cudnnPoolingDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnPoolingForward(
handle: cudnnHandle_t,
poolingDesc: cudnnPoolingDescriptor_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnActivationMode_t {
CUDNN_ACTIVATION_SIGMOID = 0,
CUDNN_ACTIVATION_RELU = 1,
CUDNN_ACTIVATION_TANH = 2,
CUDNN_ACTIVATION_CLIPPED_RELU = 3,
CUDNN_ACTIVATION_ELU = 4,
CUDNN_ACTIVATION_IDENTITY = 5,
CUDNN_ACTIVATION_SWISH = 6,
}
extern "C" {
pub fn cudnnCreateActivationDescriptor(
activationDesc: *mut cudnnActivationDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetActivationDescriptor(
activationDesc: cudnnActivationDescriptor_t,
mode: cudnnActivationMode_t,
reluNanOpt: cudnnNanPropagation_t,
coef: f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetActivationDescriptor(
activationDesc: cudnnActivationDescriptor_t,
mode: *mut cudnnActivationMode_t,
reluNanOpt: *mut cudnnNanPropagation_t,
coef: *mut f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetActivationDescriptorSwishBeta(
activationDesc: cudnnActivationDescriptor_t,
swish_beta: f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetActivationDescriptorSwishBeta(
activationDesc: cudnnActivationDescriptor_t,
swish_beta: *mut f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyActivationDescriptor(
activationDesc: cudnnActivationDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnActivationForward(
handle: cudnnHandle_t,
activationDesc: cudnnActivationDescriptor_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateLRNDescriptor(normDesc: *mut cudnnLRNDescriptor_t) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnLRNMode_t {
CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0,
}
extern "C" {
pub fn cudnnSetLRNDescriptor(
normDesc: cudnnLRNDescriptor_t,
lrnN: ::std::os::raw::c_uint,
lrnAlpha: f64,
lrnBeta: f64,
lrnK: f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetLRNDescriptor(
normDesc: cudnnLRNDescriptor_t,
lrnN: *mut ::std::os::raw::c_uint,
lrnAlpha: *mut f64,
lrnBeta: *mut f64,
lrnK: *mut f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyLRNDescriptor(lrnDesc: cudnnLRNDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnLRNCrossChannelForward(
handle: cudnnHandle_t,
normDesc: cudnnLRNDescriptor_t,
lrnMode: cudnnLRNMode_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnDivNormMode_t {
CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
}
extern "C" {
pub fn cudnnDivisiveNormalizationForward(
handle: cudnnHandle_t,
normDesc: cudnnLRNDescriptor_t,
mode: cudnnDivNormMode_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
means: *const ::core::ffi::c_void,
temp: *mut ::core::ffi::c_void,
temp2: *mut ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBatchNormMode_t {
CUDNN_BATCHNORM_PER_ACTIVATION = 0,
CUDNN_BATCHNORM_SPATIAL = 1,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
}
extern "C" {
pub fn cudnnDeriveBNTensorDescriptor(
derivedBnDesc: cudnnTensorDescriptor_t,
xDesc: cudnnTensorDescriptor_t,
mode: cudnnBatchNormMode_t,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBatchNormOps_t {
CUDNN_BATCHNORM_OPS_BN = 0,
CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1,
CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2,
}
extern "C" {
pub fn cudnnBatchNormalizationForwardInference(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
alpha: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
bnScaleBiasMeanVarDesc: cudnnTensorDescriptor_t,
bnScale: *const ::core::ffi::c_void,
bnBias: *const ::core::ffi::c_void,
estimatedMean: *const ::core::ffi::c_void,
estimatedVariance: *const ::core::ffi::c_void,
epsilon: f64,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnNormMode_t {
CUDNN_NORM_PER_ACTIVATION = 0,
CUDNN_NORM_PER_CHANNEL = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnNormAlgo_t {
CUDNN_NORM_ALGO_STANDARD = 0,
CUDNN_NORM_ALGO_PERSIST = 1,
}
extern "C" {
pub fn cudnnDeriveNormTensorDescriptor(
derivedNormScaleBiasDesc: cudnnTensorDescriptor_t,
derivedNormMeanVarDesc: cudnnTensorDescriptor_t,
xDesc: cudnnTensorDescriptor_t,
mode: cudnnNormMode_t,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnNormOps_t {
CUDNN_NORM_OPS_NORM = 0,
CUDNN_NORM_OPS_NORM_ACTIVATION = 1,
CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2,
}
extern "C" {
pub fn cudnnNormalizationForwardInference(
handle: cudnnHandle_t,
mode: cudnnNormMode_t,
normOps: cudnnNormOps_t,
algo: cudnnNormAlgo_t,
alpha: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
normScaleBiasDesc: cudnnTensorDescriptor_t,
normScale: *const ::core::ffi::c_void,
normBias: *const ::core::ffi::c_void,
normMeanVarDesc: cudnnTensorDescriptor_t,
estimatedMean: *const ::core::ffi::c_void,
estimatedVariance: *const ::core::ffi::c_void,
zDesc: cudnnTensorDescriptor_t,
z: *const ::core::ffi::c_void,
activationDesc: cudnnActivationDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
epsilon: f64,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnSamplerType_t {
CUDNN_SAMPLER_BILINEAR = 0,
}
extern "C" {
pub fn cudnnCreateSpatialTransformerDescriptor(
stDesc: *mut cudnnSpatialTransformerDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetSpatialTransformerNdDescriptor(
stDesc: cudnnSpatialTransformerDescriptor_t,
samplerType: cudnnSamplerType_t,
dataType: cudnnDataType_t,
nbDims: ::std::os::raw::c_int,
dimA: *const ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroySpatialTransformerDescriptor(
stDesc: cudnnSpatialTransformerDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSpatialTfGridGeneratorForward(
handle: cudnnHandle_t,
stDesc: cudnnSpatialTransformerDescriptor_t,
theta: *const ::core::ffi::c_void,
grid: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSpatialTfSamplerForward(
handle: cudnnHandle_t,
stDesc: cudnnSpatialTransformerDescriptor_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
grid: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnDropoutStruct {
_unused: [u8; 0],
}
pub type cudnnDropoutDescriptor_t = *mut cudnnDropoutStruct;
extern "C" {
pub fn cudnnCreateDropoutDescriptor(
dropoutDesc: *mut cudnnDropoutDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyDropoutDescriptor(dropoutDesc: cudnnDropoutDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDropoutGetStatesSize(
handle: cudnnHandle_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDropoutGetReserveSpaceSize(
xdesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetDropoutDescriptor(
dropoutDesc: cudnnDropoutDescriptor_t,
handle: cudnnHandle_t,
dropout: f32,
states: *mut ::core::ffi::c_void,
stateSizeInBytes: usize,
seed: ::std::os::raw::c_ulonglong,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRestoreDropoutDescriptor(
dropoutDesc: cudnnDropoutDescriptor_t,
handle: cudnnHandle_t,
dropout: f32,
states: *mut ::core::ffi::c_void,
stateSizeInBytes: usize,
seed: ::std::os::raw::c_ulonglong,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetDropoutDescriptor(
dropoutDesc: cudnnDropoutDescriptor_t,
handle: cudnnHandle_t,
dropout: *mut f32,
states: *mut *mut ::core::ffi::c_void,
seed: *mut ::std::os::raw::c_ulonglong,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDropoutForward(
handle: cudnnHandle_t,
dropoutDesc: cudnnDropoutDescriptor_t,
xdesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
ydesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnAlgorithmStruct {
_unused: [u8; 0],
}
pub type cudnnAlgorithmDescriptor_t = *mut cudnnAlgorithmStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnAlgorithmPerformanceStruct {
_unused: [u8; 0],
}
pub type cudnnAlgorithmPerformance_t = *mut cudnnAlgorithmPerformanceStruct;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnConvolutionFwdAlgo_t {
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnConvolutionBwdFilterAlgo_t {
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnConvolutionBwdDataAlgo_t {
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnRNNAlgo_t {
CUDNN_RNN_ALGO_STANDARD = 0,
CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
CUDNN_RNN_ALGO_COUNT = 4,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnCTCLossAlgo_t {
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0,
CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1,
}
#[repr(C)]
#[derive(Copy, Clone)]
pub struct cudnnAlgorithmUnionStruct {
pub algo: cudnnAlgorithmUnionStruct_Algorithm,
}
#[repr(C)]
#[derive(Copy, Clone)]
pub union cudnnAlgorithmUnionStruct_Algorithm {
pub convFwdAlgo: cudnnConvolutionFwdAlgo_t,
pub convBwdFilterAlgo: cudnnConvolutionBwdFilterAlgo_t,
pub convBwdDataAlgo: cudnnConvolutionBwdDataAlgo_t,
pub RNNAlgo: cudnnRNNAlgo_t,
pub CTCLossAlgo: cudnnCTCLossAlgo_t,
}
#[test]
fn bindgen_test_layout_cudnnAlgorithmUnionStruct_Algorithm() {
    // Field offsets are computed from a pointer to an uninitialized value,
    // which is never dereferenced, rather than from a null pointer.
    const UNINIT: ::core::mem::MaybeUninit<cudnnAlgorithmUnionStruct_Algorithm> =
        ::core::mem::MaybeUninit::uninit();
    let ptr = UNINIT.as_ptr();
    assert_eq!(
        ::core::mem::size_of::<cudnnAlgorithmUnionStruct_Algorithm>(),
        4usize,
        concat!("Size of: ", stringify!(cudnnAlgorithmUnionStruct_Algorithm))
    );
    assert_eq!(
        ::core::mem::align_of::<cudnnAlgorithmUnionStruct_Algorithm>(),
        4usize,
        concat!(
            "Alignment of ",
            stringify!(cudnnAlgorithmUnionStruct_Algorithm)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).convFwdAlgo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnAlgorithmUnionStruct_Algorithm),
            "::",
            stringify!(convFwdAlgo)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).convBwdFilterAlgo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnAlgorithmUnionStruct_Algorithm),
            "::",
            stringify!(convBwdFilterAlgo)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).convBwdDataAlgo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnAlgorithmUnionStruct_Algorithm),
            "::",
            stringify!(convBwdDataAlgo)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).RNNAlgo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnAlgorithmUnionStruct_Algorithm),
            "::",
            stringify!(RNNAlgo)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).CTCLossAlgo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnAlgorithmUnionStruct_Algorithm),
            "::",
            stringify!(CTCLossAlgo)
        )
    );
}
impl Default for cudnnAlgorithmUnionStruct_Algorithm {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
#[test]
fn bindgen_test_layout_cudnnAlgorithmUnionStruct() {
    const UNINIT: ::core::mem::MaybeUninit<cudnnAlgorithmUnionStruct> =
        ::core::mem::MaybeUninit::uninit();
    let ptr = UNINIT.as_ptr();
    assert_eq!(
        ::core::mem::size_of::<cudnnAlgorithmUnionStruct>(),
        4usize,
        concat!("Size of: ", stringify!(cudnnAlgorithmUnionStruct))
    );
    assert_eq!(
        ::core::mem::align_of::<cudnnAlgorithmUnionStruct>(),
        4usize,
        concat!("Alignment of ", stringify!(cudnnAlgorithmUnionStruct))
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).algo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnAlgorithmUnionStruct),
            "::",
            stringify!(algo)
        )
    );
}
impl Default for cudnnAlgorithmUnionStruct {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type cudnnAlgorithm_t = cudnnAlgorithmUnionStruct;
extern "C" {
pub fn cudnnCreateAlgorithmDescriptor(
algoDesc: *mut cudnnAlgorithmDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetAlgorithmDescriptor(
algoDesc: cudnnAlgorithmDescriptor_t,
algorithm: cudnnAlgorithm_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetAlgorithmDescriptor(
algoDesc: cudnnAlgorithmDescriptor_t,
algorithm: *mut cudnnAlgorithm_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCopyAlgorithmDescriptor(
src: cudnnAlgorithmDescriptor_t,
dest: cudnnAlgorithmDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyAlgorithmDescriptor(algoDesc: cudnnAlgorithmDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateAlgorithmPerformance(
algoPerf: *mut cudnnAlgorithmPerformance_t,
numberToCreate: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetAlgorithmPerformance(
algoPerf: cudnnAlgorithmPerformance_t,
algoDesc: cudnnAlgorithmDescriptor_t,
status: cudnnStatus_t,
time: f32,
memory: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetAlgorithmPerformance(
algoPerf: cudnnAlgorithmPerformance_t,
algoDesc: *mut cudnnAlgorithmDescriptor_t,
status: *mut cudnnStatus_t,
time: *mut f32,
memory: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyAlgorithmPerformance(
algoPerf: *mut cudnnAlgorithmPerformance_t,
numberToDestroy: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetAlgorithmSpaceSize(
handle: cudnnHandle_t,
algoDesc: cudnnAlgorithmDescriptor_t,
algoSpaceSizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSaveAlgorithm(
handle: cudnnHandle_t,
algoDesc: cudnnAlgorithmDescriptor_t,
algoSpace: *mut ::core::ffi::c_void,
algoSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRestoreAlgorithm(
handle: cudnnHandle_t,
algoSpace: *mut ::core::ffi::c_void,
algoSpaceSizeInBytes: usize,
algoDesc: cudnnAlgorithmDescriptor_t,
) -> cudnnStatus_t;
}
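// A minimal sketch of filling the generic algorithm descriptor through the
// `cudnnAlgorithmUnionStruct_Algorithm` union above, choosing a
// forward-convolution algorithm. Everything used here is declared in this
// file; statuses are left unchecked for brevity.
#[allow(dead_code)]
unsafe fn algorithm_descriptor_sketch() -> cudnnStatus_t {
    let mut desc: cudnnAlgorithmDescriptor_t = ::core::ptr::null_mut();
    cudnnCreateAlgorithmDescriptor(&mut desc);
    let algo = cudnnAlgorithm_t {
        algo: cudnnAlgorithmUnionStruct_Algorithm {
            convFwdAlgo: cudnnConvolutionFwdAlgo_t::CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
        },
    };
    cudnnSetAlgorithmDescriptor(desc, algo);
    cudnnDestroyAlgorithmDescriptor(desc)
}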
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnSeverity_t {
CUDNN_SEV_FATAL = 0,
CUDNN_SEV_ERROR = 1,
CUDNN_SEV_WARNING = 2,
CUDNN_SEV_INFO = 3,
}
#[repr(C)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cudnnDebugStruct {
pub cudnn_version: ::std::os::raw::c_uint,
pub cudnnStatus: cudnnStatus_t,
pub time_sec: ::std::os::raw::c_uint,
pub time_usec: ::std::os::raw::c_uint,
pub time_delta: ::std::os::raw::c_uint,
pub handle: cudnnHandle_t,
pub stream: cudaStream_t,
pub pid: ::std::os::raw::c_ulonglong,
pub tid: ::std::os::raw::c_ulonglong,
pub cudaDeviceId: ::std::os::raw::c_int,
pub reserved: [::std::os::raw::c_int; 15usize],
}
#[test]
fn bindgen_test_layout_cudnnDebugStruct() {
    const UNINIT: ::core::mem::MaybeUninit<cudnnDebugStruct> =
        ::core::mem::MaybeUninit::uninit();
    let ptr = UNINIT.as_ptr();
    assert_eq!(
        ::core::mem::size_of::<cudnnDebugStruct>(),
        120usize,
        concat!("Size of: ", stringify!(cudnnDebugStruct))
    );
    assert_eq!(
        ::core::mem::align_of::<cudnnDebugStruct>(),
        8usize,
        concat!("Alignment of ", stringify!(cudnnDebugStruct))
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).cudnn_version) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(cudnn_version)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).cudnnStatus) as usize - ptr as usize },
        4usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(cudnnStatus)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).time_sec) as usize - ptr as usize },
        8usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(time_sec)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).time_usec) as usize - ptr as usize },
        12usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(time_usec)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).time_delta) as usize - ptr as usize },
        16usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(time_delta)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).handle) as usize - ptr as usize },
        24usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(handle)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).stream) as usize - ptr as usize },
        32usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(stream)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).pid) as usize - ptr as usize },
        40usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(pid)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).tid) as usize - ptr as usize },
        48usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(tid)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).cudaDeviceId) as usize - ptr as usize },
        56usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(cudaDeviceId)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).reserved) as usize - ptr as usize },
        60usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnDebugStruct),
            "::",
            stringify!(reserved)
        )
    );
}
impl Default for cudnnDebugStruct {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type cudnnDebug_t = cudnnDebugStruct;
pub type cudnnCallback_t = ::core::option::Option<
unsafe extern "C" fn(
sev: cudnnSeverity_t,
udata: *mut ::core::ffi::c_void,
dbg: *const cudnnDebug_t,
msg: *const ::std::os::raw::c_char,
),
>;
extern "C" {
pub fn cudnnSetCallback(
mask: ::std::os::raw::c_uint,
udata: *mut ::core::ffi::c_void,
fptr: cudnnCallback_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetCallback(
mask: *mut ::std::os::raw::c_uint,
udata: *mut *mut ::core::ffi::c_void,
fptr: *mut cudnnCallback_t,
) -> cudnnStatus_t;
}
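// A minimal sketch of a logging callback matching the `cudnnCallback_t`
// shape above. The body is intentionally empty; a real callback would
// forward `msg` to some logging sink. The severity mask is an assumption
// based on the cuDNN convention of one bit per `cudnnSeverity_t` value.
#[allow(dead_code)]
unsafe extern "C" fn debug_callback_sketch(
    _sev: cudnnSeverity_t,
    _udata: *mut ::core::ffi::c_void,
    _dbg: *const cudnnDebug_t,
    _msg: *const ::std::os::raw::c_char,
) {
}
#[allow(dead_code)]
unsafe fn install_callback_sketch() -> cudnnStatus_t {
    let mask = (1u32 << cudnnSeverity_t::CUDNN_SEV_ERROR as u32)
        | (1u32 << cudnnSeverity_t::CUDNN_SEV_WARNING as u32);
    cudnnSetCallback(mask, ::core::ptr::null_mut(), Some(debug_callback_sketch))
}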
extern "C" {
pub fn cudnnOpsInferVersionCheck() -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSoftmaxBackward(
handle: cudnnHandle_t,
algo: cudnnSoftmaxAlgorithm_t,
mode: cudnnSoftmaxMode_t,
alpha: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnPoolingBackward(
handle: cudnnHandle_t,
poolingDesc: cudnnPoolingDescriptor_t,
alpha: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnActivationBackward(
handle: cudnnHandle_t,
activationDesc: cudnnActivationDescriptor_t,
alpha: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnLRNCrossChannelBackward(
handle: cudnnHandle_t,
normDesc: cudnnLRNDescriptor_t,
lrnMode: cudnnLRNMode_t,
alpha: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDivisiveNormalizationBackward(
handle: cudnnHandle_t,
normDesc: cudnnLRNDescriptor_t,
mode: cudnnDivNormMode_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
means: *const ::core::ffi::c_void,
dy: *const ::core::ffi::c_void,
temp: *mut ::core::ffi::c_void,
temp2: *mut ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dXdMeansDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
dMeans: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
bnOps: cudnnBatchNormOps_t,
xDesc: cudnnTensorDescriptor_t,
zDesc: cudnnTensorDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
bnScaleBiasMeanVarDesc: cudnnTensorDescriptor_t,
activationDesc: cudnnActivationDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetBatchNormalizationBackwardExWorkspaceSize(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
bnOps: cudnnBatchNormOps_t,
xDesc: cudnnTensorDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
dyDesc: cudnnTensorDescriptor_t,
dzDesc: cudnnTensorDescriptor_t,
dxDesc: cudnnTensorDescriptor_t,
dBnScaleBiasDesc: cudnnTensorDescriptor_t,
activationDesc: cudnnActivationDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
bnOps: cudnnBatchNormOps_t,
activationDesc: cudnnActivationDescriptor_t,
xDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBatchNormalizationForwardTraining(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
alpha: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
bnScaleBiasMeanVarDesc: cudnnTensorDescriptor_t,
bnScale: *const ::core::ffi::c_void,
bnBias: *const ::core::ffi::c_void,
exponentialAverageFactor: f64,
resultRunningMean: *mut ::core::ffi::c_void,
resultRunningVariance: *mut ::core::ffi::c_void,
epsilon: f64,
resultSaveMean: *mut ::core::ffi::c_void,
resultSaveInvVariance: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBatchNormalizationForwardTrainingEx(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
bnOps: cudnnBatchNormOps_t,
alpha: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
xData: *const ::core::ffi::c_void,
zDesc: cudnnTensorDescriptor_t,
zData: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
yData: *mut ::core::ffi::c_void,
bnScaleBiasMeanVarDesc: cudnnTensorDescriptor_t,
bnScale: *const ::core::ffi::c_void,
bnBias: *const ::core::ffi::c_void,
exponentialAverageFactor: f64,
resultRunningMean: *mut ::core::ffi::c_void,
resultRunningVariance: *mut ::core::ffi::c_void,
epsilon: f64,
resultSaveMean: *mut ::core::ffi::c_void,
resultSaveInvVariance: *mut ::core::ffi::c_void,
activationDesc: cudnnActivationDescriptor_t,
workspace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
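// A minimal sketch of the two size queries that precede
// cudnnBatchNormalizationForwardTrainingEx. The tensor and activation
// descriptors are assumed to be configured elsewhere, and
// CUDNN_BATCHNORM_SPATIAL / CUDNN_BATCHNORM_OPS_BN are assumed to be the
// enum variants generated earlier in this file from cudnn.h.
#[allow(dead_code)]
unsafe fn bn_forward_training_ex_sizes_sketch(
    handle: cudnnHandle_t,
    x_desc: cudnnTensorDescriptor_t,
    z_desc: cudnnTensorDescriptor_t,
    y_desc: cudnnTensorDescriptor_t,
    bn_scale_bias_mean_var_desc: cudnnTensorDescriptor_t,
    activation_desc: cudnnActivationDescriptor_t,
) -> (usize, usize) {
    let (mut workspace, mut reserve) = (0usize, 0usize);
    cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
        handle,
        cudnnBatchNormMode_t::CUDNN_BATCHNORM_SPATIAL,
        cudnnBatchNormOps_t::CUDNN_BATCHNORM_OPS_BN,
        x_desc,
        z_desc,
        y_desc,
        bn_scale_bias_mean_var_desc,
        activation_desc,
        &mut workspace,
    );
    cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
        handle,
        cudnnBatchNormMode_t::CUDNN_BATCHNORM_SPATIAL,
        cudnnBatchNormOps_t::CUDNN_BATCHNORM_OPS_BN,
        activation_desc,
        x_desc,
        &mut reserve,
    );
    (workspace, reserve)
}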
extern "C" {
pub fn cudnnBatchNormalizationBackward(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
alphaDataDiff: *const ::core::ffi::c_void,
betaDataDiff: *const ::core::ffi::c_void,
alphaParamDiff: *const ::core::ffi::c_void,
betaParamDiff: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
dBnScaleBiasDesc: cudnnTensorDescriptor_t,
bnScale: *const ::core::ffi::c_void,
dBnScaleResult: *mut ::core::ffi::c_void,
dBnBiasResult: *mut ::core::ffi::c_void,
epsilon: f64,
savedMean: *const ::core::ffi::c_void,
savedInvVariance: *const ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBatchNormalizationBackwardEx(
handle: cudnnHandle_t,
mode: cudnnBatchNormMode_t,
bnOps: cudnnBatchNormOps_t,
alphaDataDiff: *const ::core::ffi::c_void,
betaDataDiff: *const ::core::ffi::c_void,
alphaParamDiff: *const ::core::ffi::c_void,
betaParamDiff: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
xData: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
yData: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dyData: *const ::core::ffi::c_void,
dzDesc: cudnnTensorDescriptor_t,
dzData: *mut ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dxData: *mut ::core::ffi::c_void,
dBnScaleBiasDesc: cudnnTensorDescriptor_t,
bnScaleData: *const ::core::ffi::c_void,
bnBiasData: *const ::core::ffi::c_void,
dBnScaleData: *mut ::core::ffi::c_void,
dBnBiasData: *mut ::core::ffi::c_void,
epsilon: f64,
savedMean: *const ::core::ffi::c_void,
savedInvVariance: *const ::core::ffi::c_void,
activationDesc: cudnnActivationDescriptor_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetNormalizationForwardTrainingWorkspaceSize(
handle: cudnnHandle_t,
mode: cudnnNormMode_t,
normOps: cudnnNormOps_t,
algo: cudnnNormAlgo_t,
xDesc: cudnnTensorDescriptor_t,
zDesc: cudnnTensorDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
normScaleBiasDesc: cudnnTensorDescriptor_t,
activationDesc: cudnnActivationDescriptor_t,
normMeanVarDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetNormalizationBackwardWorkspaceSize(
handle: cudnnHandle_t,
mode: cudnnNormMode_t,
normOps: cudnnNormOps_t,
algo: cudnnNormAlgo_t,
xDesc: cudnnTensorDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
dyDesc: cudnnTensorDescriptor_t,
dzDesc: cudnnTensorDescriptor_t,
dxDesc: cudnnTensorDescriptor_t,
dNormScaleBiasDesc: cudnnTensorDescriptor_t,
activationDesc: cudnnActivationDescriptor_t,
normMeanVarDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetNormalizationTrainingReserveSpaceSize(
handle: cudnnHandle_t,
mode: cudnnNormMode_t,
normOps: cudnnNormOps_t,
algo: cudnnNormAlgo_t,
activationDesc: cudnnActivationDescriptor_t,
xDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnNormalizationForwardTraining(
handle: cudnnHandle_t,
mode: cudnnNormMode_t,
normOps: cudnnNormOps_t,
algo: cudnnNormAlgo_t,
alpha: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
xData: *const ::core::ffi::c_void,
normScaleBiasDesc: cudnnTensorDescriptor_t,
normScale: *const ::core::ffi::c_void,
normBias: *const ::core::ffi::c_void,
exponentialAverageFactor: f64,
normMeanVarDesc: cudnnTensorDescriptor_t,
resultRunningMean: *mut ::core::ffi::c_void,
resultRunningVariance: *mut ::core::ffi::c_void,
epsilon: f64,
resultSaveMean: *mut ::core::ffi::c_void,
resultSaveInvVariance: *mut ::core::ffi::c_void,
activationDesc: cudnnActivationDescriptor_t,
zDesc: cudnnTensorDescriptor_t,
zData: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
yData: *mut ::core::ffi::c_void,
workspace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnNormalizationBackward(
handle: cudnnHandle_t,
mode: cudnnNormMode_t,
normOps: cudnnNormOps_t,
algo: cudnnNormAlgo_t,
alphaDataDiff: *const ::core::ffi::c_void,
betaDataDiff: *const ::core::ffi::c_void,
alphaParamDiff: *const ::core::ffi::c_void,
betaParamDiff: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
xData: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
yData: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dyData: *const ::core::ffi::c_void,
dzDesc: cudnnTensorDescriptor_t,
dzData: *mut ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dxData: *mut ::core::ffi::c_void,
dNormScaleBiasDesc: cudnnTensorDescriptor_t,
normScaleData: *const ::core::ffi::c_void,
normBiasData: *const ::core::ffi::c_void,
dNormScaleData: *mut ::core::ffi::c_void,
dNormBiasData: *mut ::core::ffi::c_void,
epsilon: f64,
normMeanVarDesc: cudnnTensorDescriptor_t,
savedMean: *const ::core::ffi::c_void,
savedInvVariance: *const ::core::ffi::c_void,
activationDesc: cudnnActivationDescriptor_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
groupCnt: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSpatialTfGridGeneratorBackward(
handle: cudnnHandle_t,
stDesc: cudnnSpatialTransformerDescriptor_t,
dgrid: *const ::core::ffi::c_void,
dtheta: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSpatialTfSamplerBackward(
handle: cudnnHandle_t,
stDesc: cudnnSpatialTransformerDescriptor_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
alphaDgrid: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
grid: *const ::core::ffi::c_void,
betaDgrid: *const ::core::ffi::c_void,
dgrid: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDropoutBackward(
handle: cudnnHandle_t,
dropoutDesc: cudnnDropoutDescriptor_t,
dydesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
dxdesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnOpsTrainVersionCheck() -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnForwardMode_t {
CUDNN_FWD_MODE_INFERENCE = 0,
CUDNN_FWD_MODE_TRAINING = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnRNNMode_t {
CUDNN_RNN_RELU = 0,
CUDNN_RNN_TANH = 1,
CUDNN_LSTM = 2,
CUDNN_GRU = 3,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnRNNBiasMode_t {
CUDNN_RNN_NO_BIAS = 0,
CUDNN_RNN_SINGLE_INP_BIAS = 1,
CUDNN_RNN_DOUBLE_BIAS = 2,
CUDNN_RNN_SINGLE_REC_BIAS = 3,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnDirectionMode_t {
CUDNN_UNIDIRECTIONAL = 0,
CUDNN_BIDIRECTIONAL = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnRNNInputMode_t {
CUDNN_LINEAR_INPUT = 0,
CUDNN_SKIP_INPUT = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnRNNClipMode_t {
CUDNN_RNN_CLIP_NONE = 0,
CUDNN_RNN_CLIP_MINMAX = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnRNNDataLayout_t {
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0,
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1,
CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2,
}
pub type cudnnRNNPaddingMode_t = ::std::os::raw::c_uint;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnRNNStruct {
_unused: [u8; 0],
}
pub type cudnnRNNDescriptor_t = *mut cudnnRNNStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnPersistentRNNPlan {
_unused: [u8; 0],
}
pub type cudnnPersistentRNNPlan_t = *mut cudnnPersistentRNNPlan;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnRNNDataStruct {
_unused: [u8; 0],
}
pub type cudnnRNNDataDescriptor_t = *mut cudnnRNNDataStruct;
extern "C" {
pub fn cudnnCreateRNNDescriptor(rnnDesc: *mut cudnnRNNDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyRNNDescriptor(rnnDesc: cudnnRNNDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNDescriptor_v8(
rnnDesc: cudnnRNNDescriptor_t,
algo: cudnnRNNAlgo_t,
cellMode: cudnnRNNMode_t,
biasMode: cudnnRNNBiasMode_t,
dirMode: cudnnDirectionMode_t,
inputMode: cudnnRNNInputMode_t,
dataType: cudnnDataType_t,
mathPrec: cudnnDataType_t,
mathType: cudnnMathType_t,
inputSize: i32,
hiddenSize: i32,
projSize: i32,
numLayers: i32,
dropoutDesc: cudnnDropoutDescriptor_t,
auxFlags: u32,
) -> cudnnStatus_t;
}
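// A minimal sketch of configuring a two-layer unidirectional FP32 LSTM with
// the v8 descriptor. `dropout` is assumed to be prepared as in the dropout
// sketch above; projSize == hiddenSize disables recurrent projection, and
// auxFlags == 0 requests the default (unpadded) IO layout. CUDNN_DATA_FLOAT
// and CUDNN_DEFAULT_MATH are assumed to be the variants generated earlier
// in this file.
#[allow(dead_code)]
unsafe fn lstm_descriptor_v8_sketch(
    dropout: cudnnDropoutDescriptor_t,
) -> cudnnRNNDescriptor_t {
    let mut rnn: cudnnRNNDescriptor_t = ::core::ptr::null_mut();
    cudnnCreateRNNDescriptor(&mut rnn);
    cudnnSetRNNDescriptor_v8(
        rnn,
        cudnnRNNAlgo_t::CUDNN_RNN_ALGO_STANDARD,
        cudnnRNNMode_t::CUDNN_LSTM,
        cudnnRNNBiasMode_t::CUDNN_RNN_DOUBLE_BIAS,
        cudnnDirectionMode_t::CUDNN_UNIDIRECTIONAL,
        cudnnRNNInputMode_t::CUDNN_LINEAR_INPUT,
        cudnnDataType_t::CUDNN_DATA_FLOAT,
        cudnnDataType_t::CUDNN_DATA_FLOAT, // mathPrec
        cudnnMathType_t::CUDNN_DEFAULT_MATH,
        512, // inputSize
        512, // hiddenSize
        512, // projSize (== hiddenSize: projection disabled)
        2,   // numLayers
        dropout,
        0, // auxFlags
    );
    rnn
}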
extern "C" {
pub fn cudnnGetRNNDescriptor_v8(
rnnDesc: cudnnRNNDescriptor_t,
algo: *mut cudnnRNNAlgo_t,
cellMode: *mut cudnnRNNMode_t,
biasMode: *mut cudnnRNNBiasMode_t,
dirMode: *mut cudnnDirectionMode_t,
inputMode: *mut cudnnRNNInputMode_t,
dataType: *mut cudnnDataType_t,
mathPrec: *mut cudnnDataType_t,
mathType: *mut cudnnMathType_t,
inputSize: *mut i32,
hiddenSize: *mut i32,
projSize: *mut i32,
numLayers: *mut i32,
dropoutDesc: *mut cudnnDropoutDescriptor_t,
auxFlags: *mut u32,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNDescriptor_v6(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
hiddenSize: ::std::os::raw::c_int,
numLayers: ::std::os::raw::c_int,
dropoutDesc: cudnnDropoutDescriptor_t,
inputMode: cudnnRNNInputMode_t,
direction: cudnnDirectionMode_t,
cellMode: cudnnRNNMode_t,
algo: cudnnRNNAlgo_t,
mathPrec: cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNDescriptor_v6(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
hiddenSize: *mut ::std::os::raw::c_int,
numLayers: *mut ::std::os::raw::c_int,
dropoutDesc: *mut cudnnDropoutDescriptor_t,
inputMode: *mut cudnnRNNInputMode_t,
direction: *mut cudnnDirectionMode_t,
cellMode: *mut cudnnRNNMode_t,
algo: *mut cudnnRNNAlgo_t,
mathPrec: *mut cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNMatrixMathType(
rnnDesc: cudnnRNNDescriptor_t,
mType: cudnnMathType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNMatrixMathType(
rnnDesc: cudnnRNNDescriptor_t,
mType: *mut cudnnMathType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNBiasMode(
rnnDesc: cudnnRNNDescriptor_t,
biasMode: cudnnRNNBiasMode_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNBiasMode(
rnnDesc: cudnnRNNDescriptor_t,
biasMode: *mut cudnnRNNBiasMode_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNSetClip_v8(
rnnDesc: cudnnRNNDescriptor_t,
clipMode: cudnnRNNClipMode_t,
clipNanOpt: cudnnNanPropagation_t,
lclip: f64,
rclip: f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNGetClip_v8(
rnnDesc: cudnnRNNDescriptor_t,
clipMode: *mut cudnnRNNClipMode_t,
clipNanOpt: *mut cudnnNanPropagation_t,
lclip: *mut f64,
rclip: *mut f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNSetClip(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
clipMode: cudnnRNNClipMode_t,
clipNanOpt: cudnnNanPropagation_t,
lclip: f64,
rclip: f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNGetClip(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
clipMode: *mut cudnnRNNClipMode_t,
clipNanOpt: *mut cudnnNanPropagation_t,
lclip: *mut f64,
rclip: *mut f64,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNProjectionLayers(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
recProjSize: ::std::os::raw::c_int,
outProjSize: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNProjectionLayers(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
recProjSize: *mut ::std::os::raw::c_int,
outProjSize: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreatePersistentRNNPlan(
rnnDesc: cudnnRNNDescriptor_t,
minibatch: ::std::os::raw::c_int,
dataType: cudnnDataType_t,
plan: *mut cudnnPersistentRNNPlan_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyPersistentRNNPlan(plan: cudnnPersistentRNNPlan_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetPersistentRNNPlan(
rnnDesc: cudnnRNNDescriptor_t,
plan: cudnnPersistentRNNPlan_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBuildRNNDynamic(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
miniBatch: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNWorkspaceSize(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNTrainingReserveSize(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNTempSpaceSizes(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
fMode: cudnnForwardMode_t,
xDesc: cudnnRNNDataDescriptor_t,
workSpaceSize: *mut usize,
reserveSpaceSize: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNParamsSize(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
xDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
dataType: cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNWeightSpaceSize(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
weightSpaceSize: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNLinLayerMatrixParams(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
pseudoLayer: ::std::os::raw::c_int,
xDesc: cudnnTensorDescriptor_t,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
linLayerID: ::std::os::raw::c_int,
linLayerMatDesc: cudnnFilterDescriptor_t,
linLayerMat: *mut *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNLinLayerBiasParams(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
pseudoLayer: ::std::os::raw::c_int,
xDesc: cudnnTensorDescriptor_t,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
linLayerID: ::std::os::raw::c_int,
linLayerBiasDesc: cudnnFilterDescriptor_t,
linLayerBias: *mut *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNWeightParams(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
pseudoLayer: i32,
weightSpaceSize: usize,
weightSpace: *const ::core::ffi::c_void,
linLayerID: i32,
mDesc: cudnnTensorDescriptor_t,
mAddr: *mut *mut ::core::ffi::c_void,
bDesc: cudnnTensorDescriptor_t,
bAddr: *mut *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNForwardInference(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
yDesc: *const cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
hyDesc: cudnnTensorDescriptor_t,
hy: *mut ::core::ffi::c_void,
cyDesc: cudnnTensorDescriptor_t,
cy: *mut ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNPaddingMode(
rnnDesc: cudnnRNNDescriptor_t,
paddingMode: ::std::os::raw::c_uint,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNPaddingMode(
rnnDesc: cudnnRNNDescriptor_t,
paddingMode: *mut ::std::os::raw::c_uint,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateRNNDataDescriptor(
rnnDataDesc: *mut cudnnRNNDataDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyRNNDataDescriptor(rnnDataDesc: cudnnRNNDataDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetRNNDataDescriptor(
rnnDataDesc: cudnnRNNDataDescriptor_t,
dataType: cudnnDataType_t,
layout: cudnnRNNDataLayout_t,
maxSeqLength: ::std::os::raw::c_int,
batchSize: ::std::os::raw::c_int,
vectorSize: ::std::os::raw::c_int,
seqLengthArray: *const ::std::os::raw::c_int,
paddingFill: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
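// A minimal sketch of describing a padded, batch-major float sequence batch;
// `seq_lengths` holds one entry per sequence in the batch. CUDNN_DATA_FLOAT
// is assumed to be the variant generated earlier in this file.
#[allow(dead_code)]
unsafe fn rnn_data_descriptor_sketch(
    max_seq_len: ::std::os::raw::c_int,
    vector_size: ::std::os::raw::c_int,
    seq_lengths: &[::std::os::raw::c_int],
) -> cudnnRNNDataDescriptor_t {
    let mut desc: cudnnRNNDataDescriptor_t = ::core::ptr::null_mut();
    cudnnCreateRNNDataDescriptor(&mut desc);
    let mut padding_fill = 0.0f32; // value written into the padding slots
    cudnnSetRNNDataDescriptor(
        desc,
        cudnnDataType_t::CUDNN_DATA_FLOAT,
        cudnnRNNDataLayout_t::CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED,
        max_seq_len,
        seq_lengths.len() as ::std::os::raw::c_int,
        vector_size,
        seq_lengths.as_ptr(),
        &mut padding_fill as *mut f32 as *mut ::core::ffi::c_void,
    );
    desc
}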
extern "C" {
pub fn cudnnGetRNNDataDescriptor(
rnnDataDesc: cudnnRNNDataDescriptor_t,
dataType: *mut cudnnDataType_t,
layout: *mut cudnnRNNDataLayout_t,
maxSeqLength: *mut ::std::os::raw::c_int,
batchSize: *mut ::std::os::raw::c_int,
vectorSize: *mut ::std::os::raw::c_int,
arrayLengthRequested: ::std::os::raw::c_int,
seqLengthArray: *mut ::std::os::raw::c_int,
paddingFill: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNForwardInferenceEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
xDesc: cudnnRNNDataDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
yDesc: cudnnRNNDataDescriptor_t,
y: *mut ::core::ffi::c_void,
hyDesc: cudnnTensorDescriptor_t,
hy: *mut ::core::ffi::c_void,
cyDesc: cudnnTensorDescriptor_t,
cy: *mut ::core::ffi::c_void,
kDesc: cudnnRNNDataDescriptor_t,
keys: *const ::core::ffi::c_void,
cDesc: cudnnRNNDataDescriptor_t,
cAttn: *mut ::core::ffi::c_void,
iDesc: cudnnRNNDataDescriptor_t,
iAttn: *mut ::core::ffi::c_void,
qDesc: cudnnRNNDataDescriptor_t,
queries: *mut ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNForward(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
fwdMode: cudnnForwardMode_t,
devSeqLengths: *const i32,
xDesc: cudnnRNNDataDescriptor_t,
x: *const ::core::ffi::c_void,
yDesc: cudnnRNNDataDescriptor_t,
y: *mut ::core::ffi::c_void,
hDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
hy: *mut ::core::ffi::c_void,
cDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
cy: *mut ::core::ffi::c_void,
weightSpaceSize: usize,
weightSpace: *const ::core::ffi::c_void,
workSpaceSize: usize,
workSpace: *mut ::core::ffi::c_void,
reserveSpaceSize: usize,
reserveSpace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
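// A minimal sketch of the v8 inference path: query the temporary-space sizes
// for this descriptor/data-layout combination, then run cudnnRNNForward.
// Null hx/cx request zero initial states, and in inference mode the
// reserve-space size typically comes back as zero, so none is passed; both
// points follow the cuDNN documentation rather than anything in this file.
// Device buffers are assumed to be at least as large as the queried sizes.
#[allow(dead_code)]
unsafe fn rnn_forward_inference_sketch(
    handle: cudnnHandle_t,
    rnn: cudnnRNNDescriptor_t,
    x_desc: cudnnRNNDataDescriptor_t,
    y_desc: cudnnRNNDataDescriptor_t,
    h_desc: cudnnTensorDescriptor_t,
    c_desc: cudnnTensorDescriptor_t,
    dev_seq_lengths: *const i32,
    x: *const ::core::ffi::c_void,
    y: *mut ::core::ffi::c_void,
    weight_space_size: usize,
    weight_space: *const ::core::ffi::c_void,
    work_space: *mut ::core::ffi::c_void,
    work_space_size: usize,
) -> cudnnStatus_t {
    let (mut ws_needed, mut rs_needed) = (0usize, 0usize);
    cudnnGetRNNTempSpaceSizes(
        handle,
        rnn,
        cudnnForwardMode_t::CUDNN_FWD_MODE_INFERENCE,
        x_desc,
        &mut ws_needed,
        &mut rs_needed,
    );
    debug_assert!(work_space_size >= ws_needed);
    cudnnRNNForward(
        handle,
        rnn,
        cudnnForwardMode_t::CUDNN_FWD_MODE_INFERENCE,
        dev_seq_lengths,
        x_desc,
        x,
        y_desc,
        y,
        h_desc,
        ::core::ptr::null(),     // hx: zero initial hidden state
        ::core::ptr::null_mut(), // hy: final hidden state not requested
        c_desc,
        ::core::ptr::null(),     // cx: zero initial cell state
        ::core::ptr::null_mut(), // cy: final cell state not requested
        weight_space_size,
        weight_space,
        work_space_size,
        work_space,
        0,
        ::core::ptr::null_mut(), // no reserve space in inference mode
    )
}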
extern "C" {
pub fn cudnnSetRNNAlgorithmDescriptor(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
algoDesc: cudnnAlgorithmDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNForwardInferenceAlgorithmMaxCount(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindRNNForwardInferenceAlgorithmEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
yDesc: *const cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
hyDesc: cudnnTensorDescriptor_t,
hy: *mut ::core::ffi::c_void,
cyDesc: cudnnTensorDescriptor_t,
cy: *mut ::core::ffi::c_void,
findIntensity: f32,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnAlgorithmPerformance_t,
workspace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnSeqDataAxis_t {
CUDNN_SEQDATA_TIME_DIM = 0,
CUDNN_SEQDATA_BATCH_DIM = 1,
CUDNN_SEQDATA_BEAM_DIM = 2,
CUDNN_SEQDATA_VECT_DIM = 3,
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnSeqDataStruct {
_unused: [u8; 0],
}
pub type cudnnSeqDataDescriptor_t = *mut cudnnSeqDataStruct;
extern "C" {
pub fn cudnnCreateSeqDataDescriptor(
seqDataDesc: *mut cudnnSeqDataDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroySeqDataDescriptor(seqDataDesc: cudnnSeqDataDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetSeqDataDescriptor(
seqDataDesc: cudnnSeqDataDescriptor_t,
dataType: cudnnDataType_t,
nbDims: ::std::os::raw::c_int,
dimA: *const ::std::os::raw::c_int,
axes: *const cudnnSeqDataAxis_t,
seqLengthArraySize: usize,
seqLengthArray: *const ::std::os::raw::c_int,
paddingFill: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetSeqDataDescriptor(
seqDataDesc: cudnnSeqDataDescriptor_t,
dataType: *mut cudnnDataType_t,
nbDims: *mut ::std::os::raw::c_int,
nbDimsRequested: ::std::os::raw::c_int,
dimA: *mut ::std::os::raw::c_int,
axes: *mut cudnnSeqDataAxis_t,
seqLengthArraySize: *mut usize,
seqLengthSizeRequested: usize,
seqLengthArray: *mut ::std::os::raw::c_int,
paddingFill: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
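// A minimal sketch of a 4-D sequence-data descriptor as used by the
// multi-head-attention API: `dim` is indexed by the cudnnSeqDataAxis_t
// values, while `axes` gives the memory order from outermost to innermost.
// `seq_lengths` is assumed to hold one entry per (batch, beam) pair, and
// CUDNN_DATA_FLOAT to be the variant generated earlier in this file.
#[allow(dead_code)]
unsafe fn seq_data_descriptor_sketch(
    batch: ::std::os::raw::c_int,
    beam: ::std::os::raw::c_int,
    time: ::std::os::raw::c_int,
    vect: ::std::os::raw::c_int,
    seq_lengths: &[::std::os::raw::c_int],
) -> cudnnSeqDataDescriptor_t {
    let mut desc: cudnnSeqDataDescriptor_t = ::core::ptr::null_mut();
    cudnnCreateSeqDataDescriptor(&mut desc);
    let mut dim: [::std::os::raw::c_int; 4] = [0; 4];
    dim[cudnnSeqDataAxis_t::CUDNN_SEQDATA_TIME_DIM as usize] = time;
    dim[cudnnSeqDataAxis_t::CUDNN_SEQDATA_BATCH_DIM as usize] = batch;
    dim[cudnnSeqDataAxis_t::CUDNN_SEQDATA_BEAM_DIM as usize] = beam;
    dim[cudnnSeqDataAxis_t::CUDNN_SEQDATA_VECT_DIM as usize] = vect;
    let axes = [
        cudnnSeqDataAxis_t::CUDNN_SEQDATA_BATCH_DIM,
        cudnnSeqDataAxis_t::CUDNN_SEQDATA_BEAM_DIM,
        cudnnSeqDataAxis_t::CUDNN_SEQDATA_TIME_DIM,
        cudnnSeqDataAxis_t::CUDNN_SEQDATA_VECT_DIM,
    ];
    cudnnSetSeqDataDescriptor(
        desc,
        cudnnDataType_t::CUDNN_DATA_FLOAT,
        4,
        dim.as_ptr(),
        axes.as_ptr(),
        seq_lengths.len(),
        seq_lengths.as_ptr(),
        ::core::ptr::null_mut(), // default padding fill
    );
    desc
}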
pub type cudnnAttnQueryMap_t = ::std::os::raw::c_uint;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnAttnStruct {
_unused: [u8; 0],
}
pub type cudnnAttnDescriptor_t = *mut cudnnAttnStruct;
extern "C" {
pub fn cudnnCreateAttnDescriptor(attnDesc: *mut cudnnAttnDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyAttnDescriptor(attnDesc: cudnnAttnDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetAttnDescriptor(
attnDesc: cudnnAttnDescriptor_t,
attnMode: ::std::os::raw::c_uint,
nHeads: ::std::os::raw::c_int,
smScaler: f64,
dataType: cudnnDataType_t,
computePrec: cudnnDataType_t,
mathType: cudnnMathType_t,
attnDropoutDesc: cudnnDropoutDescriptor_t,
postDropoutDesc: cudnnDropoutDescriptor_t,
qSize: ::std::os::raw::c_int,
kSize: ::std::os::raw::c_int,
vSize: ::std::os::raw::c_int,
qProjSize: ::std::os::raw::c_int,
kProjSize: ::std::os::raw::c_int,
vProjSize: ::std::os::raw::c_int,
oProjSize: ::std::os::raw::c_int,
qoMaxSeqLength: ::std::os::raw::c_int,
kvMaxSeqLength: ::std::os::raw::c_int,
maxBatchSize: ::std::os::raw::c_int,
maxBeamSize: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetAttnDescriptor(
attnDesc: cudnnAttnDescriptor_t,
attnMode: *mut ::std::os::raw::c_uint,
nHeads: *mut ::std::os::raw::c_int,
smScaler: *mut f64,
dataType: *mut cudnnDataType_t,
computePrec: *mut cudnnDataType_t,
mathType: *mut cudnnMathType_t,
attnDropoutDesc: *mut cudnnDropoutDescriptor_t,
postDropoutDesc: *mut cudnnDropoutDescriptor_t,
qSize: *mut ::std::os::raw::c_int,
kSize: *mut ::std::os::raw::c_int,
vSize: *mut ::std::os::raw::c_int,
qProjSize: *mut ::std::os::raw::c_int,
kProjSize: *mut ::std::os::raw::c_int,
vProjSize: *mut ::std::os::raw::c_int,
oProjSize: *mut ::std::os::raw::c_int,
qoMaxSeqLength: *mut ::std::os::raw::c_int,
kvMaxSeqLength: *mut ::std::os::raw::c_int,
maxBatchSize: *mut ::std::os::raw::c_int,
maxBeamSize: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetMultiHeadAttnBuffers(
handle: cudnnHandle_t,
attnDesc: cudnnAttnDescriptor_t,
weightSizeInBytes: *mut usize,
workSpaceSizeInBytes: *mut usize,
reserveSpaceSizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
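// A minimal sketch of sizing the three multi-head-attention buffers for an
// already-configured attention descriptor. Weight and workspace buffers are
// always required; the reserve space is only needed for training.
#[allow(dead_code)]
unsafe fn attn_buffer_sizes_sketch(
    handle: cudnnHandle_t,
    attn_desc: cudnnAttnDescriptor_t,
) -> (usize, usize, usize) {
    let (mut weights, mut workspace, mut reserve) = (0usize, 0usize, 0usize);
    cudnnGetMultiHeadAttnBuffers(handle, attn_desc, &mut weights, &mut workspace, &mut reserve);
    (weights, workspace, reserve)
}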
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnMultiHeadAttnWeightKind_t {
CUDNN_MH_ATTN_Q_WEIGHTS = 0,
CUDNN_MH_ATTN_K_WEIGHTS = 1,
CUDNN_MH_ATTN_V_WEIGHTS = 2,
CUDNN_MH_ATTN_O_WEIGHTS = 3,
CUDNN_MH_ATTN_Q_BIASES = 4,
CUDNN_MH_ATTN_K_BIASES = 5,
CUDNN_MH_ATTN_V_BIASES = 6,
CUDNN_MH_ATTN_O_BIASES = 7,
}
extern "C" {
pub fn cudnnGetMultiHeadAttnWeights(
handle: cudnnHandle_t,
attnDesc: cudnnAttnDescriptor_t,
wKind: cudnnMultiHeadAttnWeightKind_t,
weightSizeInBytes: usize,
weights: *const ::core::ffi::c_void,
wDesc: cudnnTensorDescriptor_t,
wAddr: *mut *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnMultiHeadAttnForward(
handle: cudnnHandle_t,
attnDesc: cudnnAttnDescriptor_t,
currIdx: ::std::os::raw::c_int,
loWinIdx: *const ::std::os::raw::c_int,
hiWinIdx: *const ::std::os::raw::c_int,
devSeqLengthsQO: *const ::std::os::raw::c_int,
devSeqLengthsKV: *const ::std::os::raw::c_int,
qDesc: cudnnSeqDataDescriptor_t,
queries: *const ::core::ffi::c_void,
residuals: *const ::core::ffi::c_void,
kDesc: cudnnSeqDataDescriptor_t,
keys: *const ::core::ffi::c_void,
vDesc: cudnnSeqDataDescriptor_t,
values: *const ::core::ffi::c_void,
oDesc: cudnnSeqDataDescriptor_t,
out: *mut ::core::ffi::c_void,
weightSizeInBytes: usize,
weights: *const ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
workSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnAdvInferVersionCheck() -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnWgradMode_t {
CUDNN_WGRAD_MODE_ADD = 0,
CUDNN_WGRAD_MODE_SET = 1,
}
extern "C" {
pub fn cudnnRNNForwardTraining(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
yDesc: *const cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
hyDesc: cudnnTensorDescriptor_t,
hy: *mut ::core::ffi::c_void,
cyDesc: cudnnTensorDescriptor_t,
cy: *mut ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNBackwardData(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
yDesc: *const cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: *const cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
dhyDesc: cudnnTensorDescriptor_t,
dhy: *const ::core::ffi::c_void,
dcyDesc: cudnnTensorDescriptor_t,
dcy: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
dxDesc: *const cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
dhxDesc: cudnnTensorDescriptor_t,
dhx: *mut ::core::ffi::c_void,
dcxDesc: cudnnTensorDescriptor_t,
dcx: *mut ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNBackwardData_v8(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
devSeqLengths: *const i32,
yDesc: cudnnRNNDataDescriptor_t,
y: *const ::core::ffi::c_void,
dy: *const ::core::ffi::c_void,
xDesc: cudnnRNNDataDescriptor_t,
dx: *mut ::core::ffi::c_void,
hDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
dhy: *const ::core::ffi::c_void,
dhx: *mut ::core::ffi::c_void,
cDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
dcy: *const ::core::ffi::c_void,
dcx: *mut ::core::ffi::c_void,
weightSpaceSize: usize,
weightSpace: *const ::core::ffi::c_void,
workSpaceSize: usize,
workSpace: *mut ::core::ffi::c_void,
reserveSpaceSize: usize,
reserveSpace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNBackwardWeights(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
yDesc: *const cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
workSpace: *const ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
dwDesc: cudnnFilterDescriptor_t,
dw: *mut ::core::ffi::c_void,
reserveSpace: *const ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNBackwardWeights_v8(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
addGrad: cudnnWgradMode_t,
devSeqLengths: *const i32,
xDesc: cudnnRNNDataDescriptor_t,
x: *const ::core::ffi::c_void,
hDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
yDesc: cudnnRNNDataDescriptor_t,
y: *const ::core::ffi::c_void,
weightSpaceSize: usize,
dweightSpace: *mut ::core::ffi::c_void,
workSpaceSize: usize,
workSpace: *mut ::core::ffi::c_void,
reserveSpaceSize: usize,
reserveSpace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNForwardTrainingEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
xDesc: cudnnRNNDataDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
yDesc: cudnnRNNDataDescriptor_t,
y: *mut ::core::ffi::c_void,
hyDesc: cudnnTensorDescriptor_t,
hy: *mut ::core::ffi::c_void,
cyDesc: cudnnTensorDescriptor_t,
cy: *mut ::core::ffi::c_void,
kDesc: cudnnRNNDataDescriptor_t,
keys: *const ::core::ffi::c_void,
cDesc: cudnnRNNDataDescriptor_t,
cAttn: *mut ::core::ffi::c_void,
iDesc: cudnnRNNDataDescriptor_t,
iAttn: *mut ::core::ffi::c_void,
qDesc: cudnnRNNDataDescriptor_t,
queries: *mut ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNBackwardDataEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
yDesc: cudnnRNNDataDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: cudnnRNNDataDescriptor_t,
dy: *const ::core::ffi::c_void,
dcDesc: cudnnRNNDataDescriptor_t,
dcAttn: *const ::core::ffi::c_void,
dhyDesc: cudnnTensorDescriptor_t,
dhy: *const ::core::ffi::c_void,
dcyDesc: cudnnTensorDescriptor_t,
dcy: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
dxDesc: cudnnRNNDataDescriptor_t,
dx: *mut ::core::ffi::c_void,
dhxDesc: cudnnTensorDescriptor_t,
dhx: *mut ::core::ffi::c_void,
dcxDesc: cudnnTensorDescriptor_t,
dcx: *mut ::core::ffi::c_void,
dkDesc: cudnnRNNDataDescriptor_t,
dkeys: *mut ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnRNNBackwardWeightsEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
xDesc: cudnnRNNDataDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
yDesc: cudnnRNNDataDescriptor_t,
y: *const ::core::ffi::c_void,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
dwDesc: cudnnFilterDescriptor_t,
dw: *mut ::core::ffi::c_void,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNForwardTrainingAlgorithmMaxCount(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindRNNForwardTrainingAlgorithmEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
yDesc: *const cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
hyDesc: cudnnTensorDescriptor_t,
hy: *mut ::core::ffi::c_void,
cyDesc: cudnnTensorDescriptor_t,
cy: *mut ::core::ffi::c_void,
findIntensity: f32,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnAlgorithmPerformance_t,
workspace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNBackwardDataAlgorithmMaxCount(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindRNNBackwardDataAlgorithmEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
yDesc: *const cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
dyDesc: *const cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
dhyDesc: cudnnTensorDescriptor_t,
dhy: *const ::core::ffi::c_void,
dcyDesc: cudnnTensorDescriptor_t,
dcy: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
cxDesc: cudnnTensorDescriptor_t,
cx: *const ::core::ffi::c_void,
dxDesc: *const cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
dhxDesc: cudnnTensorDescriptor_t,
dhx: *mut ::core::ffi::c_void,
dcxDesc: cudnnTensorDescriptor_t,
dcx: *mut ::core::ffi::c_void,
findIntensity: f32,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnAlgorithmPerformance_t,
workspace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetRNNBackwardWeightsAlgorithmMaxCount(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindRNNBackwardWeightsAlgorithmEx(
handle: cudnnHandle_t,
rnnDesc: cudnnRNNDescriptor_t,
seqLength: ::std::os::raw::c_int,
xDesc: *const cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
hxDesc: cudnnTensorDescriptor_t,
hx: *const ::core::ffi::c_void,
yDesc: *const cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
findIntensity: f32,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnAlgorithmPerformance_t,
workspace: *const ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
dwDesc: cudnnFilterDescriptor_t,
dw: *mut ::core::ffi::c_void,
reserveSpace: *const ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnMultiHeadAttnBackwardData(
handle: cudnnHandle_t,
attnDesc: cudnnAttnDescriptor_t,
loWinIdx: *const ::std::os::raw::c_int,
hiWinIdx: *const ::std::os::raw::c_int,
devSeqLengthsDQDO: *const ::std::os::raw::c_int,
devSeqLengthsDKDV: *const ::std::os::raw::c_int,
doDesc: cudnnSeqDataDescriptor_t,
dout: *const ::core::ffi::c_void,
dqDesc: cudnnSeqDataDescriptor_t,
dqueries: *mut ::core::ffi::c_void,
queries: *const ::core::ffi::c_void,
dkDesc: cudnnSeqDataDescriptor_t,
dkeys: *mut ::core::ffi::c_void,
keys: *const ::core::ffi::c_void,
dvDesc: cudnnSeqDataDescriptor_t,
dvalues: *mut ::core::ffi::c_void,
values: *const ::core::ffi::c_void,
weightSizeInBytes: usize,
weights: *const ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
workSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnMultiHeadAttnBackwardWeights(
handle: cudnnHandle_t,
attnDesc: cudnnAttnDescriptor_t,
addGrad: cudnnWgradMode_t,
qDesc: cudnnSeqDataDescriptor_t,
queries: *const ::core::ffi::c_void,
kDesc: cudnnSeqDataDescriptor_t,
keys: *const ::core::ffi::c_void,
vDesc: cudnnSeqDataDescriptor_t,
values: *const ::core::ffi::c_void,
doDesc: cudnnSeqDataDescriptor_t,
dout: *const ::core::ffi::c_void,
weightSizeInBytes: usize,
weights: *const ::core::ffi::c_void,
dweights: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
workSpace: *mut ::core::ffi::c_void,
reserveSpaceSizeInBytes: usize,
reserveSpace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnLossNormalizationMode_t {
CUDNN_LOSS_NORMALIZATION_NONE = 0,
CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
}
extern "C" {
pub fn cudnnCreateCTCLossDescriptor(
ctcLossDesc: *mut cudnnCTCLossDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetCTCLossDescriptor(
ctcLossDesc: cudnnCTCLossDescriptor_t,
compType: cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetCTCLossDescriptorEx(
ctcLossDesc: cudnnCTCLossDescriptor_t,
compType: cudnnDataType_t,
normMode: cudnnLossNormalizationMode_t,
gradMode: cudnnNanPropagation_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetCTCLossDescriptor_v8(
ctcLossDesc: cudnnCTCLossDescriptor_t,
compType: cudnnDataType_t,
normMode: cudnnLossNormalizationMode_t,
gradMode: cudnnNanPropagation_t,
maxLabelLength: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetCTCLossDescriptor(
ctcLossDesc: cudnnCTCLossDescriptor_t,
compType: *mut cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetCTCLossDescriptorEx(
ctcLossDesc: cudnnCTCLossDescriptor_t,
compType: *mut cudnnDataType_t,
normMode: *mut cudnnLossNormalizationMode_t,
gradMode: *mut cudnnNanPropagation_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetCTCLossDescriptor_v8(
ctcLossDesc: cudnnCTCLossDescriptor_t,
compType: *mut cudnnDataType_t,
normMode: *mut cudnnLossNormalizationMode_t,
gradMode: *mut cudnnNanPropagation_t,
maxLabelLength: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyCTCLossDescriptor(ctcLossDesc: cudnnCTCLossDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCTCLoss(
handle: cudnnHandle_t,
probsDesc: cudnnTensorDescriptor_t,
probs: *const ::core::ffi::c_void,
hostLabels: *const ::std::os::raw::c_int,
hostLabelLengths: *const ::std::os::raw::c_int,
hostInputLengths: *const ::std::os::raw::c_int,
costs: *mut ::core::ffi::c_void,
gradientsDesc: cudnnTensorDescriptor_t,
gradients: *mut ::core::ffi::c_void,
algo: cudnnCTCLossAlgo_t,
ctcLossDesc: cudnnCTCLossDescriptor_t,
workspace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCTCLoss_v8(
handle: cudnnHandle_t,
algo: cudnnCTCLossAlgo_t,
ctcLossDesc: cudnnCTCLossDescriptor_t,
probsDesc: cudnnTensorDescriptor_t,
probs: *const ::core::ffi::c_void,
labels: *const ::std::os::raw::c_int,
labelLengths: *const ::std::os::raw::c_int,
inputLengths: *const ::std::os::raw::c_int,
costs: *mut ::core::ffi::c_void,
gradientsDesc: cudnnTensorDescriptor_t,
gradients: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
workspace: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetCTCLossWorkspaceSize(
handle: cudnnHandle_t,
probsDesc: cudnnTensorDescriptor_t,
gradientsDesc: cudnnTensorDescriptor_t,
labels: *const ::std::os::raw::c_int,
labelLengths: *const ::std::os::raw::c_int,
inputLengths: *const ::std::os::raw::c_int,
algo: cudnnCTCLossAlgo_t,
ctcLossDesc: cudnnCTCLossDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetCTCLossWorkspaceSize_v8(
handle: cudnnHandle_t,
algo: cudnnCTCLossAlgo_t,
ctcLossDesc: cudnnCTCLossDescriptor_t,
probsDesc: cudnnTensorDescriptor_t,
gradientsDesc: cudnnTensorDescriptor_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
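// A minimal sketch of the v8 CTC-loss flow: configure the descriptor, size
// the workspace, then compute costs and gradients. Buffers are assumed to be
// laid out per the cuDNN CTC conventions, and CUDNN_DATA_FLOAT /
// CUDNN_NOT_PROPAGATE_NAN are assumed to be the variants generated earlier
// in this file.
#[allow(dead_code)]
unsafe fn ctc_loss_v8_sketch(
    handle: cudnnHandle_t,
    probs_desc: cudnnTensorDescriptor_t,
    grads_desc: cudnnTensorDescriptor_t,
    probs: *const ::core::ffi::c_void,
    labels: *const ::std::os::raw::c_int,
    label_lengths: *const ::std::os::raw::c_int,
    input_lengths: *const ::std::os::raw::c_int,
    costs: *mut ::core::ffi::c_void,
    gradients: *mut ::core::ffi::c_void,
    workspace: *mut ::core::ffi::c_void,
    workspace_size: usize,
    max_label_length: ::std::os::raw::c_int,
) -> cudnnStatus_t {
    let mut desc: cudnnCTCLossDescriptor_t = ::core::ptr::null_mut();
    cudnnCreateCTCLossDescriptor(&mut desc);
    cudnnSetCTCLossDescriptor_v8(
        desc,
        cudnnDataType_t::CUDNN_DATA_FLOAT,
        cudnnLossNormalizationMode_t::CUDNN_LOSS_NORMALIZATION_SOFTMAX,
        cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
        max_label_length,
    );
    let mut needed = 0usize;
    cudnnGetCTCLossWorkspaceSize_v8(
        handle,
        cudnnCTCLossAlgo_t::CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
        desc,
        probs_desc,
        grads_desc,
        &mut needed,
    );
    debug_assert!(workspace_size >= needed);
    let status = cudnnCTCLoss_v8(
        handle,
        cudnnCTCLossAlgo_t::CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
        desc,
        probs_desc,
        probs,
        labels,
        label_lengths,
        input_lengths,
        costs,
        grads_desc,
        gradients,
        workspace_size,
        workspace,
    );
    cudnnDestroyCTCLossDescriptor(desc);
    status
}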
extern "C" {
pub fn cudnnAdvTrainVersionCheck() -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnConvolutionStruct {
_unused: [u8; 0],
}
pub type cudnnConvolutionDescriptor_t = *mut cudnnConvolutionStruct;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnConvolutionMode_t {
CUDNN_CONVOLUTION = 0,
CUDNN_CROSS_CORRELATION = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnReorderType_t {
CUDNN_DEFAULT_REORDER = 0,
CUDNN_NO_REORDER = 1,
}
#[repr(C)]
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)]
pub struct cudnnConvolutionFwdAlgoPerfStruct {
pub algo: cudnnConvolutionFwdAlgo_t,
pub status: cudnnStatus_t,
pub time: f32,
pub memory: usize,
pub determinism: cudnnDeterminism_t,
pub mathType: cudnnMathType_t,
pub reserved: [::std::os::raw::c_int; 3usize],
}
#[test]
fn bindgen_test_layout_cudnnConvolutionFwdAlgoPerfStruct() {
    const UNINIT: ::core::mem::MaybeUninit<cudnnConvolutionFwdAlgoPerfStruct> =
        ::core::mem::MaybeUninit::uninit();
    let ptr = UNINIT.as_ptr();
    assert_eq!(
        ::core::mem::size_of::<cudnnConvolutionFwdAlgoPerfStruct>(),
        48usize,
        concat!("Size of: ", stringify!(cudnnConvolutionFwdAlgoPerfStruct))
    );
    assert_eq!(
        ::core::mem::align_of::<cudnnConvolutionFwdAlgoPerfStruct>(),
        8usize,
        concat!(
            "Alignment of ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).algo) as usize - ptr as usize },
        0usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(algo)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).status) as usize - ptr as usize },
        4usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(status)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).time) as usize - ptr as usize },
        8usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(time)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).memory) as usize - ptr as usize },
        16usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(memory)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).determinism) as usize - ptr as usize },
        24usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(determinism)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).mathType) as usize - ptr as usize },
        28usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(mathType)
        )
    );
    assert_eq!(
        unsafe { ::core::ptr::addr_of!((*ptr).reserved) as usize - ptr as usize },
        32usize,
        concat!(
            "Offset of field: ",
            stringify!(cudnnConvolutionFwdAlgoPerfStruct),
            "::",
            stringify!(reserved)
        )
    );
}
impl Default for cudnnConvolutionFwdAlgoPerfStruct {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type cudnnConvolutionFwdAlgoPerf_t = cudnnConvolutionFwdAlgoPerfStruct;
extern "C" {
pub fn cudnnCreateConvolutionDescriptor(
convDesc: *mut cudnnConvolutionDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyConvolutionDescriptor(
convDesc: cudnnConvolutionDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetConvolutionMathType(
convDesc: cudnnConvolutionDescriptor_t,
mathType: cudnnMathType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionMathType(
convDesc: cudnnConvolutionDescriptor_t,
mathType: *mut cudnnMathType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetConvolutionGroupCount(
convDesc: cudnnConvolutionDescriptor_t,
groupCount: ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionGroupCount(
convDesc: cudnnConvolutionDescriptor_t,
groupCount: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetConvolutionReorderType(
convDesc: cudnnConvolutionDescriptor_t,
reorderType: cudnnReorderType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionReorderType(
convDesc: cudnnConvolutionDescriptor_t,
reorderType: *mut cudnnReorderType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetConvolution2dDescriptor(
convDesc: cudnnConvolutionDescriptor_t,
pad_h: ::std::os::raw::c_int,
pad_w: ::std::os::raw::c_int,
u: ::std::os::raw::c_int,
v: ::std::os::raw::c_int,
dilation_h: ::std::os::raw::c_int,
dilation_w: ::std::os::raw::c_int,
mode: cudnnConvolutionMode_t,
computeType: cudnnDataType_t,
) -> cudnnStatus_t;
}
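// Illustrative only, not bindgen output: a minimal sketch of creating and
// configuring a 2-D convolution descriptor with the two calls above (pad 1,
// stride 1, dilation 1, cross-correlation, f32 accumulation). Assumes the
// cudnnStatus_t and cudnnDataType_t enums generated earlier in this file;
// status checks are reduced to assertions for brevity.
#[allow(dead_code)]
unsafe fn example_conv2d_descriptor() -> cudnnConvolutionDescriptor_t {
    let mut desc: cudnnConvolutionDescriptor_t = ::core::ptr::null_mut();
    assert_eq!(
        cudnnCreateConvolutionDescriptor(&mut desc),
        cudnnStatus_t::CUDNN_STATUS_SUCCESS
    );
    assert_eq!(
        cudnnSetConvolution2dDescriptor(
            desc,
            1, // pad_h
            1, // pad_w
            1, // u: vertical filter stride
            1, // v: horizontal filter stride
            1, // dilation_h
            1, // dilation_w
            cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
            cudnnDataType_t::CUDNN_DATA_FLOAT,
        ),
        cudnnStatus_t::CUDNN_STATUS_SUCCESS
    );
    desc
}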
extern "C" {
pub fn cudnnGetConvolution2dDescriptor(
convDesc: cudnnConvolutionDescriptor_t,
pad_h: *mut ::std::os::raw::c_int,
pad_w: *mut ::std::os::raw::c_int,
u: *mut ::std::os::raw::c_int,
v: *mut ::std::os::raw::c_int,
dilation_h: *mut ::std::os::raw::c_int,
dilation_w: *mut ::std::os::raw::c_int,
mode: *mut cudnnConvolutionMode_t,
computeType: *mut cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetConvolutionNdDescriptor(
convDesc: cudnnConvolutionDescriptor_t,
arrayLength: ::std::os::raw::c_int,
padA: *const ::std::os::raw::c_int,
filterStrideA: *const ::std::os::raw::c_int,
dilationA: *const ::std::os::raw::c_int,
mode: cudnnConvolutionMode_t,
computeType: cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionNdDescriptor(
convDesc: cudnnConvolutionDescriptor_t,
arrayLengthRequested: ::std::os::raw::c_int,
arrayLength: *mut ::std::os::raw::c_int,
padA: *mut ::std::os::raw::c_int,
strideA: *mut ::std::os::raw::c_int,
dilationA: *mut ::std::os::raw::c_int,
mode: *mut cudnnConvolutionMode_t,
computeType: *mut cudnnDataType_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolution2dForwardOutputDim(
convDesc: cudnnConvolutionDescriptor_t,
inputTensorDesc: cudnnTensorDescriptor_t,
filterDesc: cudnnFilterDescriptor_t,
n: *mut ::std::os::raw::c_int,
c: *mut ::std::os::raw::c_int,
h: *mut ::std::os::raw::c_int,
w: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionNdForwardOutputDim(
convDesc: cudnnConvolutionDescriptor_t,
inputTensorDesc: cudnnTensorDescriptor_t,
filterDesc: cudnnFilterDescriptor_t,
nbDims: ::std::os::raw::c_int,
tensorOuputDimA: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionForwardAlgorithmMaxCount(
handle: cudnnHandle_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionForwardAlgorithm_v7(
handle: cudnnHandle_t,
srcDesc: cudnnTensorDescriptor_t,
filterDesc: cudnnFilterDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
destDesc: cudnnTensorDescriptor_t,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionFwdAlgoPerf_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindConvolutionForwardAlgorithm(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
wDesc: cudnnFilterDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionFwdAlgoPerf_t,
) -> cudnnStatus_t;
}
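// Illustrative only: a sketch of exhaustively benchmarking forward algorithms
// with the search entry point above. cuDNN returns perf results sorted fastest
// first, so results[0] holds the best algorithm; real code should also check
// the returned status and that `returned > 0`. All descriptors are assumed to
// be created and valid.
#[allow(dead_code)]
unsafe fn example_fastest_fwd_algo(
    handle: cudnnHandle_t,
    x_desc: cudnnTensorDescriptor_t,
    w_desc: cudnnFilterDescriptor_t,
    conv_desc: cudnnConvolutionDescriptor_t,
    y_desc: cudnnTensorDescriptor_t,
) -> cudnnConvolutionFwdAlgo_t {
    // Ask how many algorithms can be returned, then size the buffer to match.
    let mut max_count = 0;
    cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &mut max_count);
    let mut results = vec![cudnnConvolutionFwdAlgoPerf_t::default(); max_count as usize];
    let mut returned = 0;
    cudnnFindConvolutionForwardAlgorithm(
        handle,
        x_desc,
        w_desc,
        conv_desc,
        y_desc,
        max_count,
        &mut returned,
        results.as_mut_ptr(),
    );
    results[0].algo
}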
extern "C" {
pub fn cudnnFindConvolutionForwardAlgorithmEx(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionFwdAlgoPerf_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnIm2Col(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
colBuffer: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnReorderFilterAndBias(
handle: cudnnHandle_t,
filterDesc: cudnnFilterDescriptor_t,
reorderType: cudnnReorderType_t,
filterData: *const ::core::ffi::c_void,
reorderedFilterData: *mut ::core::ffi::c_void,
reorderBias: ::std::os::raw::c_int,
biasData: *const ::core::ffi::c_void,
reorderedBiasData: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionForwardWorkspaceSize(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
wDesc: cudnnFilterDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
algo: cudnnConvolutionFwdAlgo_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnConvolutionForward(
handle: cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
algo: cudnnConvolutionFwdAlgo_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
beta: *const ::core::ffi::c_void,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
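// Illustrative only: the usual pairing of the two calls above. The workspace
// requirement is queried first; after the caller has allocated at least that
// many device bytes (allocation elided here), the convolution runs with
// host-side f32 scaling factors alpha=1, beta=0, i.e. y = conv(x, w).
#[allow(dead_code)]
unsafe fn example_convolution_forward(
    handle: cudnnHandle_t,
    x_desc: cudnnTensorDescriptor_t,
    x: *const ::core::ffi::c_void,
    w_desc: cudnnFilterDescriptor_t,
    w: *const ::core::ffi::c_void,
    conv_desc: cudnnConvolutionDescriptor_t,
    algo: cudnnConvolutionFwdAlgo_t,
    workspace: *mut ::core::ffi::c_void,
    workspace_size: usize,
    y_desc: cudnnTensorDescriptor_t,
    y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t {
    // 1. How much scratch space does this algorithm need?
    let mut required: usize = 0;
    cudnnGetConvolutionForwardWorkspaceSize(
        handle, x_desc, w_desc, conv_desc, y_desc, algo, &mut required,
    );
    assert!(workspace_size >= required);
    // 2. Run the convolution. alpha/beta are host pointers whose pointee type
    //    follows the descriptor's compute type (f32 assumed here).
    let (alpha, beta) = (1.0f32, 0.0f32);
    cudnnConvolutionForward(
        handle,
        &alpha as *const f32 as *const _,
        x_desc,
        x,
        w_desc,
        w,
        conv_desc,
        algo,
        workspace,
        workspace_size,
        &beta as *const f32 as *const _,
        y_desc,
        y,
    )
}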
extern "C" {
pub fn cudnnConvolutionBiasActivationForward(
handle: cudnnHandle_t,
alpha1: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
algo: cudnnConvolutionFwdAlgo_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
alpha2: *const ::core::ffi::c_void,
zDesc: cudnnTensorDescriptor_t,
z: *const ::core::ffi::c_void,
biasDesc: cudnnTensorDescriptor_t,
bias: *const ::core::ffi::c_void,
activationDesc: cudnnActivationDescriptor_t,
yDesc: cudnnTensorDescriptor_t,
y: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)]
pub struct cudnnConvolutionBwdDataAlgoPerfStruct {
pub algo: cudnnConvolutionBwdDataAlgo_t,
pub status: cudnnStatus_t,
pub time: f32,
pub memory: usize,
pub determinism: cudnnDeterminism_t,
pub mathType: cudnnMathType_t,
pub reserved: [::std::os::raw::c_int; 3usize],
}
#[test]
fn bindgen_test_layout_cudnnConvolutionBwdDataAlgoPerfStruct() {
assert_eq!(
::core::mem::size_of::<cudnnConvolutionBwdDataAlgoPerfStruct>(),
48usize,
concat!(
"Size of: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct)
)
);
assert_eq!(
::core::mem::align_of::<cudnnConvolutionBwdDataAlgoPerfStruct>(),
8usize,
concat!(
"Alignment of ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).algo as *const _
as usize
},
0usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(algo)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).status as *const _
as usize
},
4usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(status)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).time as *const _
as usize
},
8usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(time)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).memory as *const _
as usize
},
16usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(memory)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).determinism
as *const _ as usize
},
24usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(determinism)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).mathType as *const _
as usize
},
28usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(mathType)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdDataAlgoPerfStruct>())).reserved as *const _
as usize
},
32usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdDataAlgoPerfStruct),
"::",
stringify!(reserved)
)
);
}
impl Default for cudnnConvolutionBwdDataAlgoPerfStruct {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type cudnnConvolutionBwdDataAlgoPerf_t = cudnnConvolutionBwdDataAlgoPerfStruct;
extern "C" {
pub fn cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
handle: cudnnHandle_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindConvolutionBackwardDataAlgorithm(
handle: cudnnHandle_t,
wDesc: cudnnFilterDescriptor_t,
dyDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
dxDesc: cudnnTensorDescriptor_t,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionBwdDataAlgoPerf_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindConvolutionBackwardDataAlgorithmEx(
handle: cudnnHandle_t,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionBwdDataAlgoPerf_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionBackwardDataAlgorithm_v7(
handle: cudnnHandle_t,
filterDesc: cudnnFilterDescriptor_t,
diffDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
gradDesc: cudnnTensorDescriptor_t,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionBwdDataAlgoPerf_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionBackwardDataWorkspaceSize(
handle: cudnnHandle_t,
wDesc: cudnnFilterDescriptor_t,
dyDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
dxDesc: cudnnTensorDescriptor_t,
algo: cudnnConvolutionBwdDataAlgo_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnConvolutionBackwardData(
handle: cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
wDesc: cudnnFilterDescriptor_t,
w: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
algo: cudnnConvolutionBwdDataAlgo_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
beta: *const ::core::ffi::c_void,
dxDesc: cudnnTensorDescriptor_t,
dx: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetFoldedConvBackwardDataDescriptors(
handle: cudnnHandle_t,
filterDesc: cudnnFilterDescriptor_t,
diffDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
gradDesc: cudnnTensorDescriptor_t,
transformFormat: cudnnTensorFormat_t,
foldedFilterDesc: cudnnFilterDescriptor_t,
paddedDiffDesc: cudnnTensorDescriptor_t,
foldedConvDesc: cudnnConvolutionDescriptor_t,
foldedGradDesc: cudnnTensorDescriptor_t,
filterFoldTransDesc: cudnnTensorTransformDescriptor_t,
diffPadTransDesc: cudnnTensorTransformDescriptor_t,
gradFoldTransDesc: cudnnTensorTransformDescriptor_t,
gradUnfoldTransDesc: cudnnTensorTransformDescriptor_t,
) -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnFusedOpsConstParamStruct {
_unused: [u8; 0],
}
pub type cudnnFusedOpsConstParamPack_t = *mut cudnnFusedOpsConstParamStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnFusedOpsVariantParamStruct {
_unused: [u8; 0],
}
pub type cudnnFusedOpsVariantParamPack_t = *mut cudnnFusedOpsVariantParamStruct;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct cudnnFusedOpsPlanStruct {
_unused: [u8; 0],
}
pub type cudnnFusedOpsPlan_t = *mut cudnnFusedOpsPlanStruct;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnFusedOps_t {
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnFusedOpsConstParamLabel_t {
CUDNN_PARAM_XDESC = 0,
CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
CUDNN_PARAM_BN_MODE = 2,
CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
CUDNN_PARAM_ACTIVATION_DESC = 6,
CUDNN_PARAM_CONV_DESC = 7,
CUDNN_PARAM_WDESC = 8,
CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
CUDNN_PARAM_DWDESC = 10,
CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
CUDNN_PARAM_YDESC = 12,
CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
CUDNN_PARAM_DYDESC = 14,
CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
CUDNN_PARAM_YSTATS_DESC = 16,
CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
CUDNN_PARAM_ZDESC = 26,
CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
CUDNN_PARAM_DXDESC = 33,
CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
CUDNN_PARAM_DZDESC = 35,
CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnFusedOpsPointerPlaceHolder_t {
CUDNN_PTR_NULL = 0,
CUDNN_PTR_ELEM_ALIGNED = 1,
CUDNN_PTR_16B_ALIGNED = 2,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnFusedOpsVariantParamLabel_t {
CUDNN_PTR_XDATA = 0,
CUDNN_PTR_BN_EQSCALE = 1,
CUDNN_PTR_BN_EQBIAS = 2,
CUDNN_PTR_WDATA = 3,
CUDNN_PTR_DWDATA = 4,
CUDNN_PTR_YDATA = 5,
CUDNN_PTR_DYDATA = 6,
CUDNN_PTR_YSUM = 7,
CUDNN_PTR_YSQSUM = 8,
CUDNN_PTR_WORKSPACE = 9,
CUDNN_PTR_BN_SCALE = 10,
CUDNN_PTR_BN_BIAS = 11,
CUDNN_PTR_BN_SAVED_MEAN = 12,
CUDNN_PTR_BN_SAVED_INVSTD = 13,
CUDNN_PTR_BN_RUNNING_MEAN = 14,
CUDNN_PTR_BN_RUNNING_VAR = 15,
CUDNN_PTR_ZDATA = 16,
CUDNN_PTR_BN_Z_EQSCALE = 17,
CUDNN_PTR_BN_Z_EQBIAS = 18,
CUDNN_PTR_ACTIVATION_BITMASK = 19,
CUDNN_PTR_DXDATA = 20,
CUDNN_PTR_DZDATA = 21,
CUDNN_PTR_BN_DSCALE = 22,
CUDNN_PTR_BN_DBIAS = 23,
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
}
extern "C" {
pub fn cudnnCnnInferVersionCheck() -> cudnnStatus_t;
}
#[repr(C)]
#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)]
pub struct cudnnConvolutionBwdFilterAlgoPerfStruct {
pub algo: cudnnConvolutionBwdFilterAlgo_t,
pub status: cudnnStatus_t,
pub time: f32,
pub memory: usize,
pub determinism: cudnnDeterminism_t,
pub mathType: cudnnMathType_t,
pub reserved: [::std::os::raw::c_int; 3usize],
}
#[test]
fn bindgen_test_layout_cudnnConvolutionBwdFilterAlgoPerfStruct() {
assert_eq!(
::core::mem::size_of::<cudnnConvolutionBwdFilterAlgoPerfStruct>(),
48usize,
concat!(
"Size of: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct)
)
);
assert_eq!(
::core::mem::align_of::<cudnnConvolutionBwdFilterAlgoPerfStruct>(),
8usize,
concat!(
"Alignment of ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).algo as *const _
as usize
},
0usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(algo)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).status as *const _
as usize
},
4usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(status)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).time as *const _
as usize
},
8usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(time)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).memory as *const _
as usize
},
16usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(memory)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).determinism
as *const _ as usize
},
24usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(determinism)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).mathType
as *const _ as usize
},
28usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(mathType)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnConvolutionBwdFilterAlgoPerfStruct>())).reserved
as *const _ as usize
},
32usize,
concat!(
"Offset of field: ",
stringify!(cudnnConvolutionBwdFilterAlgoPerfStruct),
"::",
stringify!(reserved)
)
);
}
impl Default for cudnnConvolutionBwdFilterAlgoPerfStruct {
fn default() -> Self {
let mut s = ::core::mem::MaybeUninit::<Self>::uninit();
unsafe {
::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
s.assume_init()
}
}
}
pub type cudnnConvolutionBwdFilterAlgoPerf_t = cudnnConvolutionBwdFilterAlgoPerfStruct;
extern "C" {
pub fn cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
handle: cudnnHandle_t,
count: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindConvolutionBackwardFilterAlgorithm(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
dyDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
dwDesc: cudnnFilterDescriptor_t,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionBwdFilterAlgoPerf_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFindConvolutionBackwardFilterAlgorithmEx(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
y: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
dwDesc: cudnnFilterDescriptor_t,
dw: *mut ::core::ffi::c_void,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionBwdFilterAlgoPerf_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionBackwardFilterAlgorithm_v7(
handle: cudnnHandle_t,
srcDesc: cudnnTensorDescriptor_t,
diffDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
gradDesc: cudnnFilterDescriptor_t,
requestedAlgoCount: ::std::os::raw::c_int,
returnedAlgoCount: *mut ::std::os::raw::c_int,
perfResults: *mut cudnnConvolutionBwdFilterAlgoPerf_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle: cudnnHandle_t,
xDesc: cudnnTensorDescriptor_t,
dyDesc: cudnnTensorDescriptor_t,
convDesc: cudnnConvolutionDescriptor_t,
gradDesc: cudnnFilterDescriptor_t,
algo: cudnnConvolutionBwdFilterAlgo_t,
sizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnConvolutionBackwardFilter(
handle: cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
xDesc: cudnnTensorDescriptor_t,
x: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
convDesc: cudnnConvolutionDescriptor_t,
algo: cudnnConvolutionBwdFilterAlgo_t,
workSpace: *mut ::core::ffi::c_void,
workSpaceSizeInBytes: usize,
beta: *const ::core::ffi::c_void,
dwDesc: cudnnFilterDescriptor_t,
dw: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnConvolutionBackwardBias(
handle: cudnnHandle_t,
alpha: *const ::core::ffi::c_void,
dyDesc: cudnnTensorDescriptor_t,
dy: *const ::core::ffi::c_void,
beta: *const ::core::ffi::c_void,
dbDesc: cudnnTensorDescriptor_t,
db: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateFusedOpsConstParamPack(
constPack: *mut cudnnFusedOpsConstParamPack_t,
ops: cudnnFusedOps_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyFusedOpsConstParamPack(
constPack: cudnnFusedOpsConstParamPack_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetFusedOpsConstParamPackAttribute(
constPack: cudnnFusedOpsConstParamPack_t,
paramLabel: cudnnFusedOpsConstParamLabel_t,
param: *const ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetFusedOpsConstParamPackAttribute(
constPack: cudnnFusedOpsConstParamPack_t,
paramLabel: cudnnFusedOpsConstParamLabel_t,
param: *mut ::core::ffi::c_void,
isNULL: *mut ::std::os::raw::c_int,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateFusedOpsVariantParamPack(
varPack: *mut cudnnFusedOpsVariantParamPack_t,
ops: cudnnFusedOps_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyFusedOpsVariantParamPack(
varPack: cudnnFusedOpsVariantParamPack_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnSetFusedOpsVariantParamPackAttribute(
varPack: cudnnFusedOpsVariantParamPack_t,
paramLabel: cudnnFusedOpsVariantParamLabel_t,
ptr: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnGetFusedOpsVariantParamPackAttribute(
varPack: cudnnFusedOpsVariantParamPack_t,
paramLabel: cudnnFusedOpsVariantParamLabel_t,
ptr: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnCreateFusedOpsPlan(
plan: *mut cudnnFusedOpsPlan_t,
ops: cudnnFusedOps_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnDestroyFusedOpsPlan(plan: cudnnFusedOpsPlan_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnMakeFusedOpsPlan(
handle: cudnnHandle_t,
plan: cudnnFusedOpsPlan_t,
constPack: cudnnFusedOpsConstParamPack_t,
workspaceSizeInBytes: *mut usize,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnFusedOpsExecute(
handle: cudnnHandle_t,
plan: cudnnFusedOpsPlan_t,
varPack: cudnnFusedOpsVariantParamPack_t,
) -> cudnnStatus_t;
}
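// Illustrative only: the fused-ops entry points above follow a fixed
// create / configure / plan / execute / destroy lifecycle. This sketch elides
// the many CUDNN_PARAM_* descriptor attributes and CUDNN_PTR_* device
// pointers a real plan needs, and ignores returned statuses for brevity.
#[allow(dead_code)]
unsafe fn example_fused_ops_lifecycle(handle: cudnnHandle_t) {
    let ops = cudnnFusedOps_t::CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS;
    let mut const_pack: cudnnFusedOpsConstParamPack_t = ::core::ptr::null_mut();
    let mut var_pack: cudnnFusedOpsVariantParamPack_t = ::core::ptr::null_mut();
    let mut plan: cudnnFusedOpsPlan_t = ::core::ptr::null_mut();
    cudnnCreateFusedOpsConstParamPack(&mut const_pack, ops);
    cudnnCreateFusedOpsVariantParamPack(&mut var_pack, ops);
    cudnnCreateFusedOpsPlan(&mut plan, ops);
    // ...set CUDNN_PARAM_* attributes on const_pack here...
    // Planning validates the const pack and reports the workspace requirement.
    let mut workspace_size: usize = 0;
    cudnnMakeFusedOpsPlan(handle, plan, const_pack, &mut workspace_size);
    // ...set CUDNN_PTR_* data pointers on var_pack here...
    cudnnFusedOpsExecute(handle, plan, var_pack);
    cudnnDestroyFusedOpsPlan(plan);
    cudnnDestroyFusedOpsVariantParamPack(var_pack);
    cudnnDestroyFusedOpsConstParamPack(const_pack);
}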
extern "C" {
pub fn cudnnCnnTrainVersionCheck() -> cudnnStatus_t;
}
pub type cudnnBackendDescriptor_t = *mut ::core::ffi::c_void;
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub struct cudnnFractionStruct {
pub numerator: i64,
pub denominator: i64,
}
#[test]
fn bindgen_test_layout_cudnnFractionStruct() {
assert_eq!(
::core::mem::size_of::<cudnnFractionStruct>(),
16usize,
concat!("Size of: ", stringify!(cudnnFractionStruct))
);
assert_eq!(
::core::mem::align_of::<cudnnFractionStruct>(),
8usize,
concat!("Alignment of ", stringify!(cudnnFractionStruct))
);
assert_eq!(
unsafe { &(*(::core::ptr::null::<cudnnFractionStruct>())).numerator as *const _ as usize },
0usize,
concat!(
"Offset of field: ",
stringify!(cudnnFractionStruct),
"::",
stringify!(numerator)
)
);
assert_eq!(
unsafe {
&(*(::core::ptr::null::<cudnnFractionStruct>())).denominator as *const _ as usize
},
8usize,
concat!(
"Offset of field: ",
stringify!(cudnnFractionStruct),
"::",
stringify!(denominator)
)
);
}
pub type cudnnFraction_t = cudnnFractionStruct;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnPointwiseMode_t {
CUDNN_POINTWISE_ADD = 0,
CUDNN_POINTWISE_ADD_SQUARE = 5,
CUDNN_POINTWISE_DIV = 6,
CUDNN_POINTWISE_MAX = 3,
CUDNN_POINTWISE_MIN = 2,
CUDNN_POINTWISE_MOD = 7,
CUDNN_POINTWISE_MUL = 1,
CUDNN_POINTWISE_POW = 8,
CUDNN_POINTWISE_SUB = 9,
CUDNN_POINTWISE_ABS = 10,
CUDNN_POINTWISE_CEIL = 11,
CUDNN_POINTWISE_COS = 12,
CUDNN_POINTWISE_EXP = 13,
CUDNN_POINTWISE_FLOOR = 14,
CUDNN_POINTWISE_LOG = 15,
CUDNN_POINTWISE_NEG = 16,
CUDNN_POINTWISE_RSQRT = 17,
CUDNN_POINTWISE_SIN = 18,
CUDNN_POINTWISE_SQRT = 4,
CUDNN_POINTWISE_TAN = 19,
CUDNN_POINTWISE_ERF = 20,
CUDNN_POINTWISE_IDENTITY = 21,
CUDNN_POINTWISE_RELU_FWD = 100,
CUDNN_POINTWISE_TANH_FWD = 101,
CUDNN_POINTWISE_SIGMOID_FWD = 102,
CUDNN_POINTWISE_ELU_FWD = 103,
CUDNN_POINTWISE_GELU_FWD = 104,
CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
CUDNN_POINTWISE_SWISH_FWD = 106,
CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
CUDNN_POINTWISE_RELU_BWD = 200,
CUDNN_POINTWISE_TANH_BWD = 201,
CUDNN_POINTWISE_SIGMOID_BWD = 202,
CUDNN_POINTWISE_ELU_BWD = 203,
CUDNN_POINTWISE_GELU_BWD = 204,
CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
CUDNN_POINTWISE_SWISH_BWD = 206,
CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
CUDNN_POINTWISE_CMP_EQ = 300,
CUDNN_POINTWISE_CMP_NEQ = 301,
CUDNN_POINTWISE_CMP_GT = 302,
CUDNN_POINTWISE_CMP_GE = 303,
CUDNN_POINTWISE_CMP_LT = 304,
CUDNN_POINTWISE_CMP_LE = 305,
CUDNN_POINTWISE_LOGICAL_AND = 400,
CUDNN_POINTWISE_LOGICAL_OR = 401,
CUDNN_POINTWISE_LOGICAL_NOT = 402,
CUDNN_POINTWISE_GEN_INDEX = 501,
CUDNN_POINTWISE_BINARY_SELECT = 601,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnResampleMode_t {
CUDNN_RESAMPLE_NEAREST = 0,
CUDNN_RESAMPLE_BILINEAR = 1,
CUDNN_RESAMPLE_AVGPOOL = 2,
CUDNN_RESAMPLE_MAXPOOL = 3,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnSignalMode_t {
CUDNN_SIGNAL_SET = 0,
CUDNN_SIGNAL_WAIT = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnGenStatsMode_t {
CUDNN_GENSTATS_SUM_SQSUM = 0,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBnFinalizeStatsMode_t {
CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendAttributeName_t {
CUDNN_ATTR_POINTWISE_MODE = 0,
CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2,
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
CUDNN_ATTR_POINTWISE_AXIS = 9,
CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
CUDNN_ATTR_ENGINEHEUR_MODE = 200,
CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
CUDNN_ATTR_ENGINECFG_ENGINE = 300,
CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400,
CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800,
CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
CUDNN_ATTR_TENSOR_STRIDES = 903,
CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524,
CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
CUDNN_ATTR_RESAMPLE_MODE = 1700,
CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713,
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714,
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723,
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724,
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendAttributeType_t {
CUDNN_TYPE_HANDLE = 0,
CUDNN_TYPE_DATA_TYPE = 1,
CUDNN_TYPE_BOOLEAN = 2,
CUDNN_TYPE_INT64 = 3,
CUDNN_TYPE_FLOAT = 4,
CUDNN_TYPE_DOUBLE = 5,
CUDNN_TYPE_VOID_PTR = 6,
CUDNN_TYPE_CONVOLUTION_MODE = 7,
CUDNN_TYPE_HEUR_MODE = 8,
CUDNN_TYPE_KNOB_TYPE = 9,
CUDNN_TYPE_NAN_PROPOGATION = 10,
CUDNN_TYPE_NUMERICAL_NOTE = 11,
CUDNN_TYPE_LAYOUT_TYPE = 12,
CUDNN_TYPE_ATTRIB_NAME = 13,
CUDNN_TYPE_POINTWISE_MODE = 14,
CUDNN_TYPE_BACKEND_DESCRIPTOR = 15,
CUDNN_TYPE_GENSTATS_MODE = 16,
CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17,
CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18,
CUDNN_TYPE_BEHAVIOR_NOTE = 19,
CUDNN_TYPE_TENSOR_REORDERING_MODE = 20,
CUDNN_TYPE_RESAMPLE_MODE = 21,
CUDNN_TYPE_PADDING_MODE = 22,
CUDNN_TYPE_INT32 = 23,
CUDNN_TYPE_CHAR = 24,
CUDNN_TYPE_SIGNAL_MODE = 25,
CUDNN_TYPE_FRACTION = 26,
CUDNN_TYPE_NORM_MODE = 27,
CUDNN_TYPE_NORM_FWD_PHASE = 28,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendDescriptorType_t {
CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1,
CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2,
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3,
CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4,
CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5,
CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6,
CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7,
CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8,
CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9,
CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11,
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12,
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13,
CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14,
CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15,
CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16,
CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17,
CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18,
CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19,
CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20,
CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21,
CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22,
CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23,
CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24,
CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25,
CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26,
CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27,
CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28,
CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29,
CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendNumericalNote_t {
CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1,
CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2,
CUDNN_NUMERICAL_NOTE_FFT = 3,
CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4,
CUDNN_NUMERICAL_NOTE_WINOGRAD = 5,
CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6,
CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7,
CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8,
CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 9,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendBehaviorNote_t {
CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 3,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendKnobType_t {
CUDNN_KNOB_TYPE_SPLIT_K = 0,
CUDNN_KNOB_TYPE_SWIZZLE = 1,
CUDNN_KNOB_TYPE_TILE_SIZE = 2,
CUDNN_KNOB_TYPE_USE_TEX = 3,
CUDNN_KNOB_TYPE_EDGE = 4,
CUDNN_KNOB_TYPE_KBLOCK = 5,
CUDNN_KNOB_TYPE_LDGA = 6,
CUDNN_KNOB_TYPE_LDGB = 7,
CUDNN_KNOB_TYPE_CHUNK_K = 8,
CUDNN_KNOB_TYPE_SPLIT_H = 9,
CUDNN_KNOB_TYPE_WINO_TILE = 10,
CUDNN_KNOB_TYPE_MULTIPLY = 11,
CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
CUDNN_KNOB_TYPE_TILEK = 13,
CUDNN_KNOB_TYPE_STAGES = 14,
CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16,
CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
CUDNN_KNOB_TYPE_IDX_MODE = 18,
CUDNN_KNOB_TYPE_SLICED = 19,
CUDNN_KNOB_TYPE_SPLIT_RS = 20,
CUDNN_KNOB_TYPE_SINGLEBUFFER = 21,
CUDNN_KNOB_TYPE_LDGC = 22,
CUDNN_KNOB_TYPE_SPECFILT = 23,
CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
CUDNN_KNOB_TYPE_WORKSPACE = 25,
CUDNN_KNOB_TYPE_COUNTS = 26,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendLayoutType_t {
CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
CUDNN_LAYOUT_TYPE_COUNT = 4,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendHeurMode_t {
CUDNN_HEUR_MODE_INSTANT = 0,
CUDNN_HEUR_MODE_B = 1,
CUDNN_HEUR_MODE_FALLBACK = 2,
CUDNN_HEUR_MODE_A = 3,
CUDNN_HEUR_MODES_COUNT = 4,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendTensorReordering_t {
CUDNN_TENSOR_REORDERING_NONE = 0,
CUDNN_TENSOR_REORDERING_INT8x32 = 1,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnPaddingMode_t {
CUDNN_ZERO_PAD = 0,
CUDNN_NEG_INF_PAD = 1,
CUDNN_EDGE_VAL_PAD = 2,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendNormMode_t {
CUDNN_LAYER_NORM = 0,
CUDNN_INSTANCE_NORM = 1,
CUDNN_BATCH_NORM = 2,
CUDNN_GROUP_NORM = 3,
}
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum cudnnBackendNormFwdPhase_t {
CUDNN_NORM_FWD_INFERENCE = 0,
CUDNN_NORM_FWD_TRAINING = 1,
}
extern "C" {
pub fn cudnnBackendCreateDescriptor(
descriptorType: cudnnBackendDescriptorType_t,
descriptor: *mut cudnnBackendDescriptor_t,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBackendDestroyDescriptor(descriptor: cudnnBackendDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBackendInitialize(descriptor: cudnnBackendDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBackendFinalize(descriptor: cudnnBackendDescriptor_t) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBackendSetAttribute(
descriptor: cudnnBackendDescriptor_t,
attributeName: cudnnBackendAttributeName_t,
attributeType: cudnnBackendAttributeType_t,
elementCount: i64,
arrayOfElements: *const ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBackendGetAttribute(
descriptor: cudnnBackendDescriptor_t,
attributeName: cudnnBackendAttributeName_t,
attributeType: cudnnBackendAttributeType_t,
requestedElementCount: i64,
elementCount: *mut i64,
arrayOfElements: *mut ::core::ffi::c_void,
) -> cudnnStatus_t;
}
extern "C" {
pub fn cudnnBackendExecute(
handle: cudnnHandle_t,
executionPlan: cudnnBackendDescriptor_t,
variantPack: cudnnBackendDescriptor_t,
) -> cudnnStatus_t;
}
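// Illustrative only: backend ("graph API") descriptors above follow a
// create / set-attributes / finalize lifecycle, and attributes are only
// validated at finalize time. A minimal sketch for a pointwise add
// descriptor, ignoring returned statuses for brevity:
#[allow(dead_code)]
unsafe fn example_backend_descriptor_lifecycle() {
    let mut desc: cudnnBackendDescriptor_t = ::core::ptr::null_mut();
    cudnnBackendCreateDescriptor(
        cudnnBackendDescriptorType_t::CUDNN_BACKEND_POINTWISE_DESCRIPTOR,
        &mut desc,
    );
    // Describe the operation: an elementwise add.
    let mode = cudnnPointwiseMode_t::CUDNN_POINTWISE_ADD;
    cudnnBackendSetAttribute(
        desc,
        cudnnBackendAttributeName_t::CUDNN_ATTR_POINTWISE_MODE,
        cudnnBackendAttributeType_t::CUDNN_TYPE_POINTWISE_MODE,
        1,
        &mode as *const _ as *const ::core::ffi::c_void,
    );
    // Finalize checks the attribute set; the descriptor is usable after this.
    cudnnBackendFinalize(desc);
    cudnnBackendDestroyDescriptor(desc);
}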
#include "cudnn.h"
#!/bin/bash
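# Regenerate the raw cuRAND FFI bindings: run bindgen over wrapper.h, keep
# only curand* items, and write the generated Rust to sys.rs.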
set -exu
bindgen \
--whitelist-type="^curand.*" \
--whitelist-var="^curand.*" \
--whitelist-function="^curand.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs
//! Wrappers around the [cuRAND API](https://docs.nvidia.com/cuda/curand/index.html)
//! at three levels. See the crate documentation for a description of each.
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;
//! A thin wrapper around [sys] providing [Result]s with [CurandError].
//!
//! Two flavors of generation:
//! 1. Not generic: See [generate] for non-generic generation functions.
//! 2. Generic: See [UniformFill], [NormalFill], and [LogNormalFill] for generic generation functions.
use super::sys;
use std::mem::MaybeUninit;
/// Wrapper around [sys::curandStatus_t].
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437)
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct CurandError(pub sys::curandStatus_t);
impl sys::curandStatus_t {
/// Transforms into a [Result] of [CurandError]
pub fn result(self) -> Result<(), CurandError> {
match self {
sys::curandStatus_t::CURAND_STATUS_SUCCESS => Ok(()),
_ => Err(CurandError(self)),
}
}
}
#[cfg(feature = "std")]
impl std::fmt::Display for CurandError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self:?}")
}
}
#[cfg(feature = "std")]
impl std::error::Error for CurandError {}
/// Create a new random number generator with the default pseudo RNG type.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g56ff2b3cf7e28849f73a1e22022bcbfd).
pub fn create_generator() -> Result<sys::curandGenerator_t, CurandError> {
create_generator_kind(sys::curandRngType_t::CURAND_RNG_PSEUDO_DEFAULT)
}
/// Create a new random number generator of the given `kind`.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g56ff2b3cf7e28849f73a1e22022bcbfd).
pub fn create_generator_kind(
kind: sys::curandRngType_t,
) -> Result<sys::curandGenerator_t, CurandError> {
let mut generator = MaybeUninit::uninit();
unsafe {
sys::curandCreateGenerator(generator.as_mut_ptr(), kind).result()?;
Ok(generator.assume_init())
}
}
/// Set the seed value of the pseudo-random number generator.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gbcd2982aa3d53571b8ad12d8188b139b)
///
/// # Safety
/// The generator must be allocated and not already freed.
pub unsafe fn set_seed(generator: sys::curandGenerator_t, seed: u64) -> Result<(), CurandError> {
sys::curandSetPseudoRandomGeneratorSeed(generator, seed).result()
}
/// Set the offset value of the pseudo-random number generator.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb21ba987f85486e552797206451b0939)
///
/// # Safety
/// The generator must be allocated and not already freed.
pub unsafe fn set_offset(
generator: sys::curandGenerator_t,
offset: u64,
) -> Result<(), CurandError> {
sys::curandSetGeneratorOffset(generator, offset).result()
}
/// Set the current stream for CURAND kernel launches.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gc78c8d07c7acea4242e2a62bc41ff1f5)
///
/// # Safety
/// 1. The generator must be allocated and not already freed.
/// 2. The stream must be allocated and not already freed.
pub unsafe fn set_stream(
generator: sys::curandGenerator_t,
stream: sys::cudaStream_t,
) -> Result<(), CurandError> {
sys::curandSetStream(generator, stream).result()
}
/// Destroy an existing generator.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g8d82c56e2b869fef4f9929a775ee18d0).
///
/// # Safety
/// The generator must not have already been freed.
pub unsafe fn destroy_generator(generator: sys::curandGenerator_t) -> Result<(), CurandError> {
sys::curandDestroyGenerator(generator).result()
}
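// Illustrative only, not part of the original module: the typical lifecycle
// of the raw calls above and below. `d_out` is assumed to be a valid device
// pointer with room for `num` f32 values (e.g. allocated via the CUDA driver
// API; allocation is out of scope for this module).
#[allow(dead_code)]
unsafe fn example_uniform_lifecycle(d_out: *mut f32, num: usize) -> Result<(), CurandError> {
    // Create a generator of the default pseudo RNG type and seed it.
    let gen = create_generator()?;
    set_seed(gen, 1234)?;
    // Fill the device buffer with uniform f32 values in (0.0, 1.0].
    generate::uniform_f32(gen, d_out, num)?;
    // Release the generator once done.
    destroy_generator(gen)
}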
pub mod generate {
//! Functions to generate different distributions.
use super::{sys, CurandError};
/// Fills `out` with `num` f32 values in the range (0.0, 1.0].
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g5df92a7293dc6b2e61ea481a2069ebc2)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn uniform_f32(
gen: sys::curandGenerator_t,
out: *mut f32,
num: usize,
) -> Result<(), CurandError> {
sys::curandGenerateUniform(gen, out, num).result()
}
/// Fills `out` with `num` f64 values in the range (0.0, 1.0].
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gbb08f0268f05c9d87eac2b4a2cf7fc24)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn uniform_f64(
gen: sys::curandGenerator_t,
out: *mut f64,
num: usize,
) -> Result<(), CurandError> {
sys::curandGenerateUniformDouble(gen, out, num).result()
}
/// Fills `out` with `num` u32 values with all bits random.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gf18b3cbdf0b7d9e2335bada92610adac)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn uniform_u32(
gen: sys::curandGenerator_t,
out: *mut u32,
num: usize,
) -> Result<(), CurandError> {
sys::curandGenerate(gen, out, num).result()
}
/// Fills `out` with `num` f32 values from a normal distribution
/// parameterized by `mean` and `std`.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb9280e447ef04e1dec4611720bd0eb69)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn normal_f32(
gen: sys::curandGenerator_t,
out: *mut f32,
num: usize,
mean: f32,
std: f32,
) -> Result<(), CurandError> {
sys::curandGenerateNormal(gen, out, num, mean, std).result()
}
/// Fills `out` with `num` f64 values from a normal distribution
/// parameterized by `mean` and `std`.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g046759ff9b6bf8dafc9eaae04917dc8e)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn normal_f64(
gen: sys::curandGenerator_t,
out: *mut f64,
num: usize,
mean: f64,
std: f64,
) -> Result<(), CurandError> {
sys::curandGenerateNormalDouble(gen, out, num, mean, std).result()
}
/// Fills `out` with `num` f32 values from a log normal distribution
/// parameterized by `mean` and `std`.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g3569cc960eb1a31357752fc813e21f49)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn log_normal_f32(
gen: sys::curandGenerator_t,
out: *mut f32,
num: usize,
mean: f32,
std: f32,
) -> Result<(), CurandError> {
sys::curandGenerateLogNormal(gen, out, num, mean, std).result()
}
/// Fills `out` with `num` f64 values from a log normal distribution
/// parameterized by `mean` and `std`.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g300c31530c8b461ca89f1e0232a6f05f)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn log_normal_f64(
gen: sys::curandGenerator_t,
out: *mut f64,
num: usize,
mean: f64,
std: f64,
) -> Result<(), CurandError> {
sys::curandGenerateLogNormalDouble(gen, out, num, mean, std).result()
}
/// Fills `out` with `num` u32 values from a poisson distribution
/// parameterized by `lambda`.
///
/// See [cuRAND docs](https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1g425c7c13db4444e6150d159bb1417f05)
///
/// # Safety
/// 1. generator must have been allocated and not freed.
/// 2. `out` must point to `num` values
pub unsafe fn poisson_u32(
gen: sys::curandGenerator_t,
out: *mut u32,
num: usize,
lambda: f64,
) -> Result<(), CurandError> {
sys::curandGeneratePoisson(gen, out, num, lambda).result()
}
}
/// Fill with uniformly distributed numbers of type `T`.
pub trait UniformFill<T> {
/// # Safety
/// This inherits the safety requirements of the methods in [generate].
unsafe fn fill(self, out: *mut T, num: usize) -> Result<(), CurandError>;
}
impl UniformFill<f32> for sys::curandGenerator_t {
unsafe fn fill(self, out: *mut f32, num: usize) -> Result<(), CurandError> {
generate::uniform_f32(self, out, num)
}
}
impl UniformFill<f64> for sys::curandGenerator_t {
unsafe fn fill(self, out: *mut f64, num: usize) -> Result<(), CurandError> {
generate::uniform_f64(self, out, num)
}
}
impl UniformFill<u32> for sys::curandGenerator_t {
unsafe fn fill(self, out: *mut u32, num: usize) -> Result<(), CurandError> {
generate::uniform_u32(self, out, num)
}
}
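// Illustrative only: a sketch of how the impls above let one generic helper
// dispatch over f32/f64/u32 output buffers. `d_out` is assumed to be a valid
// device pointer with room for `num` values.
#[allow(dead_code)]
unsafe fn example_fill_uniform<T>(
    gen: sys::curandGenerator_t,
    d_out: *mut T,
    num: usize,
) -> Result<(), CurandError>
where
    sys::curandGenerator_t: UniformFill<T>,
{
    UniformFill::fill(gen, d_out, num)
}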
/// Fill with normally distributed numbers of type `T`.
pub trait NormalFill<T> {
/// # Safety
/// This inherits the safety requirements of the methods in [generate].
unsafe fn fill(self, o: *mut T, n: usize, m: T, s: T) -> Result<(), CurandError>;
}
impl NormalFill<f32> for sys::curandGenerator_t {
unsafe fn fill(self, o: *mut f32, n: usize, m: f32, s: f32) -> Result<(), CurandError> {
generate::normal_f32(self, o, n, m, s)
}
}
impl NormalFill<f64> for sys::curandGenerator_t {
unsafe fn fill(self, o: *mut f64, n: usize, m: f64, s: f64) -> Result<(), CurandError> {
generate::normal_f64(self, o, n, m, s)
}
}
/// Fill with log normally distributed numbers of type `T`.
pub trait LogNormalFill<T> {
/// # Safety
/// This inherits the safety requirements of the methods in [generate].
unsafe fn fill(self, o: *mut T, n: usize, m: T, s: T) -> Result<(), CurandError>;
}
impl LogNormalFill<f32> for sys::curandGenerator_t {
unsafe fn fill(self, o: *mut f32, n: usize, m: f32, s: f32) -> Result<(), CurandError> {
generate::log_normal_f32(self, o, n, m, s)
}
}
impl LogNormalFill<f64> for sys::curandGenerator_t {
unsafe fn fill(self, o: *mut f64, n: usize, m: f64, s: f64) -> Result<(), CurandError> {
generate::log_normal_f64(self, o, n, m, s)
}
}
//! Safe abstractions around [crate::curand::result] with [CudaRng].
use super::{result, sys};
use crate::driver::{CudaDevice, CudaSlice, DeviceSlice};
use std::sync::Arc;
/// Host side RNG that can fill [CudaSlice] with random values.
///
/// 1. Create:
/// ```rust
/// # use cudarc::{driver::*, curand::*};
/// let device = CudaDevice::new(0).unwrap();
/// let rng = CudaRng::new(0, device).unwrap();
/// ```
/// 2. Fill device memory:
/// ```rust
/// # use cudarc::{driver::*, curand::*};
/// # let device = CudaDevice::new(0).unwrap();
/// # let rng = CudaRng::new(0, device.clone()).unwrap();
/// let mut a_dev = device.alloc_zeros::<f32>(10).unwrap();
/// rng.fill_with_uniform(&mut a_dev).unwrap();
/// ```
///
/// The three distributions are:
/// 1. Uniform - [CudaRng::fill_with_uniform()]
/// 2. Normal - [CudaRng::fill_with_normal()]
/// 3. LogNormal - [CudaRng::fill_with_log_normal()]
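///
/// Normal and log normal fills follow the same pattern, with host-side
/// `mean` and `std` parameters (a sketch mirroring the uniform example):
/// ```rust
/// # use cudarc::{driver::*, curand::*};
/// # let device = CudaDevice::new(0).unwrap();
/// # let rng = CudaRng::new(0, device.clone()).unwrap();
/// let mut a_dev = device.alloc_zeros::<f32>(10).unwrap();
/// rng.fill_with_normal(&mut a_dev, 0.0, 1.0).unwrap();
/// ```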
pub struct CudaRng {
pub(crate) gen: sys::curandGenerator_t,
pub(crate) device: Arc<CudaDevice>,
}
impl CudaRng {
/// Constructs the RNG with the given `seed`, using the stream from [CudaDevice] to submit kernels.
pub fn new(seed: u64, device: Arc<CudaDevice>) -> Result<Self, result::CurandError> {
device.bind_to_thread().unwrap();
let gen = result::create_generator()?;
let mut rng = Self { gen, device };
rng.set_seed(seed)?;
unsafe { result::set_stream(rng.gen, rng.device.stream as *mut _) }?;
Ok(rng)
}
/// Re-seed the RNG.
pub fn set_seed(&mut self, seed: u64) -> Result<(), result::CurandError> {
unsafe { result::set_seed(self.gen, seed) }
}
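    /// Set the absolute offset of the RNG sequence, so draws can be skipped
    /// ahead or replayed from a known point. See [result::set_offset].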
pub fn set_offset(&mut self, offset: u64) -> Result<(), result::CurandError> {
unsafe { result::set_offset(self.gen, offset) }
}
/// Fill the [CudaSlice] with data from a `Uniform` distribution
pub fn fill_with_uniform<T>(&self, t: &mut CudaSlice<T>) -> Result<(), result::CurandError>
where
sys::curandGenerator_t: result::UniformFill<T>,
{
unsafe { result::UniformFill::fill(self.gen, t.cu_device_ptr as *mut T, t.len()) }
}
/// Fill the [CudaSlice] with data from a `Normal(mean, std)` distribution.
pub fn fill_with_normal<T>(
&self,
t: &mut CudaSlice<T>,
mean: T,
std: T,
) -> Result<(), result::CurandError>
where
sys::curandGenerator_t: result::NormalFill<T>,
{
unsafe { result::NormalFill::fill(self.gen, t.cu_device_ptr as *mut T, t.len(), mean, std) }
}
/// Fill the [CudaSlice] with data from a `LogNormal(mean, std)` distribution.
pub fn fill_with_log_normal<T>(
&self,
t: &mut CudaSlice<T>,
mean: T,
std: T,
) -> Result<(), result::CurandError>
where
sys::curandGenerator_t: result::LogNormalFill<T>,
{
unsafe {
result::LogNormalFill::fill(self.gen, t.cu_device_ptr as *mut T, t.len(), mean, std)
}
}
}
impl Drop for CudaRng {
fn drop(&mut self) {
let gen = std::mem::replace(&mut self.gen, std::ptr::null_mut());
if !gen.is_null() {
unsafe { result::destroy_generator(gen) }.unwrap();
}
}
}
#[cfg(test)]
mod tests {
#![allow(clippy::needless_range_loop)]
use super::*;
use crate::{
curand::result::{LogNormalFill, NormalFill, UniformFill},
driver::*,
};
use std::vec::Vec;
fn gen_uniform<T: ValidAsZeroBits + Clone + Default + Unpin + DeviceRepr>(
seed: u64,
n: usize,
) -> Vec<T>
where
super::sys::curandGenerator_t: UniformFill<T>,
{
let dev = CudaDevice::new(0).unwrap();
let mut a_dev = dev.alloc_zeros::<T>(n).unwrap();
let rng = CudaRng::new(seed, dev.clone()).unwrap();
rng.fill_with_uniform(&mut a_dev).unwrap();
dev.sync_reclaim(a_dev).unwrap()
}
fn gen_normal<T: ValidAsZeroBits + Clone + Default + Unpin + DeviceRepr>(
seed: u64,
n: usize,
mean: T,
std: T,
) -> Vec<T>
where
super::sys::curandGenerator_t: NormalFill<T>,
{
let dev = CudaDevice::new(0).unwrap();
let mut a_dev = dev.alloc_zeros::<T>(n).unwrap();
let rng = CudaRng::new(seed, dev.clone()).unwrap();
rng.fill_with_normal(&mut a_dev, mean, std).unwrap();
dev.sync_reclaim(a_dev).unwrap()
}
fn gen_log_normal<T: ValidAsZeroBits + Clone + Default + Unpin + DeviceRepr>(
seed: u64,
n: usize,
mean: T,
std: T,
) -> Vec<T>
where
super::sys::curandGenerator_t: LogNormalFill<T>,
{
let dev = CudaDevice::new(0).unwrap();
let mut a_dev = dev.alloc_zeros::<T>(n).unwrap();
let rng = CudaRng::new(seed, dev.clone()).unwrap();
rng.fill_with_log_normal(&mut a_dev, mean, std).unwrap();
dev.sync_reclaim(a_dev).unwrap()
}
#[test]
fn test_rc_counts() {
let dev = CudaDevice::new(0).unwrap();
assert_eq!(Arc::strong_count(&dev), 1);
let a_rng = CudaRng::new(0, dev.clone()).unwrap();
assert_eq!(Arc::strong_count(&dev), 2);
let a_dev = dev.alloc_zeros::<f32>(10).unwrap();
assert_eq!(Arc::strong_count(&dev), 3);
drop(a_rng);
assert_eq!(Arc::strong_count(&dev), 2);
drop(a_dev);
assert_eq!(Arc::strong_count(&dev), 1);
}
#[test]
fn test_seed_reproducible() {
let dev = CudaDevice::new(0).unwrap();
let mut a_dev = dev.alloc_zeros::<f32>(10).unwrap();
let mut b_dev = a_dev.clone();
let a_rng = CudaRng::new(0, dev.clone()).unwrap();
let b_rng = CudaRng::new(0, dev.clone()).unwrap();
a_rng.fill_with_uniform(&mut a_dev).unwrap();
b_rng.fill_with_uniform(&mut b_dev).unwrap();
let a_host = dev.sync_reclaim(a_dev).unwrap();
let b_host = dev.sync_reclaim(b_dev).unwrap();
assert_eq!(a_host, b_host);
}
#[test]
fn test_different_seeds_neq() {
let dev = CudaDevice::new(0).unwrap();
let mut a_dev = dev.alloc_zeros::<f32>(10).unwrap();
let mut b_dev = a_dev.clone();
let a_rng = CudaRng::new(0, dev.clone()).unwrap();
let b_rng = CudaRng::new(1, dev.clone()).unwrap();
a_rng.fill_with_uniform(&mut a_dev).unwrap();
b_rng.fill_with_uniform(&mut b_dev).unwrap();
let a_host = dev.sync_reclaim(a_dev).unwrap();
let b_host = dev.sync_reclaim(b_dev).unwrap();
assert_ne!(a_host, b_host);
}
#[test]
fn test_set_offset() {
let dev = CudaDevice::new(0).unwrap();
let mut a_dev = dev.alloc_zeros::<f32>(10).unwrap();
let mut a_rng = CudaRng::new(0, dev.clone()).unwrap();
a_rng.set_seed(42).unwrap();
a_rng.set_offset(0).unwrap();
a_rng.fill_with_uniform(&mut a_dev).unwrap();
let a_host = dev.sync_reclaim(a_dev.clone()).unwrap();
a_rng.set_seed(42).unwrap();
a_rng.set_offset(0).unwrap();
a_rng.fill_with_uniform(&mut a_dev).unwrap();
let b_host = dev.sync_reclaim(a_dev).unwrap();
assert_eq!(a_host, b_host);
}
const N: usize = 1000;
#[test]
fn test_uniform_f32() {
let a = gen_uniform::<f32>(0, N);
for i in 0..N {
assert!(0.0 < a[i] && a[i] <= 1.0);
}
}
#[test]
fn test_uniform_f64() {
let a = gen_uniform::<f64>(0, N);
for i in 0..N {
assert!(0.0 < a[i] && a[i] <= 1.0);
}
}
#[test]
fn test_uniform_u32() {
let a = gen_uniform::<u32>(0, N);
for i in 0..N {
assert!(a[i] > 0);
}
}
#[test]
fn test_normal_f32() {
let a = gen_normal::<f32>(0, N, 0.0, 1.0);
for i in 0..N {
assert!(a[i] != 0.0);
}
let b = gen_normal::<f32>(0, N, -1.0, 2.0);
for i in 0..N {
assert_ne!(a[i], b[i]);
}
}
#[test]
fn test_normal_f64() {
let a = gen_normal::<f64>(0, N, 0.0, 1.0);
for i in 0..N {
assert!(a[i] != 0.0);
}
let b = gen_normal::<f64>(0, N, -1.0, 2.0);
for i in 0..N {
assert_ne!(a[i], b[i]);
}
}
#[test]
fn test_log_normal_f32() {
let a = gen_log_normal::<f32>(0, N, 0.0, 1.0);
for i in 0..N {
assert!(a[i] != 0.0);
}
let b = gen_log_normal::<f32>(0, N, -1.0, 2.0);
for i in 0..N {
assert_ne!(a[i], b[i]);
}
}
#[test]
fn test_log_normal_f64() {
let a = gen_log_normal::<f64>(0, N, 0.0, 1.0);
for i in 0..N {
assert!(a[i] != 0.0);
}
let b = gen_log_normal::<f64>(0, N, -1.0, 2.0);
for i in 0..N {
assert_ne!(a[i], b[i]);
}
}
}
//! Bindings from "curand.h" generated by rust-bindgen 0.60.1
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CUstream_st {
_unused: [u8; 0],
}
pub type cudaStream_t = *mut CUstream_st;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum libraryPropertyType_t {
MAJOR_VERSION = 0,
MINOR_VERSION = 1,
PATCH_LEVEL = 2,
}
pub use self::libraryPropertyType_t as libraryPropertyType;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum curandStatus {
CURAND_STATUS_SUCCESS = 0,
CURAND_STATUS_VERSION_MISMATCH = 100,
CURAND_STATUS_NOT_INITIALIZED = 101,
CURAND_STATUS_ALLOCATION_FAILED = 102,
CURAND_STATUS_TYPE_ERROR = 103,
CURAND_STATUS_OUT_OF_RANGE = 104,
CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105,
CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106,
CURAND_STATUS_LAUNCH_FAILURE = 201,
CURAND_STATUS_PREEXISTING_FAILURE = 202,
CURAND_STATUS_INITIALIZATION_FAILED = 203,
CURAND_STATUS_ARCH_MISMATCH = 204,
CURAND_STATUS_INTERNAL_ERROR = 999,
}
pub use self::curandStatus as curandStatus_t;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum curandRngType {
CURAND_RNG_TEST = 0,
CURAND_RNG_PSEUDO_DEFAULT = 100,
CURAND_RNG_PSEUDO_XORWOW = 101,
CURAND_RNG_PSEUDO_MRG32K3A = 121,
CURAND_RNG_PSEUDO_MTGP32 = 141,
CURAND_RNG_PSEUDO_MT19937 = 142,
CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161,
CURAND_RNG_QUASI_DEFAULT = 200,
CURAND_RNG_QUASI_SOBOL32 = 201,
CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202,
CURAND_RNG_QUASI_SOBOL64 = 203,
CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204,
}
pub use self::curandRngType as curandRngType_t;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum curandOrdering {
CURAND_ORDERING_PSEUDO_BEST = 100,
CURAND_ORDERING_PSEUDO_DEFAULT = 101,
CURAND_ORDERING_PSEUDO_SEEDED = 102,
CURAND_ORDERING_PSEUDO_LEGACY = 103,
CURAND_ORDERING_QUASI_DEFAULT = 201,
}
pub use self::curandOrdering as curandOrdering_t;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum curandDirectionVectorSet {
CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101,
CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102,
CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103,
CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104,
}
pub use self::curandDirectionVectorSet as curandDirectionVectorSet_t;
pub type curandDirectionVectors32_t = [core::ffi::c_uint; 32usize];
pub type curandDirectionVectors64_t = [core::ffi::c_ulonglong; 64usize];
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct curandGenerator_st {
_unused: [u8; 0],
}
pub type curandGenerator_t = *mut curandGenerator_st;
pub type curandDistribution_st = f64;
pub type curandDistribution_t = *mut curandDistribution_st;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct curandDistributionShift_st {
_unused: [u8; 0],
}
pub type curandDistributionShift_t = *mut curandDistributionShift_st;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct curandDistributionM2Shift_st {
_unused: [u8; 0],
}
pub type curandDistributionM2Shift_t = *mut curandDistributionM2Shift_st;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct curandHistogramM2_st {
_unused: [u8; 0],
}
pub type curandHistogramM2_t = *mut curandHistogramM2_st;
pub type curandHistogramM2K_st = core::ffi::c_uint;
pub type curandHistogramM2K_t = *mut curandHistogramM2K_st;
pub type curandHistogramM2V_st = curandDistribution_st;
pub type curandHistogramM2V_t = *mut curandHistogramM2V_st;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct curandDiscreteDistribution_st {
_unused: [u8; 0],
}
pub type curandDiscreteDistribution_t = *mut curandDiscreteDistribution_st;
#[repr(u32)]
#[derive(Debug, Copy, Clone, Hash, PartialOrd, Ord, PartialEq, Eq)]
pub enum curandMethod {
CURAND_CHOOSE_BEST = 0,
CURAND_ITR = 1,
CURAND_KNUTH = 2,
CURAND_HITR = 3,
CURAND_M1 = 4,
CURAND_M2 = 5,
CURAND_BINARY_SEARCH = 6,
CURAND_DISCRETE_GAUSS = 7,
CURAND_REJECTION = 8,
CURAND_DEVICE_API = 9,
CURAND_FAST_REJECTION = 10,
CURAND_3RD = 11,
CURAND_DEFINITION = 12,
CURAND_POISSON = 13,
}
pub use self::curandMethod as curandMethod_t;
extern "C" {
pub fn curandCreateGenerator(
generator: *mut curandGenerator_t,
rng_type: curandRngType_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandCreateGeneratorHost(
generator: *mut curandGenerator_t,
rng_type: curandRngType_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandDestroyGenerator(generator: curandGenerator_t) -> curandStatus_t;
}
extern "C" {
pub fn curandGetVersion(version: *mut core::ffi::c_int) -> curandStatus_t;
}
extern "C" {
pub fn curandGetProperty(
type_: libraryPropertyType,
value: *mut core::ffi::c_int,
) -> curandStatus_t;
}
extern "C" {
pub fn curandSetStream(generator: curandGenerator_t, stream: cudaStream_t) -> curandStatus_t;
}
extern "C" {
pub fn curandSetPseudoRandomGeneratorSeed(
generator: curandGenerator_t,
seed: core::ffi::c_ulonglong,
) -> curandStatus_t;
}
extern "C" {
pub fn curandSetGeneratorOffset(
generator: curandGenerator_t,
offset: core::ffi::c_ulonglong,
) -> curandStatus_t;
}
extern "C" {
pub fn curandSetGeneratorOrdering(
generator: curandGenerator_t,
order: curandOrdering_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandSetQuasiRandomGeneratorDimensions(
generator: curandGenerator_t,
num_dimensions: core::ffi::c_uint,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerate(
generator: curandGenerator_t,
outputPtr: *mut core::ffi::c_uint,
num: usize,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateLongLong(
generator: curandGenerator_t,
outputPtr: *mut core::ffi::c_ulonglong,
num: usize,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateUniform(
generator: curandGenerator_t,
outputPtr: *mut f32,
num: usize,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateUniformDouble(
generator: curandGenerator_t,
outputPtr: *mut f64,
num: usize,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateNormal(
generator: curandGenerator_t,
outputPtr: *mut f32,
n: usize,
mean: f32,
stddev: f32,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateNormalDouble(
generator: curandGenerator_t,
outputPtr: *mut f64,
n: usize,
mean: f64,
stddev: f64,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateLogNormal(
generator: curandGenerator_t,
outputPtr: *mut f32,
n: usize,
mean: f32,
stddev: f32,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateLogNormalDouble(
generator: curandGenerator_t,
outputPtr: *mut f64,
n: usize,
mean: f64,
stddev: f64,
) -> curandStatus_t;
}
extern "C" {
pub fn curandCreatePoissonDistribution(
lambda: f64,
discrete_distribution: *mut curandDiscreteDistribution_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandDestroyDistribution(
discrete_distribution: curandDiscreteDistribution_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGeneratePoisson(
generator: curandGenerator_t,
outputPtr: *mut core::ffi::c_uint,
n: usize,
lambda: f64,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGeneratePoissonMethod(
generator: curandGenerator_t,
outputPtr: *mut core::ffi::c_uint,
n: usize,
lambda: f64,
method: curandMethod_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateBinomial(
generator: curandGenerator_t,
outputPtr: *mut core::ffi::c_uint,
num: usize,
n: core::ffi::c_uint,
p: f64,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateBinomialMethod(
generator: curandGenerator_t,
outputPtr: *mut core::ffi::c_uint,
num: usize,
n: core::ffi::c_uint,
p: f64,
method: curandMethod_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGenerateSeeds(generator: curandGenerator_t) -> curandStatus_t;
}
extern "C" {
pub fn curandGetDirectionVectors32(
vectors: *mut *mut curandDirectionVectors32_t,
set: curandDirectionVectorSet_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGetScrambleConstants32(constants: *mut *mut core::ffi::c_uint) -> curandStatus_t;
}
extern "C" {
pub fn curandGetDirectionVectors64(
vectors: *mut *mut curandDirectionVectors64_t,
set: curandDirectionVectorSet_t,
) -> curandStatus_t;
}
extern "C" {
pub fn curandGetScrambleConstants64(
constants: *mut *mut core::ffi::c_ulonglong,
) -> curandStatus_t;
}
#include "curand.h"
#!/bin/bash
set -exu
bindgen \
--allowlist-type="^CU.*" \
--allowlist-type="^cuuint(32|64)_t" \
--allowlist-type="^cudaError_enum" \
--allowlist-type="^cu.*Complex$" \
--allowlist-type="^cuda.*" \
--allowlist-type="^libraryPropertyType.*" \
--allowlist-var="^CU.*" \
--allowlist-function="^cu.*" \
--default-enum-style=rust \
--no-doc-comments \
--with-derive-default \
--with-derive-eq \
--with-derive-hash \
--with-derive-ord \
--size_t-is-usize \
--use-core \
wrapper.h -- -I/usr/local/cuda/include \
> sys.rs
//! Wrappers around the [CUDA driver API](https://docs.nvidia.com/cuda/cuda-driver-api/index.html),
//! in three levels. See crate documentation for description of each.
//!
//! # safe api usage
//!
//! 1. Instantiate a [CudaDevice]:
//!
//! ```rust
//! # use cudarc::driver::*;
//! let device = CudaDevice::new(0).unwrap();
//! ```
//!
//! 2. Allocate device memory with host data with [CudaDevice::htod_copy()], [CudaDevice::alloc_zeros()],
//! or [CudaDevice::htod_sync_copy()].
//!
//! You can also copy data to CudaSlice using [CudaDevice::htod_sync_copy_into()]
//!
//! ```rust
//! # use cudarc::driver::*;
//! # let device = CudaDevice::new(0).unwrap();
//! let a_dev: CudaSlice<f32> = device.alloc_zeros(10).unwrap();
//! let b_dev: CudaSlice<f32> = device.htod_copy(vec![0.0; 10]).unwrap();
//! let c_dev: CudaSlice<f32> = device.htod_sync_copy(&[1.0, 2.0, 3.0]).unwrap();
//! ```
//!
//! 3. Transfer to host memory with [CudaDevice::sync_reclaim()], [CudaDevice::dtoh_sync_copy()],
//! or [CudaDevice::dtoh_sync_copy_into()]
//!
//! ```rust
//! # use cudarc::driver::*;
//! # use std::rc::Rc;
//! # let device = CudaDevice::new(0).unwrap();
//! let a_dev: CudaSlice<f32> = device.alloc_zeros(10).unwrap();
//! let mut a_buf: [f32; 10] = [1.0; 10];
//! device.dtoh_sync_copy_into(&a_dev, &mut a_buf).unwrap();
//! assert_eq!(a_buf, [0.0; 10]);
//! let a_host: Vec<f32> = device.sync_reclaim(a_dev).unwrap();
//! assert_eq!(&a_host, &[0.0; 10]);
//! ```
//!
//! ## Mutating device memory - [CudaFunction]
//!
//! See [LaunchAsync] and [CudaFunction].
//!
//! In order to mutate device data, you need to use cuda kernels.
//!
//! Loading kernels is done with [CudaDevice::load_ptx()]
//! ```rust
//! # use cudarc::{driver::*, nvrtc::*};
//! let ptx = compile_ptx("extern \"C\" __global__ void my_function(float *out) { }").unwrap();
//! let device = CudaDevice::new(0).unwrap();
//! device.load_ptx(ptx, "module_name", &["my_function"]).unwrap();
//! ```
//!
//! Retrieve the function using the registered module name & actual function name:
//! ```rust
//! # use cudarc::{driver::*, nvrtc::*};
//! # let ptx = compile_ptx("extern \"C\" __global__ void my_function(float *out) { }").unwrap();
//! # let device = CudaDevice::new(0).unwrap();
//! # device.load_ptx(ptx, "module_name", &["my_function"]).unwrap();
//! let func: CudaFunction = device.get_func("module_name", "my_function").unwrap();
//! ```
//!
//! Asynchronously execute the kernel:
//! ```rust
//! # use cudarc::{driver::*, nvrtc::*};
//! # let ptx = compile_ptx("extern \"C\" __global__ void my_function(float *out) { }").unwrap();
//! # let device = CudaDevice::new(0).unwrap();
//! # device.load_ptx(ptx, "module_name", &["my_function"]).unwrap();
//! # let func: CudaFunction = device.get_func("module_name", "my_function").unwrap();
//! let mut a = device.alloc_zeros::<f32>(10).unwrap();
//! let cfg = LaunchConfig::for_num_elems(10);
//! unsafe { func.launch(cfg, (&mut a,)) }.unwrap();
//! ```
//!
//! Note: Launching kernels is **extremely unsafe**. See [LaunchAsync] for more info.
//!
//! ## Sub slices of [CudaSlice]
//!
//! For some operations, it is necessary to only operate on a small part of a single [CudaSlice].
//! For example, the slice may represent a batch of items, and you want to run separate kernels
//! on each of the items in the batch.
//!
//! Use [CudaSlice::try_slice()] and [CudaSlice::try_slice_mut()] for this. The returned
//! views ([CudaView] and [CudaViewMut]) hold references to the owning [CudaSlice],
//! so Rust's ownership system handles safety here.
//!
//! These view structs can be used with [CudaFunction].
//!
//! ```rust
//! # use cudarc::{driver::*, nvrtc::*};
//! # let ptx = compile_ptx("extern \"C\" __global__ void my_function(float *out) { }").unwrap();
//! # let device = CudaDevice::new(0).unwrap();
//! # device.load_ptx(ptx, "module_name", &["my_function"]).unwrap();
//! let mut a: CudaSlice<f32> = device.alloc_zeros::<f32>(3 * 10).unwrap();
//! for i_batch in 0..3 {
//! let mut a_sub_view: CudaViewMut<f32> = a.try_slice_mut(i_batch * 10..).unwrap();
//! let f: CudaFunction = device.get_func("module_name", "my_function").unwrap();
//! let cfg = LaunchConfig::for_num_elems(10);
//! unsafe { f.launch(cfg, (&mut a_sub_view,)) }.unwrap();
//! }
//! ```
//!
//! #### A note on implementation
//!
//! It would be possible to re-use [CudaSlice] itself for sub-slices, however that would involve adding
//! another structure underneath the hood that is wrapped in an [std::sync::Arc] to minimize data cloning.
//! Overall it seemed more complex than the current implementation.
//!
//! # Multi threading
//!
//! In order to use a [CudaDevice] on multiple threads, you must call [CudaDevice::bind_to_thread] on
//! each thread **before you use the device**.
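//!
//! A minimal sketch (assuming the device is shared across threads via its `Arc`):
//! ```rust
//! # use cudarc::driver::*;
//! let device = CudaDevice::new(0).unwrap();
//! let dev = device.clone();
//! std::thread::spawn(move || {
//!     dev.bind_to_thread().unwrap();
//!     let _a: CudaSlice<f32> = dev.alloc_zeros(10).unwrap();
//! })
//! .join()
//! .unwrap();
//! ```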
//!
//! # Safety
//!
//! There are a number of aspects to this, but at a high level this API utilizes [std::sync::Arc] to
//! control when [CudaDevice] can be dropped.
//!
//! ### Context/Stream lifetimes
//!
//! The first part of safety is ensuring that [crate::driver::sys::CUcontext],
//! [crate::driver::sys::CUdevice], and [crate::driver::sys::CUstream] all
//! live the required amount of time (i.e. device outlives context, which outlives stream).
//!
//! This is accomplished by putting all of them inside one struct, the [CudaDevice]. There are other ways,
//! such as adding newtypes that carry lifetimes with them, but this approach was chosen to make working
//! with device pointers easier.
//!
//! Additionally, [CudaDevice] implements [Drop] to release all the data from the device in
//! the expected way.
//!
//! ### Device Data lifetimes
//!
//! The next part of safety is ensuring that [CudaSlice] do not outlive
//! the [CudaDevice]. For usability, each [CudaSlice] owns an `Arc<CudaDevice>`
//! to ensure the device stays alive.
//!
//! Additionally we don't want to double free any device pointers, so free is only
//! called when the device pointer is dropped. Thanks rust!
//!
//! ### Host and Device Data lifetimes
//!
//! Each device allocation can be associated with a host allocation. We want to ensure
//! that these have the same lifetimes *when copying data between them*.
//!
//! This is done via the various copy methods. Methods that don't take ownership
//! of the host data need to be executed synchronously, since the host reference is
//! only borrowed. Methods that do own the host data can be executed asynchronously.
//!
//! ### Single stream operations
//!
//! The next part of safety is ensuring that all operations happen on a single stream.
//! This ensures that data isn't mutated by more than 1 stream at a time, and also
//! ensures data isn't used before it's allocated, or used after it's freed.
//!
//! Another important aspect of this is ensuring that mutability in an async setting
//! is sound, and something can't be freed while it's being used in a kernel.
//!
//! To this end every operation by default happens on the same stream.
//!
//! Multi stream is supported via [CudaStream]; however, it automatically
//! synchronizes with the main stream on creation & on drop. It is still possible
//! to be unsafe in a multi stream context though.
pub mod result;
pub mod safe;
#[allow(warnings)]
pub mod sys;
pub use safe::*;
//! A thin wrapper around [sys].
//!
//! While all the functions here will return [Result], they are
//! mostly all still unsafe because order of operations
//! really matters.
//!
//! This also primarily exposes the `*_async` versions of functions,
//! because mixing sync and async calls is confusing and even more unsafe.
//!
//! This module also groups functions into sub-modules
//! to make naming easier. For example [sys::cuStreamCreate()]
//! turns into [stream::create()], where [stream] is a module.
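//!
//! A minimal sketch of driving these wrappers directly (order matters; the safe API
//! does this bookkeeping for you):
//! ```rust
//! # use cudarc::driver::result;
//! result::init().unwrap();
//! let dev = result::device::get(0).unwrap();
//! let ctx = unsafe { result::primary_ctx::retain(dev) }.unwrap();
//! unsafe { result::ctx::set_current(ctx) }.unwrap();
//! let stream = result::stream::create(result::stream::StreamKind::NonBlocking).unwrap();
//! unsafe { result::stream::destroy(stream) }.unwrap();
//! unsafe { result::primary_ctx::release(dev) }.unwrap();
//! ```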
use super::sys;
use core::ffi::{c_uchar, c_uint, c_void, CStr};
use std::mem::MaybeUninit;
/// Wrapper around [sys::CUresult]. See
/// nvidia's [CUresult docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9)
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct DriverError(pub sys::CUresult);
impl sys::CUresult {
#[inline]
pub fn result(self) -> Result<(), DriverError> {
match self {
sys::CUresult::CUDA_SUCCESS => Ok(()),
_ => Err(DriverError(self)),
}
}
}
impl DriverError {
/// Gets the name for this error.
///
/// See [cuGetErrorName() docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__ERROR.html#group__CUDA__ERROR_1g2c4ac087113652bb3d1f95bf2513c468)
pub fn error_name(&self) -> Result<&CStr, DriverError> {
let mut err_str = MaybeUninit::uninit();
unsafe {
sys::cuGetErrorName(self.0, err_str.as_mut_ptr()).result()?;
Ok(CStr::from_ptr(err_str.assume_init()))
}
}
/// Gets the error string for this error.
///
/// See [cuGetErrorString() docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__ERROR.html#group__CUDA__ERROR_1g72758fcaf05b5c7fac5c25ead9445ada)
pub fn error_string(&self) -> Result<&CStr, DriverError> {
let mut err_str = MaybeUninit::uninit();
unsafe {
sys::cuGetErrorString(self.0, err_str.as_mut_ptr()).result()?;
Ok(CStr::from_ptr(err_str.assume_init()))
}
}
}
impl std::fmt::Debug for DriverError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let err_str = self.error_string().unwrap();
f.debug_tuple("DriverError")
.field(&self.0)
.field(&err_str)
.finish()
}
}
#[cfg(feature = "std")]
impl std::fmt::Display for DriverError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self:?}")
}
}
#[cfg(feature = "std")]
impl std::error::Error for DriverError {}
/// Initializes the CUDA driver API.
/// **MUST BE CALLED BEFORE ANYTHING ELSE**
///
/// See [cuInit() docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3)
pub fn init() -> Result<(), DriverError> {
unsafe { sys::cuInit(0).result() }
}
pub mod device {
//! Device management functions (`cuDevice*`).
//!
//! See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE)
use super::{sys, DriverError};
use core::ffi::c_int;
use std::mem::MaybeUninit;
/// Get a device for a specific ordinal.
/// See [cuDeviceGet() docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g8bdd1cc7201304b01357b8034f6587cb).
pub fn get(ordinal: c_int) -> Result<sys::CUdevice, DriverError> {
let mut dev = MaybeUninit::uninit();
unsafe {
sys::cuDeviceGet(dev.as_mut_ptr(), ordinal).result()?;
Ok(dev.assume_init())
}
}
/// Gets the number of available devices.
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74)
pub fn get_count() -> Result<c_int, DriverError> {
let mut count = MaybeUninit::uninit();
unsafe {
sys::cuDeviceGetCount(count.as_mut_ptr()).result()?;
Ok(count.assume_init())
}
}
/// Returns the total amount of memory in bytes on the device.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1gc6a0d6551335a3780f9f3c967a0fde5d)
///
/// # Safety
/// Must be a device returned from [get].
pub unsafe fn total_mem(dev: sys::CUdevice) -> Result<usize, DriverError> {
let mut bytes = MaybeUninit::uninit();
sys::cuDeviceTotalMem_v2(bytes.as_mut_ptr(), dev).result()?;
Ok(bytes.assume_init())
}
/// Get an attribute of a device.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g8c6e2c7b5c7c8b7e6f7f4c2b7f6d9c5d)
///
/// # Safety
/// Must be a device returned from [get].
pub unsafe fn get_attribute(
dev: sys::CUdevice,
attrib: sys::CUdevice_attribute,
) -> Result<i32, DriverError> {
let mut value = MaybeUninit::uninit();
sys::cuDeviceGetAttribute(value.as_mut_ptr(), attrib, dev).result()?;
Ok(value.assume_init())
}
}
pub mod function {
use super::sys::{self, CUfunction_attribute_enum};
/// Sets the specific attribute of a cuda function.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html#group__CUDART__EXECUTION_1g317e77d2657abf915fd9ed03e75f3eb0)
///
/// # Safety
/// Function must exist.
pub unsafe fn set_function_attribute(
f: sys::CUfunction,
attribute: CUfunction_attribute_enum,
value: i32,
) -> Result<(), super::DriverError> {
unsafe {
sys::cuFuncSetAttribute(f, attribute, value).result()?;
}
Ok(())
}
}
pub mod occupancy {
use core::{
ffi::{c_int, c_uint},
mem::MaybeUninit,
};
use super::{sys, DriverError};
/// Returns the dynamic shared memory available per block when launching `num_blocks` blocks on an SM.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gae02af6a9df9e1bbd51941af631bce69)
///
/// # Safety
/// Function must exist.
pub unsafe fn available_dynamic_shared_mem_per_block(
f: sys::CUfunction,
num_blocks: c_int,
block_size: c_int,
) -> Result<usize, DriverError> {
let mut dynamic_smem_size = MaybeUninit::uninit();
unsafe {
sys::cuOccupancyAvailableDynamicSMemPerBlock(
dynamic_smem_size.as_mut_ptr(),
f,
num_blocks,
block_size,
)
.result()?;
}
Ok(dynamic_smem_size.assume_init())
}
/// Returns occupancy of a function.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98)
///
/// # Safety
/// Function must exist.
pub unsafe fn max_active_block_per_multiprocessor(
f: sys::CUfunction,
block_size: c_int,
dynamic_smem_size: usize,
) -> Result<i32, DriverError> {
let mut num_blocks = MaybeUninit::uninit();
unsafe {
sys::cuOccupancyMaxActiveBlocksPerMultiprocessor(
num_blocks.as_mut_ptr(),
f,
block_size,
dynamic_smem_size,
)
.result()?;
}
Ok(num_blocks.assume_init())
}
/// Returns occupancy of a function.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1g8f1da4d4983e5c3025447665423ae2c2)
///
/// # Safety
/// Function must exist. No invalid flags.
pub unsafe fn max_active_block_per_multiprocessor_with_flags(
f: sys::CUfunction,
block_size: c_int,
dynamic_smem_size: usize,
flags: c_uint,
) -> Result<i32, DriverError> {
let mut num_blocks = MaybeUninit::uninit();
unsafe {
sys::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
num_blocks.as_mut_ptr(),
f,
block_size,
dynamic_smem_size,
flags,
)
.result()?;
}
Ok(num_blocks.assume_init())
}
/// Suggest a launch configuration with reasonable occupancy.
///
/// Returns (min_grid_size, block_size)
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gf179c4ab78962a8468e41c3f57851f03)
///
/// # Safety
/// Function must exist and the shared memory function must be correct. No invalid flags.
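///
/// A hedged sketch (assuming `f` is a loaded [sys::CUfunction]; passing `None` for the
/// block-size-to-shared-mem function means dynamic shared memory is constant):
/// ```ignore
/// let (min_grid_size, block_size) = unsafe { max_potential_block_size(f, None, 0, 0) }?;
/// ```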
pub unsafe fn max_potential_block_size(
f: sys::CUfunction,
block_size_to_dynamic_smem_size: sys::CUoccupancyB2DSize,
dynamic_smem_size: usize,
block_size_limit: c_int,
) -> Result<(i32, i32), DriverError> {
let mut min_grid_size = MaybeUninit::uninit();
let mut block_size = MaybeUninit::uninit();
unsafe {
sys::cuOccupancyMaxPotentialBlockSize(
min_grid_size.as_mut_ptr(),
block_size.as_mut_ptr(),
f,
block_size_to_dynamic_smem_size,
dynamic_smem_size,
block_size_limit,
)
.result()?;
}
Ok((min_grid_size.assume_init(), block_size.assume_init()))
}
/// Suggest a launch configuration with reasonable occupancy.
///
/// Returns (min_grid_size, block_size)
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1g04c0bb65630f82d9b99a5ca0203ee5aa)
///
/// # Safety
/// Function must exist and the shared memory function must be correct. No invalid flags.
pub unsafe fn max_potential_block_size_with_flags(
f: sys::CUfunction,
block_size_to_dynamic_smem_size: sys::CUoccupancyB2DSize,
dynamic_smem_size: usize,
block_size_limit: c_int,
flags: c_uint,
) -> Result<(i32, i32), DriverError> {
let mut min_grid_size = MaybeUninit::uninit();
let mut block_size = MaybeUninit::uninit();
unsafe {
sys::cuOccupancyMaxPotentialBlockSizeWithFlags(
min_grid_size.as_mut_ptr(),
block_size.as_mut_ptr(),
f,
block_size_to_dynamic_smem_size,
dynamic_smem_size,
block_size_limit,
flags,
)
.result()?;
}
Ok((min_grid_size.assume_init(), block_size.assume_init()))
}
}
pub mod primary_ctx {
//! Primary context management functions (`cuDevicePrimaryCtx*`).
//!
//! See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX)
use super::{sys, DriverError};
use std::mem::MaybeUninit;
/// Creates a primary context on the device and pushes it onto the primary context stack.
/// Call [release] to free it.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g9051f2d5c31501997a6cb0530290a300)
///
/// # Safety
///
/// This is only safe with a device that was returned from [super::device::get].
pub unsafe fn retain(dev: sys::CUdevice) -> Result<sys::CUcontext, DriverError> {
let mut ctx = MaybeUninit::uninit();
sys::cuDevicePrimaryCtxRetain(ctx.as_mut_ptr(), dev).result()?;
Ok(ctx.assume_init())
}
/// Release a reference to the current primary context.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1gf2a8bc16f8df0c88031f6a1ba3d6e8ad).
///
/// # Safety
///
/// This is only safe with a device that was returned from [super::device::get].
pub unsafe fn release(dev: sys::CUdevice) -> Result<(), DriverError> {
sys::cuDevicePrimaryCtxRelease_v2(dev).result()
}
}
pub mod ctx {
//! Context management functions (`cuCtx*`).
//!
//! See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX)
use super::{sys, DriverError};
use std::mem::MaybeUninit;
/// Binds the specified CUDA context to the calling CPU thread.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7)
///
/// # Safety
///
/// This has weird behavior depending on the value of `ctx`. See cuda docs for more info.
/// In general this should only be called with an already initialized context,
/// and one that wasn't already freed.
pub unsafe fn set_current(ctx: sys::CUcontext) -> Result<(), DriverError> {
sys::cuCtxSetCurrent(ctx).result()
}
/// Returns the CUDA context bound to the calling CPU thread if there is one.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g8f13165846b73750693640fb3e8380d0)
pub fn get_current() -> Result<Option<sys::CUcontext>, DriverError> {
let mut ctx = MaybeUninit::uninit();
unsafe {
sys::cuCtxGetCurrent(ctx.as_mut_ptr()).result()?;
let ctx: sys::CUcontext = ctx.assume_init();
if ctx.is_null() {
Ok(None)
} else {
Ok(Some(ctx))
}
}
}
}
pub mod stream {
//! Stream management functions (`cuStream*`).
//!
//! See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM).
use super::{sys, DriverError};
use std::mem::MaybeUninit;
/// The kind of stream to initialize.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4)
pub enum StreamKind {
/// From cuda docs:
/// > Default stream creation flag.
Default,
/// From cuda docs:
/// > Specifies that work running in the created stream
/// > may run concurrently with work in stream 0 (the NULL stream),
/// > and that the created stream should perform no implicit
/// > synchronization with stream 0.
NonBlocking,
}
impl StreamKind {
fn flags(self) -> sys::CUstream_flags {
match self {
Self::Default => sys::CUstream_flags::CU_STREAM_DEFAULT,
Self::NonBlocking => sys::CUstream_flags::CU_STREAM_NON_BLOCKING,
}
}
}
/// The null stream, which is just a null pointer. **Recommend not using this.**
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream)
pub fn null() -> sys::CUstream {
std::ptr::null_mut()
}
/// Creates a stream with the specified kind.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4)
pub fn create(kind: StreamKind) -> Result<sys::CUstream, DriverError> {
let mut stream = MaybeUninit::uninit();
unsafe {
sys::cuStreamCreate(stream.as_mut_ptr(), kind.flags() as u32).result()?;
Ok(stream.assume_init())
}
}
/// Wait until a stream's tasks are completed.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad)
///
/// # Safety
///
/// This should only be called with a stream created by [create] and not already
/// destroyed. This follows default stream semantics, see relevant cuda docs.
pub unsafe fn synchronize(stream: sys::CUstream) -> Result<(), DriverError> {
sys::cuStreamSynchronize(stream).result()
}
/// Destroys a stream.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758)
///
/// # Safety
///
/// This should only be called with a stream created by [create] and not already
/// destroyed. This follows default stream semantics, see relevant cuda docs.
pub unsafe fn destroy(stream: sys::CUstream) -> Result<(), DriverError> {
sys::cuStreamDestroy_v2(stream).result()
}
/// Make a compute stream wait on an event.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g6a898b652dfc6aa1d5c8d97062618b2f)
///
/// # Safety
/// 1. Both stream and event must not have been freed already
pub unsafe fn wait_event(
stream: sys::CUstream,
event: sys::CUevent,
flags: sys::CUevent_wait_flags,
) -> Result<(), DriverError> {
sys::cuStreamWaitEvent(stream, event, flags as u32).result()
}
}
/// Allocates memory with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MALLOC__ASYNC.html#group__CUDA__MALLOC__ASYNC_1g13413273e84a641bce1929eae9e6501f)
///
/// # Safety
/// 1. The stream should be an already created stream.
/// 2. The memory returned by this is unset, which may be invalid for `T`.
/// 3. All uses of this memory must be on the same stream.
pub unsafe fn malloc_async(
stream: sys::CUstream,
num_bytes: usize,
) -> Result<sys::CUdeviceptr, DriverError> {
let mut dev_ptr = MaybeUninit::uninit();
sys::cuMemAllocAsync(dev_ptr.as_mut_ptr(), num_bytes, stream).result()?;
Ok(dev_ptr.assume_init())
}
/// Allocates memory
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467)
///
/// # Safety
/// 1. The memory returned by this is unset, which may be invalid for `T`.
pub unsafe fn malloc_sync(num_bytes: usize) -> Result<sys::CUdeviceptr, DriverError> {
let mut dev_ptr = MaybeUninit::uninit();
sys::cuMemAlloc_v2(dev_ptr.as_mut_ptr(), num_bytes).result()?;
Ok(dev_ptr.assume_init())
}
/// Frees memory with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MALLOC__ASYNC.html#group__CUDA__MALLOC__ASYNC_1g41acf4131f672a2a75cd93d3241f10cf)
///
/// # Safety
/// 1. The stream should be an already created stream.
/// 2. The memory should have been allocated on this stream.
/// 3. The memory should not have been freed already (double free)
pub unsafe fn free_async(dptr: sys::CUdeviceptr, stream: sys::CUstream) -> Result<(), DriverError> {
sys::cuMemFreeAsync(dptr, stream).result()
}
/// Frees memory allocated with [malloc_sync].
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a)
///
/// # Safety
/// 1. The memory should have been allocated with [malloc_sync]
pub unsafe fn free_sync(dptr: sys::CUdeviceptr) -> Result<(), DriverError> {
sys::cuMemFree_v2(dptr).result()
}
/// Frees device memory.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a)
///
/// # Safety
/// 1. Memory must only be freed once.
/// 2. All async accesses to this pointer must have been completed.
pub unsafe fn memory_free(device_ptr: sys::CUdeviceptr) -> Result<(), DriverError> {
sys::cuMemFree_v2(device_ptr).result()
}
/// Sets device memory with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627)
///
/// # Safety
/// 1. The resulting memory pattern may not be valid for `T`.
/// 2. The device pointer should not have been freed already (double free)
/// 3. The stream should be the stream the memory was allocated on.
pub unsafe fn memset_d8_async(
dptr: sys::CUdeviceptr,
uc: c_uchar,
num_bytes: usize,
stream: sys::CUstream,
) -> Result<(), DriverError> {
sys::cuMemsetD8Async(dptr, uc, num_bytes, stream).result()
}
/// Sets device memory with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b)
///
/// # Safety
/// 1. The resulting memory pattern may not be valid for `T`.
/// 2. The device pointer should not have been freed already (double free)
pub unsafe fn memset_d8_sync(
dptr: sys::CUdeviceptr,
uc: c_uchar,
num_bytes: usize,
) -> Result<(), DriverError> {
sys::cuMemsetD8_v2(dptr, uc, num_bytes).result()
}
/// Copies memory from Host to Device with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169)
///
/// # Safety
/// **This function is asynchronous** in most cases, so the data from `src`
/// will be copied at a later point after this function returns.
///
/// 1. `T` must be the type that device pointer was allocated with.
/// 2. The device pointer should not have been freed already (double free)
/// 3. The stream should be the stream the memory was allocated on.
/// 4. `src` must not be moved
pub unsafe fn memcpy_htod_async<T>(
dst: sys::CUdeviceptr,
src: &[T],
stream: sys::CUstream,
) -> Result<(), DriverError> {
sys::cuMemcpyHtoDAsync_v2(
dst,
src.as_ptr() as *const _,
std::mem::size_of_val(src),
stream,
)
.result()
}
/// Copies memory from Host to Device
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169)
///
/// # Safety
/// **This function is synchronous**
///
/// 1. `T` must be the type that device pointer was allocated with.
/// 2. The device pointer should not have been freed already (double free)
/// 3. `src` must not be moved
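///
/// A hedged sketch of a synchronous round trip (assuming an initialized context,
/// e.g. via the module-level example):
/// ```ignore
/// let dptr = unsafe { malloc_sync(4 * std::mem::size_of::<f32>()) }.unwrap();
/// unsafe { memcpy_htod_sync(dptr, &[1.0f32, 2.0, 3.0, 4.0]) }.unwrap();
/// let mut host = [0.0f32; 4];
/// unsafe { memcpy_dtoh_sync(&mut host, dptr) }.unwrap();
/// assert_eq!(host, [1.0, 2.0, 3.0, 4.0]);
/// unsafe { free_sync(dptr) }.unwrap();
/// ```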
pub unsafe fn memcpy_htod_sync<T>(dst: sys::CUdeviceptr, src: &[T]) -> Result<(), DriverError> {
sys::cuMemcpyHtoD_v2(dst, src.as_ptr() as *const _, std::mem::size_of_val(src)).result()
}
/// Copies memory from Device to Host with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362)
///
/// # Safety
/// **This function is asynchronous** in most cases, so `dst` will be
/// mutated at a later point after this function returns.
///
/// 1. `T` must be the type that device pointer was allocated with.
/// 2. The device pointer should not have been freed already (double free)
/// 3. The stream should be the stream the memory was allocated on.
pub unsafe fn memcpy_dtoh_async<T>(
dst: &mut [T],
src: sys::CUdeviceptr,
stream: sys::CUstream,
) -> Result<(), DriverError> {
sys::cuMemcpyDtoHAsync_v2(
dst.as_mut_ptr() as *mut _,
src,
std::mem::size_of_val(dst),
stream,
)
.result()
}
/// Copies memory from Device to Host with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g3480368ee0208a98f75019c9a8450893)
///
/// # Safety
/// **This function is synchronous**
///
/// 1. `T` must be the type that device pointer was allocated with.
/// 2. The device pointer should not have been freed already (double free)
pub unsafe fn memcpy_dtoh_sync<T>(dst: &mut [T], src: sys::CUdeviceptr) -> Result<(), DriverError> {
sys::cuMemcpyDtoH_v2(dst.as_mut_ptr() as *mut _, src, std::mem::size_of_val(dst)).result()
}
/// Copies memory from Device to Device with stream ordered semantics.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g39ea09ba682b8eccc9c3e0c04319b5c8)
///
/// # Safety
/// 1. `T` must be the type that BOTH device pointers were allocated with.
/// 2. Neither device pointer should have been freed already (double free)
/// 3. The stream should be the stream the memory was allocated on.
pub unsafe fn memcpy_dtod_async(
dst: sys::CUdeviceptr,
src: sys::CUdeviceptr,
num_bytes: usize,
stream: sys::CUstream,
) -> Result<(), DriverError> {
sys::cuMemcpyDtoDAsync_v2(dst, src, num_bytes, stream).result()
}
/// Copies memory from Device to Device
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g1725774abf8b51b91945f3336b778c8b)
///
/// # Safety
/// 1. `T` must be the type that BOTH device pointers were allocated with.
/// 2. Neither device pointer should have been freed already (double free)
pub unsafe fn memcpy_dtod_sync(
dst: sys::CUdeviceptr,
src: sys::CUdeviceptr,
num_bytes: usize,
) -> Result<(), DriverError> {
sys::cuMemcpyDtoD_v2(dst, src, num_bytes).result()
}
/// Returns (free, total) memory in bytes.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0)
pub fn mem_get_info() -> Result<(usize, usize), DriverError> {
let mut free = 0;
let mut total = 0;
unsafe { sys::cuMemGetInfo_v2(&mut free as *mut _, &mut total as *mut _) }.result()?;
Ok((free, total))
}
pub mod module {
//! Module management functions (`cuModule*`).
//!
//! See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE)
use super::{sys, DriverError};
use core::ffi::c_void;
use std::ffi::CString;
use std::mem::MaybeUninit;
/// Loads a compute module from a given file.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g366093bd269dafd0af21f1c7d18115d3)
pub fn load(fname: CString) -> Result<sys::CUmodule, DriverError> {
let fname_ptr = fname.as_c_str().as_ptr();
let mut module = MaybeUninit::uninit();
unsafe {
sys::cuModuleLoad(module.as_mut_ptr(), fname_ptr).result()?;
Ok(module.assume_init())
}
}
/// Load a module's data:
///
/// > The pointer may be obtained by mapping a cubin or PTX or fatbin file,
/// > passing a cubin or PTX or fatbin file as a NULL-terminated text string,
/// > or incorporating a cubin or fatbin object into the executable resources
/// > and using operating system calls such as Windows FindResource() to obtain the pointer.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g04ce266ce03720f479eab76136b90c0b)
///
/// # Safety
/// The image must be a properly formed pointer to module data.
pub unsafe fn load_data(image: *const c_void) -> Result<sys::CUmodule, DriverError> {
let mut module = MaybeUninit::uninit();
sys::cuModuleLoadData(module.as_mut_ptr(), image).result()?;
Ok(module.assume_init())
}
/// Returns a function handle from the given module.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1ga52be009b0d4045811b30c965e1cb2cf)
///
/// # Safety
/// `module` must be a properly allocated and not freed module.
pub unsafe fn get_function(
module: sys::CUmodule,
name: CString,
) -> Result<sys::CUfunction, DriverError> {
let name_ptr = name.as_c_str().as_ptr();
let mut func = MaybeUninit::uninit();
sys::cuModuleGetFunction(func.as_mut_ptr(), module, name_ptr).result()?;
Ok(func.assume_init())
}
/// Unloads a module.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b)
///
/// # Safety
/// `module` must not have been unloaded already.
pub unsafe fn unload(module: sys::CUmodule) -> Result<(), DriverError> {
sys::cuModuleUnload(module).result()
}
}
pub mod event {
use super::{sys, DriverError};
use std::mem::MaybeUninit;
/// Creates an event.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db)
pub fn create(flags: sys::CUevent_flags) -> Result<sys::CUevent, DriverError> {
let mut event = MaybeUninit::uninit();
unsafe {
sys::cuEventCreate(event.as_mut_ptr(), flags as u32).result()?;
Ok(event.assume_init())
}
}
/// Records an event.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1)
///
/// # Safety
/// This function is unsafe because both `event` and `stream` must be valid:
/// created by their respective `create` functions and not already destroyed.
pub unsafe fn record(event: sys::CUevent, stream: sys::CUstream) -> Result<(), DriverError> {
unsafe { sys::cuEventRecord(event, stream).result() }
}
/// Computes the elapsed time (in milliseconds) between two events.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97)
/// # Safety
/// 1. Events must have been created by [create]
/// 2. They should be on the same stream
/// 3. They must not have been destroyed.
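///
/// A hedged sketch of timing work on a stream (assuming `stream` came from
/// [super::stream::create] and kernels are enqueued between the two records):
/// ```ignore
/// let start = create(sys::CUevent_flags::CU_EVENT_DEFAULT)?;
/// let end = create(sys::CUevent_flags::CU_EVENT_DEFAULT)?;
/// unsafe { record(start, stream) }?;
/// // ... enqueue kernels on `stream` ...
/// unsafe { record(end, stream) }?;
/// unsafe { super::stream::synchronize(stream) }?;
/// let ms = unsafe { elapsed(start, end) }?;
/// ```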
pub unsafe fn elapsed(start: sys::CUevent, end: sys::CUevent) -> Result<f32, DriverError> {
let mut ms: f32 = 0.0;
unsafe {
sys::cuEventElapsedTime((&mut ms) as *mut _, start, end).result()?;
}
Ok(ms)
}
/// Destroys an event.
///
/// > An event may be destroyed before it is complete (i.e., while cuEventQuery() would return CUDA_ERROR_NOT_READY).
/// > In this case, the call does not block on completion of the event,
/// > and any associated resources will automatically be released asynchronously at completion.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef)
///
/// # Safety
/// 1. Event must not have been freed already
pub unsafe fn destroy(event: sys::CUevent) -> Result<(), DriverError> {
sys::cuEventDestroy_v2(event).result()
}
}
/// Launches a cuda function.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15)
///
/// # Safety
/// This method is **very unsafe**.
///
/// 1. The cuda function must be a valid handle returned from a non-unloaded module.
/// 2. This is asynchronous, so the results of calling this function happen
/// at a later point after this function returns.
/// 3. All parameters used for this kernel should have been allocated by stream (I think?)
/// 4. The cuda kernel has mutable access to every parameter, that means every parameter
/// can change at a later point after calling this function. *Even non-mutable references*.
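///
/// A hedged sketch of packing `kernel_params` (assuming `f`, `stream`, and a device
/// pointer `dptr` allocated by this module; each param slot is a pointer to the argument):
/// ```ignore
/// let mut params = [(&mut dptr) as *mut sys::CUdeviceptr as *mut c_void];
/// unsafe { launch_kernel(f, (1, 1, 1), (32, 1, 1), 0, stream, &mut params) }?;
/// ```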
#[inline]
pub unsafe fn launch_kernel(
f: sys::CUfunction,
grid_dim: (c_uint, c_uint, c_uint),
block_dim: (c_uint, c_uint, c_uint),
shared_mem_bytes: c_uint,
stream: sys::CUstream,
kernel_params: &mut [*mut c_void],
) -> Result<(), DriverError> {
sys::cuLaunchKernel(
f,
grid_dim.0,
grid_dim.1,
grid_dim.2,
block_dim.0,
block_dim.1,
block_dim.2,
shared_mem_bytes,
stream,
kernel_params.as_mut_ptr(),
std::ptr::null_mut(),
)
.result()
}
pub mod external_memory {
use std::mem::MaybeUninit;
use super::{sys, DriverError};
/// Imports an external memory object, in this case an OpaqueFd.
///
/// The memory should be destroyed using [`destroy_external_memory`].
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html#group__CUDA__EXTRES__INTEROP_1g52aba3a7f780157d8ba12972b2481735)
///
/// # Safety
/// `size` must be the size of the memory object in bytes.
#[cfg(unix)]
pub unsafe fn import_external_memory_opaque_fd(
fd: std::os::fd::RawFd,
size: u64,
) -> Result<sys::CUexternalMemory, DriverError> {
let mut external_memory = MaybeUninit::uninit();
let handle_description = sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
type_: sys::CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
handle: sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 { fd },
size,
..Default::default()
};
sys::cuImportExternalMemory(external_memory.as_mut_ptr(), &handle_description).result()?;
Ok(external_memory.assume_init())
}
/// Imports an external memory object, in this case an OpaqueWin32 handle.
///
/// The memory should be destroyed using [`destroy_external_memory`].
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html#group__CUDA__EXTRES__INTEROP_1g52aba3a7f780157d8ba12972b2481735)
///
/// # Safety
/// `size` must be the size of the memory object in bytes.
#[cfg(windows)]
pub unsafe fn import_external_memory_opaque_win32(
handle: std::os::windows::io::RawHandle,
size: u64,
) -> Result<sys::CUexternalMemory, DriverError> {
let mut external_memory = MaybeUninit::uninit();
let handle_description = sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
type_: sys::CUexternalMemoryHandleType::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32,
handle: sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1 {
win32: sys::CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st__bindgen_ty_1__bindgen_ty_1 {
handle,
name: std::ptr::null(),
},
},
size,
..Default::default()
};
sys::cuImportExternalMemory(external_memory.as_mut_ptr(), &handle_description).result()?;
Ok(external_memory.assume_init())
}
/// Destroys an external memory object.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html#group__CUDA__EXTRES__INTEROP_1g1b586dda86565617e7e0883b956c7052)
///
/// # Safety
/// 1. Any mapped buffers onto this object must already be freed.
/// 2. The external memory must only be destroyed once.
pub unsafe fn destroy_external_memory(
external_memory: sys::CUexternalMemory,
) -> Result<(), DriverError> {
sys::cuDestroyExternalMemory(external_memory).result()
}
/// Maps a buffer onto an imported memory object.
///
/// The buffer must be freed using [`memory_free`](super::memory_free).
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html#group__CUDA__EXTRES__INTEROP_1gb9fec33920400c70961b4e33d838da91)
///
/// # Safety
/// Mapped buffers may overlap.
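///
/// A hedged sketch (assuming `ext_mem` came from one of the import functions above
/// and `size` matches the imported object):
/// ```ignore
/// let dptr = unsafe { get_mapped_buffer(ext_mem, 0, size) }?;
/// // ... use `dptr` on the device ...
/// unsafe { super::memory_free(dptr) }?;
/// unsafe { destroy_external_memory(ext_mem) }?;
/// ```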
pub unsafe fn get_mapped_buffer(
external_memory: sys::CUexternalMemory,
offset: u64,
size: u64,
) -> Result<sys::CUdeviceptr, DriverError> {
let mut device_ptr = MaybeUninit::uninit();
let buffer_description = sys::CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
offset,
size,
..Default::default()
};
sys::cuExternalMemoryGetMappedBuffer(
device_ptr.as_mut_ptr(),
external_memory,
&buffer_description,
)
.result()?;
Ok(device_ptr.assume_init())
}
}