// Maximum tensor rank supported by TensorInfo / DIMS-templated kernels.
const int MAX_DIMS = 25;
// Threads per block used by every launch issued through KERNEL_RUN.
const int NUM_THREADS = 1024;

// Ceil-divide: number of blocks needed so that grid*NUM_THREADS covers n elements.
inline int GET_BLOCKS(const int n) {
  return (n + NUM_THREADS - 1) / NUM_THREADS;
}

// Flattened tensor descriptor passed by value to kernels: raw data pointer plus
// per-dimension size/stride copied into fixed-size arrays (so the whole struct
// can live in kernel parameter space — no device allocation needed).
template <typename T>
struct TensorInfo {
  // Copies the first `d` entries of sz/st; entries beyond `dims` are left
  // uninitialized and must not be read.
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) {
    data = t;
    dims = d;
    for (int i = 0; i < dims; i++) {
      size[i] = sz[i];
      stride[i] = st[i];
    }
  }
  T *data;
  int dims;
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};

// Grid-stride loop over N elements. The loop variable named by I is declared,
// tested, AND incremented here — the original macro incremented a hard-coded
// lowercase `i`, which broke (or silently captured an outer `i`) whenever the
// caller passed any other name.
#define KERNEL_LOOP(I, N)                                        \
  for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < (N);   \
       I += blockDim.x * gridDim.x)

// Launches NAME<DIMS> over N elements on the current THC stream, dispatching
// the runtime dims value to a compile-time template argument (1/2/3, with a
// dynamic-rank fallback for anything larger), then checks for launch errors.
// Expects a `THCState *state` in scope at the expansion site.
// Wrapped in do/while(0) so it behaves as a single statement after `if`.
// NOTE(review): the <DIMS> template arguments and the <<<...>>> launch
// configuration were stripped from the original text (escaped angle brackets);
// reconstructed here in the standard THC pattern — confirm the fallback
// template argument (-1 = dynamic rank) against the kernel definitions.
#define KERNEL_RUN(NAME, DIMS, N, ...)                                          \
  do {                                                                          \
    int grid = GET_BLOCKS(N);                                                   \
    cudaStream_t stream = THCState_getCurrentStream(state);                     \
    switch (DIMS) {                                                             \
      case 1:                                                                   \
        NAME<1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N);              \
        break;                                                                  \
      case 2:                                                                   \
        NAME<2><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N);              \
        break;                                                                  \
      case 3:                                                                   \
        NAME<3><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N);              \
        break;                                                                  \
      default:                                                                  \
        NAME<-1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N);             \
        break;                                                                  \
    }                                                                           \
    THCudaCheck(cudaGetLastError());                                            \
  } while (0)

// Device-side equality, overloaded per element type so templated kernels can
// compare values uniformly.
static inline __device__ bool eq(uint8_t a, uint8_t b) { return a == b; }
static inline __device__ bool eq( int8_t a,  int8_t b) { return a == b; }
static inline __device__ bool eq(int16_t a, int16_t b) { return a == b; }
static inline __device__ bool eq(int32_t a, int32_t b) { return a == b; }
static inline __device__ bool eq(int64_t a, int64_t b) { return a == b; }
static inline __device__ bool eq(  float a,   float b) { return a == b; }
static inline __device__ bool eq( double a,  double b) { return a == b; }
// half has no native operator== on older architectures; compare via float.
static inline __device__ bool eq(half a, half b) {
  return __half2float(a) == __half2float(b);
}