Commit aaa5a410 authored by rusty1s

integer atomic

parent cc561ac4
+#define OP(X, Y) max(X, Y)
template <typename T, size_t n>
-struct AtomicMaxIntegerImpl;
+struct AtomicIntegerImpl;
template<typename T>
-struct AtomicMaxIntegerImpl<T, 1> {
+struct AtomicIntegerImpl<T, 1> {
  inline __device__ void operator()(T *address, T val) {
    uint32_t * address_as_ui = (uint32_t *) (address - ((size_t) address & 3));
    uint32_t old = *address_as_ui;
    uint32_t shift = (((size_t) address & 3) * 8);
-    uint32_t sum;
+    uint32_t res;
    uint32_t assumed;
    do {
      assumed = old;
-      sum = max(val, T((old >> shift) & 0xff));
-      old = (old & ~(0x000000ff << shift)) | (sum << shift);
+      res = OP(val, T((old >> shift) & 0xff));
+      old = (old & ~(0x000000ff << shift)) | (res << shift);
      old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
  }
};
template<typename T>
-struct AtomicMaxIntegerImpl<T, 2> {
+struct AtomicIntegerImpl<T, 2> {
  inline __device__ void operator()(T *address, T val) {
    uint32_t * address_as_ui = (uint32_t *) ((char *) address - ((size_t) address & 2));
    uint32_t old = *address_as_ui;
-    uint32_t sum;
+    uint32_t res;
    uint32_t newval;
    uint32_t assumed;
    do {
      assumed = old;
-      sum = max(val, (size_t)address & 2 ? T(old >> 16) : T(old & 0xffff));
-      newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16) : (old & 0xffff0000) | sum;
+      res = OP(val, (size_t) address & 2 ? T(old >> 16) : T(old & 0xffff));
+      newval = (size_t) address & 2 ? (old & 0xffff) | (res << 16) : (old & 0xffff0000) | res;
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};
template<typename T>
-struct AtomicMaxIntegerImpl<T, 8> {
+struct AtomicIntegerImpl<T, 4> {
+  inline __device__ void operator()(T *address, T val) {
+    uint32_t *address_as_ull = (uint32_t *) (address);
+    uint32_t old = *address_as_ull;
+    uint32_t assumed;
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ull, assumed, OP(val, (T) old));
+    } while (assumed != old);
+  }
+};
+template<typename T>
+struct AtomicIntegerImpl<T, 8> {
  inline __device__ void operator()(T *address, T val) {
    unsigned long long *address_as_ull = (unsigned long long *) (address);
    unsigned long long old = *address_as_ull;
@@ -46,25 +62,25 @@ struct AtomicMaxIntegerImpl<T, 8> {
    do {
      assumed = old;
-      old = atomicCAS(address_as_ull, assumed, max(val, (T) old));
+      old = atomicCAS(address_as_ull, assumed, OP(val, (T) old));
    } while (assumed != old);
  }
};
static inline __device__ void atomicMax(uint8_t *address, uint8_t val) {
-  AtomicMaxIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
+  AtomicIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void atomicMax(int8_t *address, int8_t val) {
-  AtomicMaxIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
+  AtomicIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void atomicMax(int16_t *address, int16_t val) {
-  AtomicMaxIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
+  AtomicIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ void atomicMax(int64_t *address, int64_t val) {
-  AtomicMaxIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
+  AtomicIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
#ifdef CUDA_HALF_TENSOR
@@ -78,7 +94,7 @@ static inline __device__ void atomicMax(float *address, float val) {
  do {
    assumed = old;
-    old = atomicCAS(address_as_i, assumed, __float_as_int(max(val, __int_as_float(assumed))));
+    old = atomicCAS(address_as_i, assumed, __float_as_int(OP(val, __int_as_float(assumed))));
  } while (assumed != old);
}
@@ -89,6 +105,6 @@ static inline __device__ void atomicMax(double *address, double val) {
  do {
    assumed = old;
-    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(max(val, __longlong_as_double(assumed))));
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(OP(val, __longlong_as_double(assumed))));
  } while (assumed != old);
}
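The substance of this change: the atomicCAS retry loops no longer hard-code max. The combine step goes through the OP(X, Y) macro, and the renamed AtomicIntegerImpl specializations (1, 2, 4 and 8 bytes) differ only in how they locate the operand inside a word that atomicCAS can address. Below is a standalone sketch of the same sub-word technique for an 8-bit atomic max; it is not part of the commit, and atomic_max_u8, fold_max and the test data are made-up names used only for illustration.

// Standalone sketch of a byte-wise atomic max (hypothetical file, not from the commit).
#include <cstdint>
#include <cstdio>

// Combine step kept behind a macro, mirroring OP(X, Y) above.
#define OP(X, Y) max(X, Y)

// Byte-wise atomic max built on a 32-bit atomicCAS of the enclosing aligned word.
static inline __device__ void atomic_max_u8(uint8_t *address, uint8_t val) {
  uint32_t *word = (uint32_t *) ((size_t) address & ~size_t(3));  // round down to 4-byte alignment
  uint32_t shift = ((size_t) address & 3) * 8;                    // bit offset of the target byte
  uint32_t old = *word, assumed, res;
  do {
    assumed = old;
    res = OP((uint32_t) val, (old >> shift) & 0xffu);              // combine with the current byte
    uint32_t desired = (old & ~(0xffu << shift)) | (res << shift); // splice the new byte back in
    old = atomicCAS(word, assumed, desired);                       // retry if another thread intervened
  } while (assumed != old);
}

// Every thread folds its input value into a single result byte.
__global__ void fold_max(const uint8_t *in, uint8_t *out, const int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomic_max_u8(out, in[i]);
}

int main() {
  uint8_t h_in[256], h_out = 0, *d_in, *d_out;
  for (int i = 0; i < 256; i++) h_in[i] = (uint8_t) ((i * 37) & 0xff);  // values 0..255 in scrambled order
  cudaMalloc(&d_in, 256);
  cudaMalloc(&d_out, 4);  // allocate a full word so the CAS never touches foreign memory
  cudaMemcpy(d_in, h_in, 256, cudaMemcpyHostToDevice);
  cudaMemcpy(d_out, &h_out, 1, cudaMemcpyHostToDevice);
  fold_max<<<1, 256>>>(d_in, d_out, 256);
  cudaMemcpy(&h_out, d_out, 1, cudaMemcpyDeviceToHost);
  printf("max = %d\n", (int) h_out);  // expected: 255
  return 0;
}

The 16-bit specialization works the same way with a half-word mask, while the 32- and 64-bit cases need no masking because atomicCAS handles those widths directly.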
@@ -5,7 +5,7 @@ inline int GET_BLOCKS(const int n) {
  return (n + NUM_THREADS - 1) / NUM_THREADS;
}
-template <typename T>
+template<typename T>
struct TensorInfo {
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) {
    data = t; dims = d;
...
@@ -13,7 +13,7 @@
#include "generic/common.cu"
#include "THCGenerateAllTypes.h"
-template <typename Real, int Dims>
+template<typename Real, int Dims>
__global__ void maxKernel(TensorInfo<Real> output, TensorInfo<int64_t> index, TensorInfo<Real> input, TensorInfo<int64_t> arg, const int dim, const int n) {
  KERNEL_LOOP(i, n) {
    int outputOffset = 0; int indexOffset = 0; int inputOffset = 0; int argOffset = 0;
...
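The kernels touched here iterate with a KERNEL_LOOP(i, n) macro whose definition is not part of this diff, and are launched with GET_BLOCKS(n) blocks of NUM_THREADS threads. A common pairing for such a macro is a grid-stride loop; the sketch below assumes that expansion and a placeholder NUM_THREADS of 1024, so it illustrates the launch pattern rather than the repository's exact definitions.

// Grid-stride sketch; KERNEL_LOOP and NUM_THREADS here are assumptions, not the repository's definitions.
#include <cstdio>

const int NUM_THREADS = 1024;  // placeholder block size

inline int GET_BLOCKS(const int n) {
  return (n + NUM_THREADS - 1) / NUM_THREADS;  // ceil(n / NUM_THREADS)
}

// Assumed expansion: each thread strides over the index range by the total thread count.
#define KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

__global__ void fill_iota(int *out, const int n) {
  KERNEL_LOOP(i, n) {
    out[i] = i;
  }
}

int main() {
  const int n = 5000;
  int *d_out, last = -1;
  cudaMalloc(&d_out, n * sizeof(int));
  fill_iota<<<GET_BLOCKS(n), NUM_THREADS>>>(d_out, n);
  cudaMemcpy(&last, d_out + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
  printf("out[%d] = %d\n", n - 1, last);  // expected: 4999
  cudaFree(d_out);
  return 0;
}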