Commit 6372815e authored by rusty1s

first atomic max impl

parent b3091036
@@ -37,8 +37,7 @@ def test_scatter_max(str):
    assert input.grad.data.tolist() == expected_grad_input


@pytest.mark.parametrize('str', tensor_strs)
def test_scatter_cuda_max(str):
    input = [[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]]
    index = [[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]]
...
template <typename T, size_t n>
struct AtomicMaxIntegerImpl;

template<typename T>
struct AtomicMaxIntegerImpl<T, 1> {
  inline __device__ void operator()(T *address, T val) {
    // Operate on the aligned 32-bit word that contains the target byte.
    uint32_t *address_as_ui = (uint32_t *) ((char *)address - ((size_t)address & 3));
    uint32_t old = *address_as_ui;
    uint32_t shift = ((size_t)address & 3) * 8;
    uint32_t newval;
    uint32_t assumed;

    do {
      assumed = old;
      // Take the max against the current byte, then splice the result back
      // into the word without touching the neighbouring bytes.
      newval = max(val, T((old >> shift) & 0xff)) & 0xff;
      newval = (old & ~(0x000000ff << shift)) | (newval << shift);
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};
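// Illustrative check, not part of this commit: the byte-splice math from the
// 1-byte specialization above, evaluated on the host. For an address with
// ((size_t)address & 3) == 2, shift == 16, so the target byte occupies bits
// 16..23 of the aligned 32-bit word and every other byte must survive.
#include <cassert>
#include <cstdint>
inline void byteSpliceCheck() {
  uint32_t old = 0xAA07CCDDu;               // byte 0x07 sits at offset 2
  uint32_t shift = 2 * 8;
  uint8_t cur = (old >> shift) & 0xff;      // == 0x07
  uint8_t val = 0x30;                       // max(0x30, 0x07) == 0x30
  uint32_t spliced = val > cur ? val : cur;
  uint32_t newval = (old & ~(0x000000ffu << shift)) | (spliced << shift);
  assert(newval == 0xAA30CCDDu);            // only the addressed byte changed
}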
template<typename T>
struct AtomicMaxIntegerImpl<T, 2> {
  inline __device__ void operator()(T *address, T val) {
    // Operate on the aligned 32-bit word that contains the target halfword.
    uint32_t *address_as_ui = (uint32_t *) ((char *)address - ((size_t)address & 2));
    bool is_upper = ((size_t)address & 2) != 0;
    uint32_t old = *address_as_ui;
    uint32_t newval;
    uint32_t assumed;

    do {
      assumed = old;
      // Take the max against the current halfword, then splice it back in.
      newval = max(val, is_upper ? T(old >> 16) : T(old & 0xffff)) & 0xffff;
      newval = is_upper ? (old & 0xffff) | (newval << 16)
                        : (old & 0xffff0000) | newval;
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};
template<typename T>
struct AtomicMaxIntegerImpl<T, 4> {
  inline __device__ void operator()(T *address, T val) {
    uint32_t *address_as_ui = (uint32_t *) address;
    uint32_t old = *address_as_ui;
    uint32_t newval;
    uint32_t assumed;

    do {
      assumed = old;
      newval = max(val, (T)old);
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};
template<typename T>
struct AtomicMaxIntegerImpl<T, 8> {
  inline __device__ void operator()(T *address, T val) {
    unsigned long long *address_as_ull = (unsigned long long *) address;
    unsigned long long old = *address_as_ull;
    unsigned long long newval;
    unsigned long long assumed;

    do {
      assumed = old;
      newval = max(val, (T)old);
      old = atomicCAS(address_as_ull, assumed, newval);
    } while (assumed != old);
  }
};
static inline __device__ void atomicMax(uint8_t *address, uint8_t val) { AtomicMaxIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val); }
static inline __device__ void atomicMax(int8_t *address, int8_t val) { AtomicMaxIntegerImpl<int8_t, sizeof(int8_t)>()(address, val); }
static inline __device__ void atomicMax(int16_t *address, int16_t val) { AtomicMaxIntegerImpl<int16_t, sizeof(int16_t)>()(address, val); }
static inline __device__ void atomicMax(int64_t *address, int64_t val) { AtomicMaxIntegerImpl<int64_t, sizeof(int64_t)>()(address, val); }
#ifdef CUDA_HALF_TENSOR
static inline __device__ void atomicMax(half *address, half val) { /* TODO: not implemented for half yet */ }
#endif
static inline __device__ void atomicMax(float *address, float val) {
  // Reinterpret the float bits as an int so atomicCAS can be used; the max
  // itself is still computed on float values.
  int *address_as_i = (int *) address;
  int old = *address_as_i;
  int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_i, assumed,
                    __float_as_int(max(val, __int_as_float(assumed))));
  } while (assumed != old);
}
static inline __device__ void atomicMax(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *) address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(max(val, __longlong_as_double(assumed))));
  } while (assumed != old);
}
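// Minimal sketch, not part of this commit: a kernel exercising the CAS-based
// float overload above. Each thread folds one element of `in` into a single
// output cell; the kernel name and launch shape are illustrative only.
__global__ void maxReduceSketch(const float *in, float *out, int n) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    atomicMax(out, in[i]);  // resolves to atomicMax(float *, float) above
  }
}
// Usage, assuming *out was pre-initialized to -FLT_MAX (the identity for max):
//   maxReduceSketch<<<GET_BLOCKS(n), NUM_THREADS>>>(d_in, d_out, n);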
const int MAX_DIMS = 25;
const int NUM_THREADS = 1024;

@@ -23,3 +20,18 @@ struct TensorInfo {
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};

#define KERNEL_LOOP(I, N) \
  for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
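// Worked example of the grid-stride pattern above (not part of this commit):
// with NUM_THREADS == 1024, thread t of block b first handles element
// b * 1024 + t and then jumps by blockDim.x * gridDim.x until it passes N,
// so every element is covered exactly once regardless of the grid size.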
/* #define KERNEL_RUN(NAME, DIMS, N, PARAMS) \ */
#define KERNEL_RUN(NAME, DIMS, N, ...) \
  do { \
    int grid = GET_BLOCKS(N); \
    cudaStream_t stream = THCState_getCurrentStream(state); \
    switch (DIMS) { \
      case 1: NAME<real, 1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
      case 2: NAME<real, 2><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
      case 3: NAME<real, 3><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
      default: NAME<real, -1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    } \
    THCudaCheck(cudaGetLastError()); \
  } while (0)
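// For reference, not part of this commit: with DIMS == 2, the KERNEL_RUN call
// used in scatter_(max) below expands (modulo the do/while wrapper) to
//
//   int grid = GET_BLOCKS(n);
//   cudaStream_t stream = THCState_getCurrentStream(state);
//   maxKernel<real, 2><<<grid, NUM_THREADS, 0, stream>>>(
//       outputInfo, indexInfo, inputInfo, argInfo, dim, n);
//   THCudaCheck(cudaGetLastError());
//
// Dispatching on DIMS picks a template instantiation whose per-dimension
// offset loop the compiler can unroll; -1 is the generic fallback.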
@@ -24,9 +24,13 @@ void scatter_(max)(THCState *state, int dim, THCTensor *output, THCudaLongTensor *index, THCTensor *input, THCudaLongTensor *arg_output) {
  TensorInfo<real> outputInfo = thc_(getTensorInfo)(state, output);
  TensorInfo<int64_t> indexInfo = thc_getTensorInfo_Long(state, index);
  TensorInfo<real> inputInfo = thc_(getTensorInfo)(state, input);
  TensorInfo<int64_t> argInfo = thc_getTensorInfo_Long(state, arg_output);
  KERNEL_RUN(maxKernel, indexInfo.dims, n, outputInfo, indexInfo, inputInfo, argInfo, dim);
  /* KERNEL_RUN(argKernel, indexInfo.dims, n, outputInfo, indexInfo, dim) */
  /* maxKernel<real, -1><<<GET_BLOCKS(n), NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(outputInfo, indexInfo, inputInfo, dim, n); */
  /* argKernel<real, -1><<<GET_BLOCKS(n), NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(dim, n); */
}

void scatter_(min)(THCState *state, int dim, THCTensor *output, THCudaLongTensor *index, THCTensor *input, THCudaLongTensor *arg_output) {
...
#include <THC/THC.h>
#include "THCAtomics.cuh"
#include "kernel.h"
#include "common.cuh"

@@ -13,9 +14,27 @@
#include "THCGenerateAllTypes.h"
template <typename Real, int Dims>
__global__ void maxKernel(TensorInfo<Real> output, TensorInfo<int64_t> index, TensorInfo<Real> input, TensorInfo<int64_t> arg, const int dim, const int n) {
  KERNEL_LOOP(i, n) {
    int outputOffset = 0; int indexOffset = 0; int inputOffset = 0; int argOffset = 0;
    int curDimIndex;
    // Decompose a copy of the linear id: dividing `i` itself would corrupt
    // the grid-stride increment of KERNEL_LOOP.
    int linearId = i;
    for (int d = index.dims - 1; d >= 0; d--) {
      curDimIndex = linearId % index.size[d];
      indexOffset += curDimIndex * index.stride[d];
      inputOffset += curDimIndex * input.stride[d];
      if (d != dim) {
        outputOffset += curDimIndex * output.stride[d];
        argOffset += curDimIndex * arg.stride[d];
      }
      linearId /= index.size[d];
    }
    int64_t indexValue = index.data[indexOffset];
    assert(indexValue >= 0 && indexValue < output.size[dim]);
    outputOffset += indexValue * output.stride[dim];
    argOffset += indexValue * arg.stride[dim];
    atomicMax(&output.data[outputOffset], input.data[inputOffset]);
    // TODO: Do something with arg.
  }
}
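// Hedged sketch, not part of this commit: the commented-out argKernel calls in
// scatter_(max) suggest a second pass that fills `arg` (the TODO above). One
// plausible shape (the name argKernelSketch and this signature are assumptions;
// the commit's commented calls pass fewer arguments) is to recompute the
// offsets and record the position along `dim` of any element that matches the
// reduced maximum:
template <typename Real, int Dims>
__global__ void argKernelSketch(TensorInfo<Real> output, TensorInfo<int64_t> index, TensorInfo<Real> input, TensorInfo<int64_t> arg, const int dim, const int n) {
  KERNEL_LOOP(i, n) {
    int outputOffset = 0; int indexOffset = 0; int inputOffset = 0; int argOffset = 0;
    int dimIndex = 0;
    int linearId = i;
    for (int d = index.dims - 1; d >= 0; d--) {
      int curDimIndex = linearId % index.size[d];
      indexOffset += curDimIndex * index.stride[d];
      inputOffset += curDimIndex * input.stride[d];
      if (d == dim) {
        dimIndex = curDimIndex;  // position along the scatter dimension
      } else {
        outputOffset += curDimIndex * output.stride[d];
        argOffset += curDimIndex * arg.stride[d];
      }
      linearId /= index.size[d];
    }
    int64_t indexValue = index.data[indexOffset];
    outputOffset += indexValue * output.stride[dim];
    argOffset += indexValue * arg.stride[dim];
    // Ties race, but any element equal to the reduced maximum is a valid argmax.
    if (input.data[inputOffset] == output.data[outputOffset])
      arg.data[argOffset] = dimIndex;
  }
}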
...