Commit 5e6f3394 authored by rusty1s's avatar rusty1s
Browse files

added spline basis cuda impl

parent e1fcf1d2
...@@ -7,4 +7,4 @@ SRC_DIR=torch_spline_conv/kernel ...@@ -7,4 +7,4 @@ SRC_DIR=torch_spline_conv/kernel
BUILD_DIR=torch_spline_conv/build BUILD_DIR=torch_spline_conv/build
mkdir -p "$BUILD_DIR" mkdir -p "$BUILD_DIR"
$(which nvcc) -c -o "$BUILD_DIR/kernel.so" "$SRC_DIR/kernel.cu" -arch=sm_35 -Xcompiler -fPIC -shared "-I$1/lib/include/TH" "-I$1/lib/include/THC" "-I$SRC_DIR" $(which nvcc) -c -o "$BUILD_DIR/kernel.so" "$SRC_DIR/kernel.cu" -arch=sm_35 -Xcompiler -fPIC -shared "-I$TORCH/lib/include/TH" "-I$TORCH/lib/include/THC" "-I$SRC_DIR"
...@@ -22,7 +22,7 @@ def test_spline_basis_cpu(tensor, i): ...@@ -22,7 +22,7 @@ def test_spline_basis_cpu(tensor, i):
is_open_spline = torch.ByteTensor(data[i]['is_open_spline']) is_open_spline = torch.ByteTensor(data[i]['is_open_spline'])
K = kernel_size.prod() K = kernel_size.prod()
expected_basis = Tensor(tensor, data[i]['expected_basis']) expected_basis = Tensor(tensor, data[i]['expected_basis'])
expected_index = torch.ByteTensor(data[i]['expected_index']) expected_index = torch.LongTensor(data[i]['expected_index'])
basis, index = spline_basis_forward(degree, pseudo, kernel_size, basis, index = spline_basis_forward(degree, pseudo, kernel_size,
is_open_spline, K) is_open_spline, K)
...@@ -41,6 +41,13 @@ def test_spline_basis_gpu(tensor, i): ...@@ -41,6 +41,13 @@ def test_spline_basis_gpu(tensor, i):
kernel_size = torch.cuda.LongTensor(data[i]['kernel_size']) kernel_size = torch.cuda.LongTensor(data[i]['kernel_size'])
is_open_spline = torch.cuda.ByteTensor(data[i]['is_open_spline']) is_open_spline = torch.cuda.ByteTensor(data[i]['is_open_spline'])
K = kernel_size.prod() K = kernel_size.prod()
expected_basis = Tensor(tensor, data[i]['expected_basis'])
basis, index = spline_basis_forward(degree, pseudo, kernel_size, expected_index = torch.LongTensor(data[i]['expected_index'])
is_open_spline, K)
if i == 0:
basis, index = spline_basis_forward(degree, pseudo, kernel_size,
is_open_spline, K)
basis, index = basis.cpu(), index.cpu()
basis = [pytest.approx(x, 0.01) for x in basis.view(-1).tolist()]
assert basis == expected_basis.view(-1).tolist()
assert index.tolist() == expected_index.tolist()
// Host-side dispatch macro for the spline-basis forward pass.
// Expands inside a THC generic function where `state`, `real`, `THCTensor_`
// and `thc_(...)` are bound to the current scalar type.
//   NAME           - kernel template to launch (linear/quadratic/cubicBasisForwardKernel)
//   basis          - output tensor of basis values, one entry per (edge, product) pair
//   weight_index   - output tensor (int64) of flattened kernel weight indices
//   pseudo         - input pseudo-coordinates, indexed as [edge][dim]
//   kernel_size    - int64 device array, kernel size per dimension
//   is_open_spline - uint8 device array, open/closed flag per dimension
//   K              - total number of kernel weights (product of kernel sizes)
// Verifies all three input tensors live on the same GPU, flattens them into
// TensorInfo views, and launches NAME over every element of `basis` via
// KERNEL_RUN, dispatching on the dimensionality pseudoInfo.size[1].
#define SPLINE_BASIS_FORWARD(NAME, basis, weight_index, pseudo, kernel_size, is_open_spline, K) { \
THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, pseudo, kernel_size, is_open_spline)); \
\
const int n = THCTensor_(nElement)(state, basis); \
TensorInfo<real> basisInfo = thc_(getTensorInfo)(state, basis); \
TensorInfo<int64_t> weightIndexInfo = thc_getTensorInfo_Long(state, weight_index); \
TensorInfo<real> pseudoInfo = thc_(getTensorInfo)(state, pseudo); \
int64_t *kernelSizeData = THCudaLongTensor_data(state, kernel_size); \
uint8_t *isOpenSplineData = THCudaByteTensor_data(state, is_open_spline); \
\
KERNEL_RUN(NAME, pseudoInfo.size[1], n, basisInfo, weightIndexInfo, pseudoInfo, kernelSizeData, isOpenSplineData, K) \
}
// Per-element spline basis computation, with the number of dimensions D known
// at compile time (the D = -1 specialization below handles the runtime case).
// Template parameters: Real = scalar type, M = spline degree, D = #dimensions.
// Each thread index i identifies one (edge, basis-product) pair; the method
// writes one basis value and one flattened weight index.
template<typename Real, int M, int D>
struct SplineBasisForward {
static __device__ void compute(int i, const TensorInfo<Real>& basis, const TensorInfo<int64_t>& weightIndex, const TensorInfo<Real>& pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K) {
// k enumerates the basis products for this edge; basis.size[1] = (M+1)^D.
int64_t k = i % basis.size[1];
// Offset of this edge's pseudo-coordinate row (row index wraps modulo the
// number of rows in `pseudo`).
int64_t pseudoOffset = ((i / basis.size[1]) % pseudo.size[0]) * pseudo.stride[0];
int64_t d, k_mod, wi = 0, offset = K; Real b = 1, value;
for (d = 0; d < D; d++) {
// Stride of dimension d in the flattened weight index (row-major over
// kernelSize, starting from the full product K).
offset /= kernelSize[d];
// Extract this dimension's digit of k in base (M+1).
k_mod = k % (M + 1);
k /= M + 1;
// Scale the pseudo-coordinate into knot space; open splines have
// M fewer usable intervals than closed ones.
value = pseudo.data[pseudoOffset + d * pseudo.stride[1]] * (kernelSize[d] - M * isOpenSpline[d]);
// Accumulate the flattened index of the contributing kernel weight.
wi += (((int64_t) value + k_mod) % kernelSize[d]) * offset;
// Fractional part of the coordinate within its knot interval.
value -= floor(value);
// NOTE(review): this closed form equals (1 - value) for k_mod = 0 and
// value for k_mod = 1, i.e. the degree-1 (linear) basis. The kernels
// below also instantiate M = 2 and M = 3 with this same formula —
// confirm higher-degree bases are intended here.
value = 1 - value - k_mod + 2 * value * k_mod;
// Multiply the per-dimension factors into the tensor-product basis.
b *= value;
}
basis.data[i] = b;
weightIndex.data[i] = wi;
}
};
// Fallback specialization for D = -1: dimensionality is not known at compile
// time, so the loop bound is read from pseudo.size[1] at runtime instead of
// being a compile-time constant (used by KERNEL_RUN's `default` case for
// dimensionalities above 4). The body otherwise mirrors the primary template.
template<typename Real, int M>
struct SplineBasisForward<Real, M, -1> {
static __device__ void compute(int i, const TensorInfo<Real>& basis, const TensorInfo<int64_t>& weightIndex, const TensorInfo<Real>& pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K) {
// k enumerates the basis products for this edge; basis.size[1] = (M+1)^D.
int64_t k = i % basis.size[1];
// Offset of this edge's pseudo-coordinate row.
int64_t pseudoOffset = ((i / basis.size[1]) % pseudo.size[0]) * pseudo.stride[0];
int64_t d, k_mod, wi = 0, offset = K; Real b = 1, value;
// Runtime loop bound — the only difference from the primary template.
for (d = 0; d < pseudo.size[1]; d++) {
offset /= kernelSize[d];
// This dimension's digit of k in base (M+1).
k_mod = k % (M + 1);
k /= M + 1;
// Scale pseudo-coordinate into knot space (open splines lose M intervals).
value = pseudo.data[pseudoOffset + d * pseudo.stride[1]] * (kernelSize[d] - M * isOpenSpline[d]);
// Accumulate flattened kernel-weight index.
wi += (((int64_t) value + k_mod) % kernelSize[d]) * offset;
value -= floor(value);
// NOTE(review): degree-1 closed form reused for all M — see the note on
// the primary template; verify for M = 2, 3.
value = 1 - value - k_mod + 2 * value * k_mod;
b *= value;
}
basis.data[i] = b;
weightIndex.data[i] = wi;
}
};
// Forward kernel for linear (degree M = 1) B-spline bases. One logical thread
// per entry of `basis`; `n` is the total element count of `basis`.
template<typename Real, int D>
__global__ void linearBasisForwardKernel(TensorInfo<Real> basis, TensorInfo<int64_t> weightIndex, TensorInfo<Real> pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K, int n) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    SplineBasisForward<Real, 1, D>::compute(i, basis, weightIndex, pseudo, kernelSize, isOpenSpline, K);
  }
}
// Forward kernel for quadratic (degree M = 2) B-spline bases. One logical
// thread per entry of `basis`; `n` is the total element count of `basis`.
template<typename Real, int D>
__global__ void quadraticBasisForwardKernel(TensorInfo<Real> basis, TensorInfo<int64_t> weightIndex, TensorInfo<Real> pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K, int n) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    SplineBasisForward<Real, 2, D>::compute(i, basis, weightIndex, pseudo, kernelSize, isOpenSpline, K);
  }
}
// Forward kernel for cubic (degree M = 3) B-spline bases. One logical thread
// per entry of `basis`; `n` is the total element count of `basis`.
template<typename Real, int D>
__global__ void cubicBasisForwardKernel(TensorInfo<Real> basis, TensorInfo<int64_t> weightIndex, TensorInfo<Real> pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K, int n) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    SplineBasisForward<Real, 3, D>::compute(i, basis, weightIndex, pseudo, kernelSize, isOpenSpline, K);
  }
}
// Maximum tensor rank supported by TensorInfo's fixed-size arrays.
const int MAX_DIMS = 25;
// Threads per block used by every kernel launch in this file.
const int NUM_THREADS = 1024;

// Smallest number of blocks of NUM_THREADS threads that covers n elements
// (ceiling division).
inline int GET_BLOCKS(const int n) {
  const int rounded = n + NUM_THREADS - 1;
  return rounded / NUM_THREADS;
}
// Plain, device-copyable view of a tensor: raw data pointer plus dimension
// count, sizes and strides copied into fixed-size arrays so the struct can be
// passed to kernels by value. Only the first `dims` entries of size/stride
// are initialized.
template<typename T>
struct TensorInfo {
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) : data(t), dims(d) {
    for (int dim = 0; dim < d; dim++) {
      size[dim] = sz[dim];
      stride[dim] = st[dim];
    }
  }
  T *data;
  int dims;
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};
// Grid-stride loop: declares loop variable I and iterates it over [0, N)
// across the entire grid, so any launch configuration covers all N elements.
// Fix: the increment previously used the hard-coded identifier `i` instead of
// the macro parameter `I`, so any caller naming its loop variable anything
// other than `i` would fail to compile or loop incorrectly.
#define KERNEL_LOOP(I, N) \
for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
// Launches kernel template NAME over N elements on the current THC stream,
// dispatching the dimensionality D to a compile-time template argument for
// D = 1..4 and falling back to the runtime (-1) specialization otherwise.
// `real` is the THC scalar type bound at the point of expansion; __VA_ARGS__
// are forwarded to the kernel ahead of the trailing element count N.
// Checks for launch-configuration errors via cudaGetLastError() afterwards.
#define KERNEL_RUN(NAME, D, N, ...) { \
int grid = GET_BLOCKS(N); \
cudaStream_t stream = THCState_getCurrentStream(state); \
switch (D) { \
case 1: NAME<real, 1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
case 2: NAME<real, 2><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
case 3: NAME<real, 3><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
case 4: NAME<real, 4><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
default: NAME<real, -1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
} \
THCudaCheck(cudaGetLastError()); \
}
// THC "generic file" idiom: on first inclusion THC_GENERIC_FILE is undefined,
// so this header only registers itself; the including file (see kernel.cu:
// #include "generic/common.cu" followed by "THCGenerateAllTypes.h") then
// re-includes it once per scalar type with `real`, `THCTensor_` and `thc_`
// bound to that type, compiling the #else branch each time.
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/common.cu"
#else
// Builds a TensorInfo view of `tensor` for the current scalar type `real`,
// copying its sizes and strides out of the THC tensor descriptor.
// Assumes the tensor has at most MAX_DIMS dimensions.
TensorInfo<real> thc_(getTensorInfo)(THCState *state, THCTensor *tensor) {
real *data = THCTensor_(data)(state, tensor);
int dims = THCTensor_(nDimension)(state, tensor);
int size[MAX_DIMS]; int stride[MAX_DIMS];
for (int i = 0; i < dims; i++) {
size[i] = THCTensor_(size)(state, tensor, i);
stride[i] = THCTensor_(stride)(state, tensor, i);
}
return TensorInfo<real>(data, dims, size, stride);
}
#endif
...@@ -3,15 +3,15 @@ ...@@ -3,15 +3,15 @@
#else #else
void spline_(linear_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) { void spline_(linear_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) {
printf("linear"); SPLINE_BASIS_FORWARD(linearBasisForwardKernel, basis, weight_index, pseudo, kernel_size, is_open_spline, K)
} }
void spline_(quadratic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) { void spline_(quadratic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) {
printf("quadratic"); SPLINE_BASIS_FORWARD(quadraticBasisForwardKernel, basis, weight_index, pseudo, kernel_size, is_open_spline, K)
} }
void spline_(cubic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) { void spline_(cubic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) {
printf("cubic"); SPLINE_BASIS_FORWARD(cubicBasisForwardKernel, basis, weight_index, pseudo, kernel_size, is_open_spline, K)
} }
#endif #endif
...@@ -2,7 +2,14 @@ ...@@ -2,7 +2,14 @@
#include "kernel.h" #include "kernel.h"
#include "common.cuh"
#include "THCBasisForward.cuh"
#define spline_(NAME) TH_CONCAT_4(spline_, NAME, _kernel_, Real) #define spline_(NAME) TH_CONCAT_4(spline_, NAME, _kernel_, Real)
#define thc_(NAME) TH_CONCAT_4(thc_, NAME, _, Real)
#include "generic/common.cu"
#include "THCGenerateAllTypes.h"
#include "generic/kernel.cu" #include "generic/kernel.cu"
#include "THCGenerateFloatType.h" #include "THCGenerateFloatType.h"
......
...@@ -9,23 +9,23 @@ ...@@ -9,23 +9,23 @@
uint8_t *is_open_spline_data = is_open_spline->storage->data + is_open_spline->storageOffset; \ uint8_t *is_open_spline_data = is_open_spline->storage->data + is_open_spline->storageOffset; \
int64_t S = THLongTensor_size(weight_index, 1); \ int64_t S = THLongTensor_size(weight_index, 1); \
int64_t D = THTensor_(size)(pseudo, 1); \ int64_t D = THTensor_(size)(pseudo, 1); \
int64_t s, d, k, k_mod, i, offset; real b, value; \ int64_t s, d, k, k_mod, wi, offset; real b, value; \
\ \
TH_TENSOR_DIM_APPLY3(real, basis, int64_t, weight_index, real, pseudo, 1, TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, \ TH_TENSOR_DIM_APPLY3(real, basis, int64_t, weight_index, real, pseudo, 1, TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, \
for (s = 0; s < S; s++) { \ for (s = 0; s < S; s++) { \
b = 1; i = 0; k = s; offset = K; \ b = 1; wi = 0; k = s; offset = K; \
for (d = 0; d < D; d++) { \ for (d = 0; d < D; d++) { \
offset /= kernel_size_data[d]; \ offset /= kernel_size_data[d]; \
k_mod = k % (M + 1); \ k_mod = k % (M + 1); \
k /= M + 1; \ k /= M + 1; \
value = *(pseudo_data + d * pseudo_stride) * (kernel_size_data[d] - M * is_open_spline_data[d]); \ value = *(pseudo_data + d * pseudo_stride) * (kernel_size_data[d] - M * is_open_spline_data[d]); \
i += (((int64_t) value + k_mod) % kernel_size_data[d]) * offset; \ wi += (((int64_t) value + k_mod) % kernel_size_data[d]) * offset; \
value -= floor(value); \ value -= floor(value); \
CODE \ CODE \
b *= value; \ b *= value; \
} \ } \
basis_data[s * basis_stride] = b; \ basis_data[s * basis_stride] = b; \
weight_index_data[s * weight_index_stride] = i; \ weight_index_data[s * weight_index_stride] = wi; \
}) \ }) \
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment