Commit 5e6f3394 authored by rusty1s's avatar rusty1s
Browse files

added spline basis cuda impl

parent e1fcf1d2
...@@ -7,4 +7,4 @@ SRC_DIR=torch_spline_conv/kernel ...@@ -7,4 +7,4 @@ SRC_DIR=torch_spline_conv/kernel
BUILD_DIR=torch_spline_conv/build BUILD_DIR=torch_spline_conv/build
mkdir -p "$BUILD_DIR" mkdir -p "$BUILD_DIR"
$(which nvcc) -c -o "$BUILD_DIR/kernel.so" "$SRC_DIR/kernel.cu" -arch=sm_35 -Xcompiler -fPIC -shared "-I$1/lib/include/TH" "-I$1/lib/include/THC" "-I$SRC_DIR" $(which nvcc) -c -o "$BUILD_DIR/kernel.so" "$SRC_DIR/kernel.cu" -arch=sm_35 -Xcompiler -fPIC -shared "-I$TORCH/lib/include/TH" "-I$TORCH/lib/include/THC" "-I$SRC_DIR"
...@@ -22,7 +22,7 @@ def test_spline_basis_cpu(tensor, i): ...@@ -22,7 +22,7 @@ def test_spline_basis_cpu(tensor, i):
is_open_spline = torch.ByteTensor(data[i]['is_open_spline']) is_open_spline = torch.ByteTensor(data[i]['is_open_spline'])
K = kernel_size.prod() K = kernel_size.prod()
expected_basis = Tensor(tensor, data[i]['expected_basis']) expected_basis = Tensor(tensor, data[i]['expected_basis'])
expected_index = torch.ByteTensor(data[i]['expected_index']) expected_index = torch.LongTensor(data[i]['expected_index'])
basis, index = spline_basis_forward(degree, pseudo, kernel_size, basis, index = spline_basis_forward(degree, pseudo, kernel_size,
is_open_spline, K) is_open_spline, K)
...@@ -41,6 +41,13 @@ def test_spline_basis_gpu(tensor, i): ...@@ -41,6 +41,13 @@ def test_spline_basis_gpu(tensor, i):
kernel_size = torch.cuda.LongTensor(data[i]['kernel_size']) kernel_size = torch.cuda.LongTensor(data[i]['kernel_size'])
is_open_spline = torch.cuda.ByteTensor(data[i]['is_open_spline']) is_open_spline = torch.cuda.ByteTensor(data[i]['is_open_spline'])
K = kernel_size.prod() K = kernel_size.prod()
expected_basis = Tensor(tensor, data[i]['expected_basis'])
basis, index = spline_basis_forward(degree, pseudo, kernel_size, expected_index = torch.LongTensor(data[i]['expected_index'])
is_open_spline, K)
if i == 0:
basis, index = spline_basis_forward(degree, pseudo, kernel_size,
is_open_spline, K)
basis, index = basis.cpu(), index.cpu()
basis = [pytest.approx(x, 0.01) for x in basis.view(-1).tolist()]
assert basis == expected_basis.view(-1).tolist()
assert index.tolist() == expected_index.tolist()
// Host-side dispatch macro for the spline-basis forward pass.
// Expands inside a THC generic function where `state`, `real`, `THCTensor_`
// and `thc_(...)` are bound to the current scalar type.
//   NAME           - kernel template to launch (linear/quadratic/cubicBasisForwardKernel)
//   basis          - output tensor of basis values, one entry per (edge, product) pair
//   weight_index   - output tensor (int64) of flattened kernel weight indices
//   pseudo         - input pseudo-coordinates, indexed as [edge][dim]
//   kernel_size    - int64 device array, kernel size per dimension
//   is_open_spline - uint8 device array, open/closed flag per dimension
//   K              - total number of kernel weights (product of kernel sizes)
// Verifies all three input tensors live on the same GPU, flattens them into
// TensorInfo views, and launches NAME over every element of `basis` via
// KERNEL_RUN, dispatching on the dimensionality pseudoInfo.size[1].
#define SPLINE_BASIS_FORWARD(NAME, basis, weight_index, pseudo, kernel_size, is_open_spline, K) { \
THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, pseudo, kernel_size, is_open_spline)); \
\
const int n = THCTensor_(nElement)(state, basis); \
TensorInfo<real> basisInfo = thc_(getTensorInfo)(state, basis); \
TensorInfo<int64_t> weightIndexInfo = thc_getTensorInfo_Long(state, weight_index); \
TensorInfo<real> pseudoInfo = thc_(getTensorInfo)(state, pseudo); \
int64_t *kernelSizeData = THCudaLongTensor_data(state, kernel_size); \
uint8_t *isOpenSplineData = THCudaByteTensor_data(state, is_open_spline); \
\
KERNEL_RUN(NAME, pseudoInfo.size[1], n, basisInfo, weightIndexInfo, pseudoInfo, kernelSizeData, isOpenSplineData, K) \
}
// Per-element spline basis computation, with the number of dimensions D known
// at compile time (the D = -1 specialization below handles the runtime case).
// Template parameters: Real = scalar type, M = spline degree, D = #dimensions.
// Each thread index i identifies one (edge, basis-product) pair; the method
// writes one basis value and one flattened weight index.
template<typename Real, int M, int D>
struct SplineBasisForward {
static __device__ void compute(int i, const TensorInfo<Real>& basis, const TensorInfo<int64_t>& weightIndex, const TensorInfo<Real>& pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K) {
// k enumerates the basis products for this edge; basis.size[1] = (M+1)^D.
int64_t k = i % basis.size[1];
// Offset of this edge's pseudo-coordinate row (row index wraps modulo the
// number of rows in `pseudo`).
int64_t pseudoOffset = ((i / basis.size[1]) % pseudo.size[0]) * pseudo.stride[0];
int64_t d, k_mod, wi = 0, offset = K; Real b = 1, value;
for (d = 0; d < D; d++) {
// Stride of dimension d in the flattened weight index (row-major over
// kernelSize, starting from the full product K).
offset /= kernelSize[d];
// Extract this dimension's digit of k in base (M+1).
k_mod = k % (M + 1);
k /= M + 1;
// Scale the pseudo-coordinate into knot space; open splines have
// M fewer usable intervals than closed ones.
value = pseudo.data[pseudoOffset + d * pseudo.stride[1]] * (kernelSize[d] - M * isOpenSpline[d]);
// Accumulate the flattened index of the contributing kernel weight.
wi += (((int64_t) value + k_mod) % kernelSize[d]) * offset;
// Fractional part of the coordinate within its knot interval.
value -= floor(value);
// NOTE(review): this closed form equals (1 - value) for k_mod = 0 and
// value for k_mod = 1, i.e. the degree-1 (linear) basis. The kernels
// below also instantiate M = 2 and M = 3 with this same formula —
// confirm higher-degree bases are intended here.
value = 1 - value - k_mod + 2 * value * k_mod;
// Multiply the per-dimension factors into the tensor-product basis.
b *= value;
}
basis.data[i] = b;
weightIndex.data[i] = wi;
}
};
// Fallback specialization for D = -1: dimensionality is not known at compile
// time, so the loop bound is read from pseudo.size[1] at runtime instead of
// being a compile-time constant (used by KERNEL_RUN's `default` case for
// dimensionalities above 4). The body otherwise mirrors the primary template.
template<typename Real, int M>
struct SplineBasisForward<Real, M, -1> {
static __device__ void compute(int i, const TensorInfo<Real>& basis, const TensorInfo<int64_t>& weightIndex, const TensorInfo<Real>& pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K) {
// k enumerates the basis products for this edge; basis.size[1] = (M+1)^D.
int64_t k = i % basis.size[1];
// Offset of this edge's pseudo-coordinate row.
int64_t pseudoOffset = ((i / basis.size[1]) % pseudo.size[0]) * pseudo.stride[0];
int64_t d, k_mod, wi = 0, offset = K; Real b = 1, value;
// Runtime loop bound — the only difference from the primary template.
for (d = 0; d < pseudo.size[1]; d++) {
offset /= kernelSize[d];
// This dimension's digit of k in base (M+1).
k_mod = k % (M + 1);
k /= M + 1;
// Scale pseudo-coordinate into knot space (open splines lose M intervals).
value = pseudo.data[pseudoOffset + d * pseudo.stride[1]] * (kernelSize[d] - M * isOpenSpline[d]);
// Accumulate flattened kernel-weight index.
wi += (((int64_t) value + k_mod) % kernelSize[d]) * offset;
value -= floor(value);
// NOTE(review): degree-1 closed form reused for all M — see the note on
// the primary template; verify for M = 2, 3.
value = 1 - value - k_mod + 2 * value * k_mod;
b *= value;
}
basis.data[i] = b;
weightIndex.data[i] = wi;
}
};
// Forward kernel for linear (degree M = 1) B-spline bases. One logical thread
// per entry of `basis`; `n` is the total element count of `basis`.
template<typename Real, int D>
__global__ void linearBasisForwardKernel(TensorInfo<Real> basis, TensorInfo<int64_t> weightIndex, TensorInfo<Real> pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K, int n) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    SplineBasisForward<Real, 1, D>::compute(i, basis, weightIndex, pseudo, kernelSize, isOpenSpline, K);
  }
}
// Forward kernel for quadratic (degree M = 2) B-spline bases. One logical
// thread per entry of `basis`; `n` is the total element count of `basis`.
template<typename Real, int D>
__global__ void quadraticBasisForwardKernel(TensorInfo<Real> basis, TensorInfo<int64_t> weightIndex, TensorInfo<Real> pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K, int n) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    SplineBasisForward<Real, 2, D>::compute(i, basis, weightIndex, pseudo, kernelSize, isOpenSpline, K);
  }
}
// Forward kernel for cubic (degree M = 3) B-spline bases. One logical thread
// per entry of `basis`; `n` is the total element count of `basis`.
template<typename Real, int D>
__global__ void cubicBasisForwardKernel(TensorInfo<Real> basis, TensorInfo<int64_t> weightIndex, TensorInfo<Real> pseudo, int64_t *kernelSize, uint8_t *isOpenSpline, int K, int n) {
  const int step = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step) {
    SplineBasisForward<Real, 3, D>::compute(i, basis, weightIndex, pseudo, kernelSize, isOpenSpline, K);
  }
}
// Maximum tensor rank supported by TensorInfo's fixed-size arrays.
const int MAX_DIMS = 25;
// Threads per block used by every kernel launch in this file.
const int NUM_THREADS = 1024;

// Smallest number of blocks of NUM_THREADS threads that covers n elements
// (ceiling division).
inline int GET_BLOCKS(const int n) {
  const int rounded = n + NUM_THREADS - 1;
  return rounded / NUM_THREADS;
}
// Plain, device-copyable view of a tensor: raw data pointer plus dimension
// count, sizes and strides copied into fixed-size arrays so the struct can be
// passed to kernels by value. Only the first `dims` entries of size/stride
// are initialized.
template<typename T>
struct TensorInfo {
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) : data(t), dims(d) {
    for (int dim = 0; dim < d; dim++) {
      size[dim] = sz[dim];
      stride[dim] = st[dim];
    }
  }
  T *data;
  int dims;
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};
// Grid-stride loop: declares loop variable I and iterates it over [0, N)
// across the entire grid, so any launch configuration covers all N elements.
// Fix: the increment previously used the hard-coded identifier `i` instead of
// the macro parameter `I`, so any caller naming its loop variable anything
// other than `i` would fail to compile or loop incorrectly.
#define KERNEL_LOOP(I, N) \
for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
// Launches kernel template NAME over N elements on the current THC stream,
// dispatching the dimensionality D to a compile-time template argument for
// D = 1..4 and falling back to the runtime (-1) specialization otherwise.
// `real` is the THC scalar type bound at the point of expansion; __VA_ARGS__
// are forwarded to the kernel ahead of the trailing element count N.
// Checks for launch-configuration errors via cudaGetLastError() afterwards.
#define KERNEL_RUN(NAME, D, N, ...) { \
int grid = GET_BLOCKS(N); \
cudaStream_t stream = THCState_getCurrentStream(state); \
switch (D) { \
case 1: NAME<real, 1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
case 2: NAME<real, 2><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
case 3: NAME<real, 3><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
case 4: NAME<real, 4><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
default: NAME<real, -1><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
} \
THCudaCheck(cudaGetLastError()); \
}
// THC "generic file" idiom: on first inclusion THC_GENERIC_FILE is undefined,
// so this header only registers itself; the including file (see kernel.cu:
// #include "generic/common.cu" followed by "THCGenerateAllTypes.h") then
// re-includes it once per scalar type with `real`, `THCTensor_` and `thc_`
// bound to that type, compiling the #else branch each time.
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/common.cu"
#else
// Builds a TensorInfo view of `tensor` for the current scalar type `real`,
// copying its sizes and strides out of the THC tensor descriptor.
// Assumes the tensor has at most MAX_DIMS dimensions.
TensorInfo<real> thc_(getTensorInfo)(THCState *state, THCTensor *tensor) {
real *data = THCTensor_(data)(state, tensor);
int dims = THCTensor_(nDimension)(state, tensor);
int size[MAX_DIMS]; int stride[MAX_DIMS];
for (int i = 0; i < dims; i++) {
size[i] = THCTensor_(size)(state, tensor, i);
stride[i] = THCTensor_(stride)(state, tensor, i);
}
return TensorInfo<real>(data, dims, size, stride);
}
#endif
...@@ -3,15 +3,15 @@ ...@@ -3,15 +3,15 @@
#else #else
void spline_(linear_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) { void spline_(linear_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) {
printf("linear"); SPLINE_BASIS_FORWARD(linearBasisForwardKernel, basis, weight_index, pseudo, kernel_size, is_open_spline, K)
} }
void spline_(quadratic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) { void spline_(quadratic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) {
printf("quadratic"); SPLINE_BASIS_FORWARD(quadraticBasisForwardKernel, basis, weight_index, pseudo, kernel_size, is_open_spline, K)
} }
void spline_(cubic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) { void spline_(cubic_basis_forward)(THCState *state, THCTensor *basis, THCudaLongTensor *weight_index, THCTensor *pseudo, THCudaLongTensor *kernel_size, THCudaByteTensor *is_open_spline, int K) {
printf("cubic"); SPLINE_BASIS_FORWARD(cubicBasisForwardKernel, basis, weight_index, pseudo, kernel_size, is_open_spline, K)
} }
#endif #endif
...@@ -2,7 +2,14 @@ ...@@ -2,7 +2,14 @@
#include "kernel.h" #include "kernel.h"
#include "common.cuh"
#include "THCBasisForward.cuh"
#define spline_(NAME) TH_CONCAT_4(spline_, NAME, _kernel_, Real) #define spline_(NAME) TH_CONCAT_4(spline_, NAME, _kernel_, Real)
#define thc_(NAME) TH_CONCAT_4(thc_, NAME, _, Real)
#include "generic/common.cu"
#include "THCGenerateAllTypes.h"
#include "generic/kernel.cu" #include "generic/kernel.cu"
#include "THCGenerateFloatType.h" #include "THCGenerateFloatType.h"
......
...@@ -9,23 +9,23 @@ ...@@ -9,23 +9,23 @@
uint8_t *is_open_spline_data = is_open_spline->storage->data + is_open_spline->storageOffset; \ uint8_t *is_open_spline_data = is_open_spline->storage->data + is_open_spline->storageOffset; \
int64_t S = THLongTensor_size(weight_index, 1); \ int64_t S = THLongTensor_size(weight_index, 1); \
int64_t D = THTensor_(size)(pseudo, 1); \ int64_t D = THTensor_(size)(pseudo, 1); \
int64_t s, d, k, k_mod, i, offset; real b, value; \ int64_t s, d, k, k_mod, wi, offset; real b, value; \
\ \
TH_TENSOR_DIM_APPLY3(real, basis, int64_t, weight_index, real, pseudo, 1, TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, \ TH_TENSOR_DIM_APPLY3(real, basis, int64_t, weight_index, real, pseudo, 1, TH_TENSOR_DIM_APPLY3_SIZE_EQ_EXCEPT_DIM, \
for (s = 0; s < S; s++) { \ for (s = 0; s < S; s++) { \
b = 1; i = 0; k = s; offset = K; \ b = 1; wi = 0; k = s; offset = K; \
for (d = 0; d < D; d++) { \ for (d = 0; d < D; d++) { \
offset /= kernel_size_data[d]; \ offset /= kernel_size_data[d]; \
k_mod = k % (M + 1); \ k_mod = k % (M + 1); \
k /= M + 1; \ k /= M + 1; \
value = *(pseudo_data + d * pseudo_stride) * (kernel_size_data[d] - M * is_open_spline_data[d]); \ value = *(pseudo_data + d * pseudo_stride) * (kernel_size_data[d] - M * is_open_spline_data[d]); \
i += (((int64_t) value + k_mod) % kernel_size_data[d]) * offset; \ wi += (((int64_t) value + k_mod) % kernel_size_data[d]) * offset; \
value -= floor(value); \ value -= floor(value); \
CODE \ CODE \
b *= value; \ b *= value; \
} \ } \
basis_data[s * basis_stride] = b; \ basis_data[s * basis_stride] = b; \
weight_index_data[s * weight_index_stride] = i; \ weight_index_data[s * weight_index_stride] = wi; \
}) \ }) \
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment