Commit fb4632ba authored by rusty1s's avatar rusty1s
Browse files

added cuda kernel

parent 6aad98c2
...@@ -10,14 +10,16 @@ def test_grid_cluster_cpu(tensor): ...@@ -10,14 +10,16 @@ def test_grid_cluster_cpu(tensor):
position = Tensor(tensor, [0, 9, 2, 8, 3]) position = Tensor(tensor, [0, 9, 2, 8, 3])
size = torch.LongTensor([5]) size = torch.LongTensor([5])
expected = torch.LongTensor([0, 1, 0, 1, 0]) expected = torch.LongTensor([0, 1, 0, 1, 0])
output = grid_cluster(position, size) output = grid_cluster(position, size)
assert output.tolist() == expected.tolist() assert output.tolist() == expected.tolist()
position = Tensor(tensor, [[0, 0], [9, 9], [2, 8], [2, 2], [8, 3]]) position = Tensor(tensor, [[0, 0], [9, 9], [2, 8], [2, 2], [8, 3]])
size = torch.LongTensor([5, 5]) size = torch.LongTensor([5, 5])
expected = torch.LongTensor([0, 3, 1, 0, 2]) expected = torch.LongTensor([0, 3, 1, 0, 2])
output = grid_cluster(position, size)
assert output.tolist() == expected.tolist()
position = Tensor(tensor, [[0, 9, 2, 2, 8], [0, 9, 8, 2, 3]]).t()
output = grid_cluster(position, size) output = grid_cluster(position, size)
assert output.tolist() == expected.tolist() assert output.tolist() == expected.tolist()
...@@ -36,21 +38,31 @@ def test_grid_cluster_cpu(tensor): ...@@ -36,21 +38,31 @@ def test_grid_cluster_cpu(tensor):
@pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA')
@pytest.mark.parametrize('tensor', tensors)
def test_grid_cluster_gpu(tensor):  # pragma: no cover
    """GPU grid clustering: 1D positions, 2D positions, a transposed
    (non-contiguous) input, a batched (expanded) input, and an explicit
    batch vector.

    NOTE(review): reconstructed from a diff view whose old/new columns were
    fused onto single lines; verify against the committed file.
    """
    # 1D positions: cell size 5 splits [0, 9, 2, 8, 3] into two clusters.
    position = Tensor(tensor, [0, 9, 2, 8, 3]).cuda()
    size = torch.cuda.LongTensor([5])
    expected = torch.cuda.LongTensor([0, 1, 0, 1, 0])
    output = grid_cluster(position, size)
    assert output.cpu().tolist() == expected.tolist()

    # 2D positions.
    position = Tensor(tensor, [[0, 0], [9, 9], [2, 8], [2, 2], [8, 3]])
    position = position.cuda()
    size = torch.cuda.LongTensor([5, 5])
    expected = torch.cuda.LongTensor([0, 3, 1, 0, 2])
    output = grid_cluster(position, size)
    assert output.cpu().tolist() == expected.tolist()

    # Same points given transposed, i.e. via a non-contiguous tensor.
    position = Tensor(tensor, [[0, 9, 2, 2, 8], [0, 9, 8, 2, 3]])
    position = position.cuda().t()
    output = grid_cluster(position, size)
    assert output.cpu().tolist() == expected.tolist()

    # Batched input via expand: every batch entry yields the same clusters.
    output = grid_cluster(position.expand(2, 5, 2), size)
    assert output.tolist() == expected.expand(2, 5).tolist()

    # Explicit batch vector separates points into per-example grids.
    expected = torch.LongTensor([0, 1, 3, 2, 4])
    batch = torch.cuda.LongTensor([0, 0, 1, 1, 1])
    output = grid_cluster(position, size, batch)
    assert output.cpu().tolist() == expected.tolist()

    output = grid_cluster(position.expand(2, 5, 2), size, batch.expand(2, 5))
    assert output.cpu().tolist() == expected.expand(2, 5).tolist()
...@@ -43,7 +43,6 @@ def grid_cluster(position, size, batch=None): ...@@ -43,7 +43,6 @@ def grid_cluster(position, size, batch=None):
func = get_func('grid', position) func = get_func('grid', position)
func(C, cluster, position, size, c_max) func(C, cluster, position, size, c_max)
cluster = cluster.squeeze(dim=-1) cluster = cluster.squeeze(dim=-1)
if not cluster.is_cuda:
cluster = consecutive(cluster) cluster = consecutive(cluster)
return cluster return cluster
// Converts a flat element index `i` into a data offset into tensor `t1` using
// its sizes and strides. The innermost dimension (Dims - 1) is deliberately
// skipped — callers index it directly on top of the returned offset.
// `Dims` is the compile-time rank, allowing the loop to be unrolled.
template <typename a, int Dims>
struct IndexToOffset {
  static __device__ void compute(int i, const TensorInfo<a>& t1, int* t1Offset) {
    for (int d = Dims - 2; d >= 0; d--) {
      const int idxAtDim = i % t1.size[d];
      *t1Offset += idxAtDim * t1.stride[d];
      i /= t1.size[d];
    }
  }
};
// Runtime-rank fallback (Dims == -1): same index-to-offset computation as the
// generic template, but the number of dimensions is read from t1.dims at
// runtime instead of being a compile-time constant.
template <typename a>
struct IndexToOffset<a, -1> {
  static __device__ void compute(int i, const TensorInfo<a>& t1, int* t1Offset) {
    for (int d = t1.dims - 2; d >= 0; d--) {
      const int idxAtDim = i % t1.size[d];
      *t1Offset += idxAtDim * t1.stride[d];
      i /= t1.size[d];
    }
  }
};
// Upper bound on tensor rank handled by these kernels.
const int MAX_DIMS = 25;

// Threads launched per CUDA block.
const int NUM_THREADS = 1024;

// Number of blocks needed so that NUM_THREADS-sized blocks cover n elements
// (ceiling division); assumes n >= 0.
inline int GET_BLOCKS(const int n) {
  const int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
  return blocks;
}
// Lightweight, by-value-passable tensor descriptor: raw data pointer plus
// per-dimension sizes and strides. Suitable for handing to kernels, since it
// contains no host-only members.
template<typename T>
struct TensorInfo {
  // Copies the first `d` entries of `sz`/`st`; `t` is stored, not owned.
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) : data(t), dims(d) {
    for (int k = 0; k < dims; k++) {
      size[k] = sz[k];
      stride[k] = st[k];
    }
  }

  T *data;
  int dims;
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};
// Grid-stride loop over [0, N): each thread starts at its global index and
// advances by the total number of launched threads, so any grid size covers N.
// BUG FIX: the increment previously hard-coded `i` instead of the macro
// parameter, so the macro only compiled when invoked as KERNEL_LOOP(i, ...).
#define KERNEL_LOOP(I, N) \
  for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
// THC generic-file trampoline: on the first (direct) include, register this
// file as the generic source; THCGenerate*Type.h then re-includes it once per
// scalar type with `real`/`THCTensor_` bound to that type.
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/common.cu"
#else
// Snapshots `tensor` into a TensorInfo<real> (data pointer, rank, sizes,
// strides). No bounds check here — callers must ensure the rank fits in
// MAX_DIMS (the launcher checks the output tensor's rank).
TensorInfo<real> thc_(getTensorInfo)(THCState *state, THCTensor *tensor) {
real *data = THCTensor_(data)(state, tensor);
int dims = THCTensor_(nDimension)(state, tensor);
int size[MAX_DIMS]; int stride[MAX_DIMS];
for (int i = 0; i < dims; i++) {
size[i] = THCTensor_(size)(state, tensor, i);
stride[i] = THCTensor_(stride)(state, tensor, i);
}
return TensorInfo<real>(data, dims, size, stride);
}
#endif
...@@ -3,7 +3,28 @@ ...@@ -3,7 +3,28 @@
#else

// Launches gridKernel: for every element of `output`, computes the flattened
// grid-cell id of the corresponding point row in `position`, using the
// per-dimension cell extents in `size` and (presumably) per-dimension cell
// counts in `count`; `C` is the total number of cells. Dispatches on the rank
// of `position` so common ranks get a compile-time IndexToOffset.
// NOTE(review): reconstructed from a diff view whose old/new columns were
// fused onto single lines; verify against the committed file.
void cluster_(grid)(THCState *state, int C, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCudaLongTensor *count) {
  THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, position, size));
  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 2, output, count));
  THArgCheck(THCudaLongTensor_nDimension(state, output) <= MAX_DIMS, 1, "Tensor too large or too many dimensions");

  int64_t *outputData = THCudaLongTensor_data(state, output);
  TensorInfo<real> positionInfo = thc_(getTensorInfo)(state, position);
  real *sizeData = THCTensor_(data)(state, size);
  int64_t *countData = THCudaLongTensor_data(state, count);

  // One thread per output element; the kernel's grid-stride loop covers any
  // remainder.
  const int N = THCudaLongTensor_nElement(state, output);
  int grid = GET_BLOCKS(N);
  cudaStream_t stream = THCState_getCurrentStream(state);

  switch (positionInfo.dims) {
    case 1: gridKernel<real, 1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 2: gridKernel<real, 2><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 3: gridKernel<real, 3><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 4: gridKernel<real, 4><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    default: gridKernel<real, -1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
  }
  // Surface launch-configuration errors immediately.
  THCudaCheck(cudaGetLastError());
}

#endif
...@@ -2,7 +2,41 @@ ...@@ -2,7 +2,41 @@
// Reconstructed post-commit include/define header for kernel.cu (the diff view
// had fused old/new columns) — verify against the committed file.
#include "kernel.h"
#include "common.cuh"
#include "THCIndex.cuh"

// Per-type name mangling for the THC generic-file mechanism.
#define cluster_(NAME) TH_CONCAT_4(cluster_, NAME, _kernel_, Real)
#define thc_(NAME) TH_CONCAT_4(thc_, NAME, _, Real)

// Instantiate generic/common.cu once per scalar type.
#include "generic/common.cu"
#include "THCGenerateAllTypes.h"
// For each flat output index i, writes the flattened grid-cell id of point i:
// each coordinate is divided by its cell extent `size[d]` (truncating to a
// cell index) and combined into a single id via a mixed-radix expansion over
// the per-dimension counts in `count`, starting from the total cell count C.
// `position` is Dims-dimensional with the coordinate axis last; IndexToOffset
// resolves all but that last axis, which is indexed directly via `+ d`.
// N is the number of output elements; excess threads are handled by the
// grid-stride KERNEL_LOOP.
template<typename Real, int Dims>
__global__ void gridKernel(int64_t *output, TensorInfo<Real> position, Real *size, int64_t *count, const int C, const int N) {
KERNEL_LOOP(i, N) {
// Offset of point i's first coordinate within position.data.
int positionOffset = 0;
IndexToOffset<Real, Dims>::compute(i, position, &positionOffset);
// tmp shrinks from C by each dimension's count, giving that digit's weight.
int tmp = C; int64_t c = 0;
for (int d = 0; d < position.size[position.dims - 1]; d++) {
tmp = tmp / count[d];
c += tmp * (int64_t) (position.data[positionOffset + d] / size[d]);
}
output[i] = c;
}
}
#include "generic/kernel.cu"
#include "THCGenerateFloatType.h"
#include "generic/kernel.cu"
#include "THCGenerateDoubleType.h"
#include "generic/kernel.cu"
#include "THCGenerateByteType.h"
#include "generic/kernel.cu"
#include "THCGenerateCharType.h"
#include "generic/kernel.cu"
#include "THCGenerateShortType.h"
#include "generic/kernel.cu"
#include "THCGenerateIntType.h"
#include "generic/kernel.cu"
#include "THCGenerateLongType.h"
...@@ -12,7 +12,7 @@ void cluster_(grid)(int C, THLongTensor *output, THTensor *position, THTensor *s ...@@ -12,7 +12,7 @@ void cluster_(grid)(int C, THLongTensor *output, THTensor *position, THTensor *s
tmp = C; c = 0; tmp = C; c = 0;
for (i = 0; i < d; i++) { for (i = 0; i < d; i++) {
tmp = tmp / *(count_data + i); tmp = tmp / *(count_data + i);
c += tmp * (int64_t)floor(*(position_data + i * position_stride) / *(size_data + i)); c += tmp * (int64_t) (*(position_data + i * position_stride) / *(size_data + i));
} }
output_data[0] = c; output_data[0] = c;
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment