Commit 3f97baae authored by rusty1s's avatar rusty1s
Browse files

gridKernel impl

parent 5d7997a0
...@@ -12,7 +12,7 @@ void THGrid_(THLongTensor *cluster, THTensor *pos, THTensor *size, THLongTensor ...@@ -12,7 +12,7 @@ void THGrid_(THLongTensor *cluster, THTensor *pos, THTensor *size, THLongTensor
for (n = 0; n < THTensor_(size)(pos, 0); n++) { for (n = 0; n < THTensor_(size)(pos, 0); n++) {
coef = 1; value = 0; coef = 1; value = 0;
for (d = 0; d < THTensor_(size)(pos, 1); d++) { for (d = 0; d < THTensor_(size)(pos, 1); d++) {
value += coef * (int64_t) (*(posData + d * pos->stride[1]) / sizeData[d]); value += coef * (int64_t) (posData[d * pos->stride[1]] / sizeData[d]);
coef *= countData[d]; coef *= countData[d];
} }
posData += pos->stride[0]; posData += pos->stride[0];
......
#include "THCGrid.h" #include "THCGrid.h"
template<typename real, int dims>
// Computes, for each of nNodes points, the flattened id of the grid cell it
// falls into, and writes it to cluster[i]. One thread per node via the
// grid-stride KERNEL_LOOP. `dims` is the compile-time dimensionality of the
// positions (1..4; see FIXED_DIM_KERNEL_RUN).
// NOTE(review): the dims == -1 fallback instantiation never enters the loop
// and would write cluster[i] = 0 — it needs a runtime dim argument; confirm.
__global__ void gridKernel(int64_t *cluster, TensorInfo<real> posInfo, real *size,
                           int64_t *count, const int nNodes) {
  KERNEL_LOOP(i, nNodes) {
    // posInfo is passed by value, so member access must use `.`, not `->`
    // (the original `posInfo->data` / `posInfo->stride` did not compile).
    real *pos = posInfo.data + i * posInfo.stride[0];
    int64_t coef = 1, value = 0;
    for (ptrdiff_t d = 0; d < dims; d++) {
      // Cell index along dimension d, accumulated mixed-radix with the
      // per-dimension cell counts.
      value += coef * (int64_t) (pos[d * posInfo.stride[1]] / size[d]);
      coef *= count[d];
    }
    cluster[i] = value;
  }
}
#include "generic/THCGrid.cu" #include "generic/THCGrid.cu"
#include "THC/THCGenerateAllTypes.h" #include "THC/THCGenerateAllTypes.h"
#ifndef THC_COMMON_INC
#define THC_COMMON_INC

// Grid-stride loop over N elements: each thread handles indices
// I, I + gridDim*blockDim, ... so any launch configuration covers all N.
// Fixed: the original guard read `I < I`, which is always false, so every
// kernel built on this macro executed zero iterations.
#define KERNEL_LOOP(I, N) \
  for (ptrdiff_t I = blockIdx.x * blockDim.x + threadIdx.x; I < (N); I += blockDim.x * gridDim.x)

#define THC_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
  "Some of the input tensors are located on different GPUs. Please move them to a single one.")

const int CUDA_NUM_THREADS = 1024;

// Number of blocks needed to cover N elements (ceiling division).
inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Launch NAME<real> over N elements on the current THC stream, then surface
// any launch-configuration error. Fixed: the launches referenced an undefined
// `NUM_THREADS`; the declared constant is CUDA_NUM_THREADS. Wrapped in
// do { } while (0) so the local declarations neither leak into the caller's
// scope nor collide when the macro is expanded twice in one function.
#define KERNEL_RUN(NAME, N, ...) do { \
  int grid = GET_BLOCKS(N); \
  cudaStream_t stream = THCState_getCurrentStream(state); \
  NAME<real><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
  THCudaCheck(cudaGetLastError()); \
} while (0)

// As KERNEL_RUN, but dispatches a runtime dimensionality DIMS to a
// compile-time template argument (specialized for 1..4, generic fallback -1).
#define FIXED_DIM_KERNEL_RUN(NAME, N, DIMS, ...) do { \
  int grid = GET_BLOCKS(N); \
  cudaStream_t stream = THCState_getCurrentStream(state); \
  switch (DIMS) { \
    case 1: NAME<real, 1><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    case 2: NAME<real, 2><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    case 3: NAME<real, 3><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    case 4: NAME<real, 4><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    default: NAME<real, -1><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
  } \
  THCudaCheck(cudaGetLastError()); \
} while (0)

#endif // THC_COMMON_INC
...@@ -4,7 +4,16 @@ ...@@ -4,7 +4,16 @@
// Host wrapper: extracts raw device pointers / tensor info and dispatches
// gridKernel with a compile-time dimensionality picked from size's length.
// (Reconstructed from the new side of the garbled two-column diff; the old
// debug printf was removed by this commit.)
void THCGrid_(THCState *state, THCudaLongTensor *cluster, THCTensor *pos, THCTensor *size,
              THCudaLongTensor *count) {
  // All four tensors must live on the same GPU.
  THC_assertSameGPU(state, 4, cluster, pos, size, count);

  int64_t *clusterData = THCudaLongTensor_data(state, cluster);
  TensorInfo<real> posInfo = THC_(getTensorInfo)(state, pos);
  real *sizeData = THCTensor_(data)(state, size);
  int64_t *countData = THCudaLongTensor_data(state, count);

  const int nNodes = THCudaLongTensor_nElement(state, cluster);
  // Fixed: THC accessors take the THCState as first argument; the original
  // called THCTensor_(nElement)(size) without it.
  const int dims = THCTensor_(nElement)(state, size);

  FIXED_DIM_KERNEL_RUN(gridKernel, nNodes, dims, clusterData, posInfo, sizeData, countData);
}
#endif // THC_GENERIC_FILE
import time
import torch
from torch_cluster import sparse_grid_cluster

# Benchmark sparse_grid_cluster on n random 3D points, GPU first, then CPU.
n = 90000000
s = 1 / 64

print('GPU ===================')
tic = time.perf_counter()
pos = torch.cuda.FloatTensor(n, 3).uniform_(0, 1)
size = torch.cuda.FloatTensor([s, s, s])
torch.cuda.synchronize()
print('Init:', time.perf_counter() - tic)

# Time the full clustering call; synchronize so the async kernel is included.
tic = time.perf_counter()
sparse_grid_cluster(pos, size)
torch.cuda.synchronize()
elapsed = time.perf_counter() - tic
print('All:', elapsed)

print('CPU ===================')
pos, size = pos.cpu(), size.cpu()
tic = time.perf_counter()
sparse_grid_cluster(pos, size)
elapsed = time.perf_counter() - tic
print('All:', elapsed)
import time
import torch
from torch_cluster.functions.utils.ffi import _get_func

# Compare the runtime of torch.bernoulli against the custom `serial` kernel
# on a 500M-element CUDA tensor. Synchronize around each timed region so the
# asynchronous kernels are actually measured.
output = torch.cuda.FloatTensor(500000000).fill_(0.5)
torch.cuda.synchronize()

tic = time.perf_counter()
torch.bernoulli(output)
torch.cuda.synchronize()
print(time.perf_counter() - tic)

# Reuse the same storage as a LongTensor filled with -1 for the serial kernel.
output = output.long().fill_(-1)
func = _get_func('serial', output)
torch.cuda.synchronize()

tic = time.perf_counter()
func(output, output, output, output)
torch.cuda.synchronize()
print(time.perf_counter() - tic)
import torch
from torch_cluster._ext import ffi

# Smoke-test the raw FFI bindings by listing what the extension exports.
print(ffi.__dict__)
print(ffi.THByteGrid)

# Grid clustering: 5 two-dimensional points, cell size 2x2, 3x3 cell grid.
# THFloatGrid fills `cluster` in place with each point's flat cell id.
cluster = torch.LongTensor(5)
pos = torch.Tensor([[1, 1], [1, 1], [3, 3], [4, 4], [3, 3]])
size = torch.Tensor([2, 2])
count = torch.LongTensor([3, 3])
ffi.THFloatGrid(cluster, pos, size, count)
print(cluster)

# Greedy clustering on a fully-connected 3-node graph given as edge lists
# (row/col), node degrees, and edge weights; result written into `cluster`.
cluster = torch.LongTensor(3)
row = torch.LongTensor([0, 0, 1, 1, 2, 2])
col = torch.LongTensor([1, 2, 0, 2, 0, 1])
deg = torch.LongTensor([2, 2, 2])
weight = torch.Tensor([1, 2, 1, 1, 2, 1])
ffi.THFloatGreedy(cluster, row, col, deg, weight)
print(cluster)
...@@ -8,11 +8,11 @@ from torch.utils.ffi import create_extension ...@@ -8,11 +8,11 @@ from torch.utils.ffi import create_extension
if osp.exists('build'): if osp.exists('build'):
shutil.rmtree('build') shutil.rmtree('build')
files = ['serial', 'grid'] files = ['Greedy', 'Grid']
headers = ['torch_cluster/src/{}_cpu.h'.format(f) for f in files] headers = ['aten/TH/TH{}.h'.format(f) for f in files]
sources = ['torch_cluster/src/{}_cpu.c'.format(f) for f in files] sources = ['aten/TH/TH{}.c'.format(f) for f in files]
include_dirs = ['torch_cluster/src', 'aten/TH'] include_dirs = ['aten/TH']
define_macros = [] define_macros = []
extra_objects = [] extra_objects = []
with_cuda = False with_cuda = False
......
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
extern "C" { extern "C" {
#endif #endif
int assignColor(THCState *state, THCudaLongTensor *color);
void cluster_serial_kernel(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree); void cluster_serial_kernel(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree);
void cluster_serial_kernel_Float (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaTensor *weight); void cluster_serial_kernel_Float (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaTensor *weight);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment