Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
torch-cluster
Commits
eace3488
Commit
eace3488
authored
Feb 08, 2018
by
rusty1s
Browse files
performance boost, however, not finished yet
parent
37778e99
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
85 additions
and
70 deletions
+85
-70
setup.py
setup.py
+1
-1
torch_cluster/__init__.py
torch_cluster/__init__.py
+1
-1
torch_cluster/functions/grid.py
torch_cluster/functions/grid.py
+11
-14
torch_cluster/functions/utils.py
torch_cluster/functions/utils.py
+4
-4
torch_cluster/kernel/generic/kernel.cu
torch_cluster/kernel/generic/kernel.cu
+18
-10
torch_cluster/kernel/kernel.cu
torch_cluster/kernel/kernel.cu
+9
-6
torch_cluster/kernel/kernel.h
torch_cluster/kernel/kernel.h
+7
-7
torch_cluster/src/cpu.h
torch_cluster/src/cpu.h
+7
-7
torch_cluster/src/cuda.h
torch_cluster/src/cuda.h
+7
-7
torch_cluster/src/generic/cpu.c
torch_cluster/src/generic/cpu.c
+18
-11
torch_cluster/src/generic/cuda.c
torch_cluster/src/generic/cuda.c
+2
-2
No files found.
setup.py
View file @
eace3488
...
@@ -2,7 +2,7 @@ from os import path as osp
...
@@ -2,7 +2,7 @@ from os import path as osp
from
setuptools
import
setup
,
find_packages
from
setuptools
import
setup
,
find_packages
__version__
=
'0.1.
1
'
__version__
=
'0.1.
2
'
url
=
'https://github.com/rusty1s/pytorch_cluster'
url
=
'https://github.com/rusty1s/pytorch_cluster'
install_requires
=
[
'cffi'
,
'torch-unique'
]
install_requires
=
[
'cffi'
,
'torch-unique'
]
...
...
torch_cluster/__init__.py
View file @
eace3488
from .functions.grid import grid_cluster

# Keep in sync with the version declared in setup.py.
__version__ = '0.1.2'

__all__ = ['grid_cluster', '__version__']
torch_cluster/functions/grid.py
View file @
eace3488
...
@@ -22,31 +22,28 @@ def grid_cluster(position, size, batch=None):
...
@@ -22,31 +22,28 @@ def grid_cluster(position, size, batch=None):
size
=
torch
.
cat
([
size
.
new
(
1
).
fill_
(
1
),
size
],
dim
=-
1
)
size
=
torch
.
cat
([
size
.
new
(
1
).
fill_
(
1
),
size
],
dim
=-
1
)
# Translate to minimal positive positions.
# Translate to minimal positive positions.
min
=
position
.
min
(
dim
=-
2
,
keepdim
=
True
)[
0
]
p_
min
=
position
.
min
(
dim
=-
2
,
keepdim
=
True
)[
0
]
position
=
position
-
min
position
=
position
-
p_
min
# Compute cluster count for each dimension.
# Compute maximal position for each dimension.
max
=
position
.
max
(
dim
=
0
)[
0
]
p_max
=
position
.
max
(
dim
=
0
)[
0
]
while
max
.
dim
()
>
1
:
while
p_max
.
dim
()
>
1
:
max
=
max
.
max
(
dim
=
0
)[
0
]
p_max
=
p_max
.
max
(
dim
=
0
)[
0
]
c_max
=
torch
.
floor
(
max
.
double
()
/
size
.
double
()
+
1
).
long
()
c_max
=
torch
.
clamp
(
c_max
,
min
=
1
)
C
=
c_max
.
prod
()
# Generate cluster tensor.
# Generate cluster tensor.
s
=
list
(
position
.
size
())
s
=
list
(
position
.
size
())[:
-
1
]
+
[
1
]
s
[
-
1
]
=
1
cluster
=
size
.
new
(
torch
.
Size
(
s
)).
long
()
cluster
=
c_max
.
new
(
torch
.
Size
(
s
))
# Fill cluster tensor and reshape.
# Fill cluster tensor and reshape.
size
=
size
.
type_as
(
position
)
size
=
size
.
type_as
(
position
)
func
=
get_func
(
'grid'
,
position
)
func
=
get_func
(
'grid'
,
position
)
func
(
C
,
cluster
,
position
,
size
,
c
_max
)
C
=
func
(
cluster
,
position
,
size
,
p
_max
)
cluster
=
cluster
.
squeeze
(
dim
=-
1
)
cluster
=
cluster
.
squeeze
(
dim
=-
1
)
cluster
,
u
=
consecutive
(
cluster
)
cluster
,
u
=
consecutive
(
cluster
)
if
batch
is
None
:
if
batch
is
None
:
return
cluster
return
cluster
else
:
else
:
batch
=
(
u
/
c_max
[
1
:].
prod
()).
long
()
print
(
p_max
.
tolist
(),
size
.
tolist
(),
C
)
batch
=
(
u
/
C
).
long
()
return
cluster
,
batch
return
cluster
,
batch
torch_cluster/functions/utils.py
View file @
eace3488
...
@@ -11,12 +11,12 @@ def get_func(name, tensor):
...
@@ -11,12 +11,12 @@ def get_func(name, tensor):
return
func
return
func
def get_type(max_value, cuda):
    """Return the smallest integer tensor class able to hold ``max_value``.

    Args:
        max_value: Largest non-negative value the tensor must represent.
        cuda: If ``True``, return the ``torch.cuda.*`` tensor class,
            otherwise the CPU one.

    Returns:
        A tensor class (Byte, Short, Int or Long), chosen by the signed
        upper bound of each dtype.
    """
    if max_value <= 255:
        return torch.cuda.ByteTensor if cuda else torch.ByteTensor
    elif max_value <= 32767:  # pragma: no cover
        return torch.cuda.ShortTensor if cuda else torch.ShortTensor
    elif max_value <= 2147483647:  # pragma: no cover
        return torch.cuda.IntTensor if cuda else torch.IntTensor
    else:  # pragma: no cover
        return torch.cuda.LongTensor if cuda else torch.LongTensor
...
...
torch_cluster/kernel/generic/kernel.cu
View file @
eace3488
...
@@ -2,29 +2,37 @@
...
@@ -2,29 +2,37 @@
#define THC_GENERIC_FILE "generic/kernel.cu"
#define THC_GENERIC_FILE "generic/kernel.cu"
#else
#else
/* Launch the grid-clustering kernel for every point in `position` and
 * return the total number of grid cells C.
 *
 * output      - int64 tensor receiving one flat cell id per point.
 * position    - point coordinates (last dimension = spatial dims).
 * size        - per-dimension grid cell size.
 * maxPosition - per-dimension maximal (already min-shifted) coordinate.
 */
int64_t cluster_(grid)(THCState *state, THCudaLongTensor *output, THCTensor *position,
                       THCTensor *size, THCTensor *maxPosition) {
  THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, position, size, maxPosition));
  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, output));
  THArgCheck(THCTensor_(nDimension)(state, position) <= MAX_DIMS, 1,
             "Tensor too large or too many dimensions");

  int64_t *outputData = THCudaLongTensor_data(state, output);
  TensorInfo<real> positionInfo = thc_(getTensorInfo)(state, position);
  real *sizeData = THCTensor_(data)(state, size);
  real *maxPositionData = THCTensor_(data)(state, maxPosition);

  const int N = THCudaLongTensor_nElement(state, output);
  int grid = GET_BLOCKS(N);
  cudaStream_t stream = THCState_getCurrentStream(state);

  /* Dispatch on the statically-known dimensionality; -1 is the generic path. */
  switch (positionInfo.dims) {
    case 1:
      gridKernel<real, 1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N);
      break;
    case 2:
      gridKernel<real, 2><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N);
      break;
    case 3:
      gridKernel<real, 3><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N);
      break;
    default:
      gridKernel<real, -1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N);
      break;
  }
  THCudaCheck(cudaGetLastError());

  /* Total cell count: product over dimensions of per-dimension cell counts.
   * Uses int64_t to match the declared return type (was a `real`, and the
   * loop body overwrote instead of accumulating).
   * NOTE(review): maxPositionData/sizeData point to *device* memory; reading
   * them from host code here is invalid without a device->host copy — this
   * function is marked unfinished upstream, confirm before relying on C. */
  int64_t C = 1;
  for (ptrdiff_t d = 1; d < THCTensor_(nElement)(state, size); d++) {
    C *= (int64_t) (maxPositionData[d] / sizeData[d]) + 1;
  }
  return C;
}
#endif
#endif
torch_cluster/kernel/kernel.cu
View file @
eace3488
...
@@ -12,15 +12,18 @@
...
@@ -12,15 +12,18 @@
#include "THCGenerateAllTypes.h"
#include "THCGenerateAllTypes.h"
/* Compute, for each of the N points, its flat grid-cell id.
 * The id is a row-major encoding over the D spatial dimensions:
 * walking d from D-1 down to 0, each dimension contributes
 * floor(coord/size) scaled by the product of cell counts of the
 * dimensions already processed. */
template <typename Real, int Dims>
__global__ void gridKernel(int64_t *output, TensorInfo<Real> position, Real *size,
                           Real *maxPosition, const int N) {
  KERNEL_LOOP(i, N) {
    int positionOffset = 0;
    IndexToOffset<Real, Dims>::compute(i, position, &positionOffset);

    int D = position.size[position.dims - 1];
    int64_t weight = 1;  /* int64_t: product of cell counts can exceed 32 bits */
    int64_t cluster = 0;
    for (int d = D - 1; d >= 0; d--) {
      cluster += weight * (int64_t) (position.data[positionOffset + d] / size[d]);
      weight *= (int64_t) (maxPosition[d] / size[d]) + 1;
    }
    output[i] = cluster;
  }
}
...
...
torch_cluster/kernel/kernel.h
View file @
eace3488
...
@@ -2,13 +2,13 @@
...
@@ -2,13 +2,13 @@
extern
"C"
{
extern
"C"
{
#endif
#endif
/* Per-dtype CUDA grid-clustering entry points (instantiated from
 * generic/kernel.cu).  Each writes one flat grid-cell id per point into
 * `output` and returns an int64_t cell count (computation marked
 * unfinished upstream — see generic/kernel.cu). */
int64_t cluster_grid_kernel_Float(THCState *state, THCudaLongTensor *output, THCudaTensor *position, THCudaTensor *size, THCudaTensor *maxPosition);
int64_t cluster_grid_kernel_Double(THCState *state, THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaDoubleTensor *maxPosition);
int64_t cluster_grid_kernel_Byte(THCState *state, THCudaLongTensor *output, THCudaByteTensor *position, THCudaByteTensor *size, THCudaByteTensor *maxPosition);
int64_t cluster_grid_kernel_Char(THCState *state, THCudaLongTensor *output, THCudaCharTensor *position, THCudaCharTensor *size, THCudaCharTensor *maxPosition);
int64_t cluster_grid_kernel_Short(THCState *state, THCudaLongTensor *output, THCudaShortTensor *position, THCudaShortTensor *size, THCudaShortTensor *maxPosition);
int64_t cluster_grid_kernel_Int(THCState *state, THCudaLongTensor *output, THCudaIntTensor *position, THCudaIntTensor *size, THCudaIntTensor *maxPosition);
int64_t cluster_grid_kernel_Long(THCState *state, THCudaLongTensor *output, THCudaLongTensor *position, THCudaLongTensor *size, THCudaLongTensor *maxPosition);
#ifdef __cplusplus
#ifdef __cplusplus
}
}
...
...
torch_cluster/src/cpu.h
View file @
eace3488
/* Per-dtype CPU grid-clustering entry points (instantiated from
 * generic/cpu.c).  Each writes one flat grid-cell id per point into
 * `output` and returns the product of per-dimension cell counts. */
int64_t cluster_grid_Float(THLongTensor *output, THFloatTensor *position, THFloatTensor *size, THFloatTensor *maxPosition);
int64_t cluster_grid_Double(THLongTensor *output, THDoubleTensor *position, THDoubleTensor *size, THDoubleTensor *maxPosition);
int64_t cluster_grid_Byte(THLongTensor *output, THByteTensor *position, THByteTensor *size, THByteTensor *maxPosition);
int64_t cluster_grid_Char(THLongTensor *output, THCharTensor *position, THCharTensor *size, THCharTensor *maxPosition);
int64_t cluster_grid_Short(THLongTensor *output, THShortTensor *position, THShortTensor *size, THShortTensor *maxPosition);
int64_t cluster_grid_Int(THLongTensor *output, THIntTensor *position, THIntTensor *size, THIntTensor *maxPosition);
int64_t cluster_grid_Long(THLongTensor *output, THLongTensor *position, THLongTensor *size, THLongTensor *maxPosition);
torch_cluster/src/cuda.h
View file @
eace3488
/* Per-dtype CUDA-dispatch grid-clustering entry points (instantiated from
 * generic/cuda.c); thin wrappers around cluster_grid_kernel_* that supply
 * the file-scope THCState. */
int64_t cluster_grid_cuda_Float(THCudaLongTensor *output, THCudaTensor *position, THCudaTensor *size, THCudaTensor *maxPosition);
int64_t cluster_grid_cuda_Double(THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaDoubleTensor *maxPosition);
int64_t cluster_grid_cuda_Byte(THCudaLongTensor *output, THCudaByteTensor *position, THCudaByteTensor *size, THCudaByteTensor *maxPosition);
int64_t cluster_grid_cuda_Char(THCudaLongTensor *output, THCudaCharTensor *position, THCudaCharTensor *size, THCudaCharTensor *maxPosition);
int64_t cluster_grid_cuda_Short(THCudaLongTensor *output, THCudaShortTensor *position, THCudaShortTensor *size, THCudaShortTensor *maxPosition);
int64_t cluster_grid_cuda_Int(THCudaLongTensor *output, THCudaIntTensor *position, THCudaIntTensor *size, THCudaIntTensor *maxPosition);
int64_t cluster_grid_cuda_Long(THCudaLongTensor *output, THCudaLongTensor *position, THCudaLongTensor *size, THCudaLongTensor *maxPosition);
torch_cluster/src/generic/cpu.c
View file @
eace3488
...
@@ -2,20 +2,27 @@
...
@@ -2,20 +2,27 @@
#define TH_GENERIC_FILE "generic/cpu.c"
#define TH_GENERIC_FILE "generic/cpu.c"
#else
#else
void
cluster_
(
grid
)(
int
C
,
THLongTensor
*
output
,
THTensor
*
position
,
THTensor
*
size
,
TH
Long
Tensor
*
count
)
{
int64_t
cluster_
(
grid
)(
THLongTensor
*
output
,
THTensor
*
position
,
THTensor
*
size
,
THTensor
*
maxPosition
)
{
real
*
size_data
=
size
->
storage
->
data
+
size
->
storageOffset
;
real
*
size_data
=
size
->
storage
->
data
+
size
->
storageOffset
;
int64_t
*
count_data
=
count
->
storage
->
data
+
count
->
storageOffset
;
real
*
maxPosition_data
=
maxPosition
->
storage
->
data
+
maxPosition
->
storageOffset
;
int64_t
D
,
d
,
i
,
c
,
tmp
;
D
=
THTensor_
(
nDimension
)(
position
);
int64_t
Dims
=
THTensor_
(
nDimension
)(
position
);
d
=
THTensor_
(
size
)(
position
,
D
-
1
);
int64_t
D
=
THTensor_
(
size
)(
position
,
Dims
-
1
);
TH_TENSOR_DIM_APPLY2
(
int64_t
,
output
,
real
,
position
,
D
-
1
,
tmp
=
C
;
c
=
0
;
TH_TENSOR_DIM_APPLY2
(
int64_t
,
output
,
real
,
position
,
Dims
-
1
,
for
(
i
=
0
;
i
<
d
;
i
++
)
{
int
weight
=
1
;
int64_t
cluster
=
0
;
tmp
=
tmp
/
*
(
count_data
+
i
);
for
(
int
d
=
D
-
1
;
d
>=
0
;
d
--
)
{
c
+=
tmp
*
(
int64_t
)
(
*
(
position_data
+
i
*
position_stride
)
/
*
(
size_data
+
i
));
cluster
+=
weight
*
(
int64_t
)
(
*
(
position_data
+
d
*
position_stride
)
/
*
(
size_data
+
d
));
weight
*=
(
int64_t
)
(
maxPosition_data
[
d
]
/
size_data
[
d
])
+
1
;
}
}
output_data
[
0
]
=
c
;
output_data
[
0
]
=
c
luster
;
)
)
int64_t
C
=
1
;
for
(
int
d
=
1
;
d
<
D
;
d
++
)
{
C
*=
(
int64_t
)
(
maxPosition_data
[
d
]
/
size_data
[
d
])
+
1
;
}
return
C
;
}
}
#endif
#endif
torch_cluster/src/generic/cuda.c
View file @
eace3488
...
@@ -2,8 +2,8 @@
...
@@ -2,8 +2,8 @@
#define THC_GENERIC_FILE "generic/cuda.c"
#define THC_GENERIC_FILE "generic/cuda.c"
#else
#else
void
cluster_
(
grid
)(
int
C
,
THCudaLongTensor
*
output
,
THCTensor
*
position
,
THCTensor
*
size
,
THC
udaLong
Tensor
*
count
)
{
int64_t
cluster_
(
grid
)(
THCudaLongTensor
*
output
,
THCTensor
*
position
,
THCTensor
*
size
,
THCTensor
*
maxPosition
)
{
cluster_kernel_
(
grid
)(
state
,
C
,
output
,
position
,
size
,
count
);
return
cluster_kernel_
(
grid
)(
state
,
output
,
position
,
size
,
maxPosition
);
}
}
#endif
#endif
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment