Revert "performance boost, however, not finished yet"

This reverts commit eace3488.

Revert "performance boost, however, not finished yet"
This reverts commit eace3488.
96c3cd41 · rusty1s · 31638af9
Commit 96c3cd41 authored Feb 19, 2018 by rusty1s
11 changed files
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from os import path as osp
 from setuptools import setup, find_packages
-__version__ = '0.1.2'
+__version__ = '0.1.1'
 url = 'https://github.com/rusty1s/pytorch_cluster'
 install_requires = ['cffi', 'torch-unique']

--- a/torch_cluster/__init__.py
+++ b/torch_cluster/__init__.py
 from .functions.grid import grid_cluster
-__version__ = '0.1.2'
+__version__ = '0.1.1'
 __all__ = ['grid_cluster', '__version__']
--- a/torch_cluster/functions/grid.py
+++ b/torch_cluster/functions/grid.py
@@ -22,28 +22,31 @@ def grid_cluster(position, size, batch=None):
        size = torch.cat([size.new(1).fill_(1), size], dim=-1)
    # Translate to minimal positive positions.
-    p_min = position.min(dim=-2, keepdim=True)[0]
+    min = position.min(dim=-2, keepdim=True)[0]
-    position = position - p_min
+    position = position - min
-    # Compute maximal position for each dimension.
+    # Compute cluster count for each dimension.
-    p_max = position.max(dim=0)[0]
+    max = position.max(dim=0)[0]
-    while p_max.dim() > 1:
+    while max.dim() > 1:
-        p_max = p_max.max(dim=0)[0]
+        max = max.max(dim=0)[0]
+    c_max = torch.floor(max.double() / size.double() + 1).long()
+    c_max = torch.clamp(c_max, min=1)
+    C = c_max.prod()
    # Generate cluster tensor.
-    s = list(position.size())[:-1] + [1]
+    s = list(position.size())
-    cluster = size.new(torch.Size(s)).long()
+    s[-1] = 1
+    cluster = c_max.new(torch.Size(s))
    # Fill cluster tensor and reshape.
    size = size.type_as(position)
    func = get_func('grid', position)
-    C = func(cluster, position, size, p_max)
+    func(C, cluster, position, size, c_max)
    cluster = cluster.squeeze(dim=-1)
    cluster, u = consecutive(cluster)
    if batch is None:
        return cluster
    else:
-        print(p_max.tolist(), size.tolist(), C)
+        batch = (u / c_max[1:].prod()).long()
-        batch = (u / C).long()
        return cluster, batch
--- a/torch_cluster/functions/utils.py
+++ b/torch_cluster/functions/utils.py
@@ -11,12 +11,12 @@ def get_func(name, tensor):
    return func
-def get_type(max_value, cuda):
+def get_type(max, cuda):
-    if max_value <= 255:
+    if max <= 255:
        return torch.cuda.ByteTensor if cuda else torch.ByteTensor
-    elif max_value <= 32767:  # pragma: no cover
+    elif max <= 32767:  # pragma: no cover
        return torch.cuda.ShortTensor if cuda else torch.ShortTensor
-    elif max_value <= 2147483647:  # pragma: no cover
+    elif max <= 2147483647:  # pragma: no cover
        return torch.cuda.IntTensor if cuda else torch.IntTensor
    else:  # pragma: no cover
        return torch.cuda.LongTensor if cuda else torch.LongTensor

--- a/torch_cluster/kernel/generic/kernel.cu
+++ b/torch_cluster/kernel/generic/kernel.cu
@@ -2,37 +2,29 @@
 #define THC_GENERIC_FILE "generic/kernel.cu"
 #else
-int64_t cluster_(grid)(THCState *state, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCTensor *maxPosition) {
+void cluster_(grid)(THCState *state, int C, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCudaLongTensor *count) {
-  THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, position, size, maxPosition));
+  THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, position, size));
-  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, output));
+  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 2, output, count));
-  THArgCheck(THCTensor_(nDimension)(state, position) <= MAX_DIMS, 1, "Tensor too large or too many dimensions");
+  THArgCheck(THCudaLongTensor_nDimension(state, output) <= MAX_DIMS, 1, "Tensor too large or too many dimensions");
  int64_t *outputData = THCudaLongTensor_data(state, output);
  TensorInfo<real> positionInfo = thc_(getTensorInfo)(state, position);
  real *sizeData = THCTensor_(data)(state, size);
-  real *maxPositionData = THCTensor_(data)(state, maxPosition);
+  int64_t *countData = THCudaLongTensor_data(state, count);
  const int N = THCudaLongTensor_nElement(state, output);
  int grid = GET_BLOCKS(N);
  cudaStream_t stream = THCState_getCurrentStream(state);
  switch (positionInfo.dims) {
-    case  1: gridKernel<real,  1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N); break;
+    case  1: gridKernel<real,  1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
-    case  2: gridKernel<real,  2><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N); break;
+    case  2: gridKernel<real,  2><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
-    case  3: gridKernel<real,  3><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N); break;
+    case  3: gridKernel<real,  3><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
-    default: gridKernel<real, -1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, maxPositionData, N); break;
+    case  4: gridKernel<real,  4><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
+    default: gridKernel<real, -1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
  }
  THCudaCheck(cudaGetLastError());
-  real C = 1;
-  for (ptrdiff_t d = 1; d < THCTensor_(nElement)(state, size); d++) {
-    C = maxPositionData[d] / sizeData[d];
-    /* printf("%f", maxPositionData[d]); */
-    /* printf("%i", (int)*(maxPositionData)); */
-    /* C *= (int64_t) (*(maxPositionData + d) / *(sizeData + d)) + 1; */
-  }
-  return C;
 }
 #endif
--- a/torch_cluster/kernel/kernel.cu
+++ b/torch_cluster/kernel/kernel.cu
@@ -12,18 +12,15 @@
 #include "THCGenerateAllTypes.h"
 template<typename Real, int Dims>
-__global__ void gridKernel(int64_t *output, TensorInfo<Real> position, Real *size, Real *maxPosition, const int N) {
+__global__ void gridKernel(int64_t *output, TensorInfo<Real> position, Real *size, int64_t *count, const int C, const int N) {
  KERNEL_LOOP(i, N) {
-    int positionOffset = 0;
+    int positionOffset = 0; int tmp = C; int64_t c = 0;
    IndexToOffset<Real, Dims>::compute(i, position, &positionOffset);
+    for (int d = 0; d < position.size[position.dims - 1]; d++) {
-    int D = position.size[position.dims - 1];
+      tmp = tmp / count[d];
-    int weight = 1; int64_t cluster = 0;
+      c += tmp * (int64_t) (position.data[positionOffset + d] / size[d]);
-    for (int d = D - 1; d >= 0; d--) {
-      cluster += weight * (int64_t) (position.data[positionOffset + d] / size[d]);
-      weight *= (int64_t) (maxPosition[d] / size[d]) + 1;
    }
-    output[i] = cluster;
+    output[i] = c;
  }
 }

--- a/torch_cluster/kernel/kernel.h
+++ b/torch_cluster/kernel/kernel.h
@@ -2,13 +2,13 @@
 extern "C" {
 #endif
-int64_t cluster_grid_kernel_Float (THCState *state, THCudaLongTensor *output, THCudaTensor       *position, THCudaTensor       *size, THCudaTensor       *maxPosition);
+void cluster_grid_kernel_Float (THCState *state, int C, THCudaLongTensor *output, THCudaTensor       *position, THCudaTensor       *size, THCudaLongTensor *count);
-int64_t cluster_grid_kernel_Double(THCState *state, THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaDoubleTensor *maxPosition);
+void cluster_grid_kernel_Double(THCState *state, int C, THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaLongTensor *count);
-int64_t cluster_grid_kernel_Byte  (THCState *state, THCudaLongTensor *output, THCudaByteTensor   *position, THCudaByteTensor   *size, THCudaByteTensor   *maxPosition);
+void cluster_grid_kernel_Byte  (THCState *state, int C, THCudaLongTensor *output, THCudaByteTensor   *position, THCudaByteTensor   *size, THCudaLongTensor *count);
-int64_t cluster_grid_kernel_Char  (THCState *state, THCudaLongTensor *output, THCudaCharTensor   *position, THCudaCharTensor   *size, THCudaCharTensor   *maxPosition);
+void cluster_grid_kernel_Char  (THCState *state, int C, THCudaLongTensor *output, THCudaCharTensor   *position, THCudaCharTensor   *size, THCudaLongTensor *count);
-int64_t cluster_grid_kernel_Short (THCState *state, THCudaLongTensor *output, THCudaShortTensor  *position, THCudaShortTensor  *size, THCudaShortTensor  *maxPosition);
+void cluster_grid_kernel_Short (THCState *state, int C, THCudaLongTensor *output, THCudaShortTensor  *position, THCudaShortTensor  *size, THCudaLongTensor *count);
-int64_t cluster_grid_kernel_Int   (THCState *state, THCudaLongTensor *output, THCudaIntTensor    *position, THCudaIntTensor    *size, THCudaIntTensor    *maxPosition);
+void cluster_grid_kernel_Int   (THCState *state, int C, THCudaLongTensor *output, THCudaIntTensor    *position, THCudaIntTensor    *size, THCudaLongTensor *count);
-int64_t cluster_grid_kernel_Long  (THCState *state, THCudaLongTensor *output, THCudaLongTensor   *position, THCudaLongTensor   *size, THCudaLongTensor   *maxPosition);
+void cluster_grid_kernel_Long  (THCState *state, int C, THCudaLongTensor *output, THCudaLongTensor   *position, THCudaLongTensor   *size, THCudaLongTensor *count);
 #ifdef __cplusplus
 }

--- a/torch_cluster/src/cpu.h
+++ b/torch_cluster/src/cpu.h
-int64_t cluster_grid_Float (THLongTensor *output, THFloatTensor  *position, THFloatTensor  *size, THFloatTensor  *maxPosition);
+void cluster_grid_Float (int C, THLongTensor *output, THFloatTensor  *position, THFloatTensor  *size, THLongTensor *count);
-int64_t cluster_grid_Double(THLongTensor *output, THDoubleTensor *position, THDoubleTensor *size, THDoubleTensor *maxPosition);
+void cluster_grid_Double(int C, THLongTensor *output, THDoubleTensor *position, THDoubleTensor *size, THLongTensor *count);
-int64_t cluster_grid_Byte  (THLongTensor *output, THByteTensor   *position, THByteTensor   *size, THByteTensor   *maxPosition);
+void cluster_grid_Byte  (int C, THLongTensor *output, THByteTensor   *position, THByteTensor   *size, THLongTensor *count);
-int64_t cluster_grid_Char  (THLongTensor *output, THCharTensor   *position, THCharTensor   *size, THCharTensor   *maxPosition);
+void cluster_grid_Char  (int C, THLongTensor *output, THCharTensor   *position, THCharTensor   *size, THLongTensor *count);
-int64_t cluster_grid_Short (THLongTensor *output, THShortTensor  *position, THShortTensor  *size, THShortTensor  *maxPosition);
+void cluster_grid_Short (int C, THLongTensor *output, THShortTensor  *position, THShortTensor  *size, THLongTensor *count);
-int64_t cluster_grid_Int   (THLongTensor *output, THIntTensor    *position, THIntTensor    *size, THIntTensor    *maxPosition);
+void cluster_grid_Int   (int C, THLongTensor *output, THIntTensor    *position, THIntTensor    *size, THLongTensor *count);
-int64_t cluster_grid_Long  (THLongTensor *output, THLongTensor   *position, THLongTensor   *size, THLongTensor   *maxPosition);
+void cluster_grid_Long  (int C, THLongTensor *output, THLongTensor   *position, THLongTensor   *size, THLongTensor *count);
--- a/torch_cluster/src/cuda.h
+++ b/torch_cluster/src/cuda.h
-int64_t cluster_grid_cuda_Float (THCudaLongTensor *output, THCudaTensor       *position, THCudaTensor       *size, THCudaTensor       *maxPosition);
+void cluster_grid_cuda_Float (int C, THCudaLongTensor *output, THCudaTensor       *position, THCudaTensor       *size, THCudaLongTensor *count);
-int64_t cluster_grid_cuda_Double(THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaDoubleTensor *maxPosition);
+void cluster_grid_cuda_Double(int C, THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaLongTensor *count);
-int64_t cluster_grid_cuda_Byte  (THCudaLongTensor *output, THCudaByteTensor   *position, THCudaByteTensor   *size, THCudaByteTensor   *maxPosition);
+void cluster_grid_cuda_Byte  (int C, THCudaLongTensor *output, THCudaByteTensor   *position, THCudaByteTensor   *size, THCudaLongTensor *count);
-int64_t cluster_grid_cuda_Char  (THCudaLongTensor *output, THCudaCharTensor   *position, THCudaCharTensor   *size, THCudaCharTensor   *maxPosition);
+void cluster_grid_cuda_Char  (int C, THCudaLongTensor *output, THCudaCharTensor   *position, THCudaCharTensor   *size, THCudaLongTensor *count);
-int64_t cluster_grid_cuda_Short (THCudaLongTensor *output, THCudaShortTensor  *position, THCudaShortTensor  *size, THCudaShortTensor  *maxPosition);
+void cluster_grid_cuda_Short (int C, THCudaLongTensor *output, THCudaShortTensor  *position, THCudaShortTensor  *size, THCudaLongTensor *count);
-int64_t cluster_grid_cuda_Int   (THCudaLongTensor *output, THCudaIntTensor    *position, THCudaIntTensor    *size, THCudaIntTensor    *maxPosition);
+void cluster_grid_cuda_Int   (int C, THCudaLongTensor *output, THCudaIntTensor    *position, THCudaIntTensor    *size, THCudaLongTensor *count);
-int64_t cluster_grid_cuda_Long  (THCudaLongTensor *output, THCudaLongTensor   *position, THCudaLongTensor   *size, THCudaLongTensor   *maxPosition);
+void cluster_grid_cuda_Long  (int C, THCudaLongTensor *output, THCudaLongTensor   *position, THCudaLongTensor   *size, THCudaLongTensor *count);
--- a/torch_cluster/src/generic/cpu.c
+++ b/torch_cluster/src/generic/cpu.c
@@ -2,27 +2,20 @@
 #define TH_GENERIC_FILE "generic/cpu.c"
 #else
-int64_t cluster_(grid)(THLongTensor *output, THTensor *position, THTensor *size, THTensor *maxPosition) {
+void cluster_(grid)(int C, THLongTensor *output, THTensor *position, THTensor *size, THLongTensor *count) {
  real *size_data = size->storage->data + size->storageOffset;
-  real *maxPosition_data = maxPosition->storage->data + maxPosition->storageOffset;
+  int64_t *count_data = count->storage->data + count->storageOffset;
+  int64_t D, d, i, c, tmp;
-  int64_t Dims = THTensor_(nDimension)(position);
+  D = THTensor_(nDimension)(position);
-  int64_t D = THTensor_(size)(position, Dims - 1);
+  d = THTensor_(size)(position, D - 1);
+  TH_TENSOR_DIM_APPLY2(int64_t, output, real, position, D - 1,
-  TH_TENSOR_DIM_APPLY2(int64_t, output, real, position, Dims - 1,
+    tmp = C; c = 0;
-    int weight = 1; int64_t cluster = 0;
+    for (i = 0; i < d; i++) {
-    for (int d = D - 1; d >= 0; d--) {
+      tmp = tmp / *(count_data + i);
-      cluster += weight * (int64_t) (*(position_data + d * position_stride) / *(size_data + d));
+      c += tmp * (int64_t) (*(position_data + i * position_stride) / *(size_data + i));
-      weight *= (int64_t) (maxPosition_data[d] / size_data[d]) + 1;
    }
-    output_data[0] = cluster;
+    output_data[0] = c;
  )
-  int64_t C = 1;
-  for (int d = 1; d < D; d++) {
-    C *= (int64_t) (maxPosition_data[d] / size_data[d]) + 1;
-  }
-  return C;
 }
 #endif
--- a/torch_cluster/src/generic/cuda.c
+++ b/torch_cluster/src/generic/cuda.c
@@ -2,8 +2,8 @@
 #define THC_GENERIC_FILE "generic/cuda.c"
 #else
-int64_t cluster_(grid)(THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCTensor *maxPosition) {
+void cluster_(grid)(int C, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCudaLongTensor *count) {
-  return cluster_kernel_(grid)(state, output, position, size, maxPosition);
+  cluster_kernel_(grid)(state, C, output, position, size, count);
 }
 #endif