"git@developer.sourcefind.cn:modelzoo/solov2-pytorch.git" did not exist on "db9aaac701955f30d52c72a0ced4d648067ed6e8"
Commit 6569c84e authored by rusty1s

CPU port to the new extension API

parent aa80bb88
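In short: the cffi-generated `_ext` bindings are replaced by pybind11-based extensions (`scatter_cpu`, plus `scatter_cuda` when CUDA is available) built through `torch.utils.cpp_extension`, and the bound kernels now take the tensors first and the dimension last, i.e. `func(src, index, out, dim)` instead of `func(dim, out, index, src)`.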
 [report]
 exclude_lines =
     pragma: no cover
-    def backward
+    cuda

 __pycache__/
-_ext/
 build/
 dist/
 .cache/
 .eggs/
 *.egg-info/
 .coverage
+*.so
 *.aux
 *.log
 *.pdf
......
@@ -6,14 +6,15 @@ matrix:
   - python: 2.7
   - python: 3.5
   - python: 3.6
+  - python: 3.7
 install:
-  - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.0-cp27-cp27mu-linux_x86_64.whl; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp27-cp27mu-linux_x86_64.whl; fi
-  - if [[ $TRAVIS_PYTHON_VERSION == 3.5 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.0-cp35-cp35m-linux_x86_64.whl; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == 3.5 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp35-cp35m-linux_x86_64.whl; fi
-  - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.0-cp36-cp36m-linux_x86_64.whl; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1-cp36-cp36m-linux_x86_64.whl; fi
+  - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; then pip install http://download.pytorch.org/whl/cpu/torch-0.4.1.post2-cp37-cp37m-linux_x86_64.whl; fi
   - pip install pycodestyle
   - pip install flake8
   - pip install codecov
-  - pip install cffi
 script:
   - pycodestyle .
   - flake8 .
......
@@ -40,7 +40,7 @@ If not, add cuda (`/usr/local/cuda/bin`) to your `$PATH`.
 Then run:
 
 ```
-pip install cffi torch-scatter
+pip install torch-scatter
 ```
 
 ## Example
......
#pragma once
#include <torch/torch.h>
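// DIM_APPLY3 jointly iterates over three tensors that agree in every
// dimension except possibly DIM: for each slice along DIM it exposes
// TENSORx_data (pointer to the slice), TENSORx_size and TENSORx_stride
// (extent and stride along DIM), runs CODE on that slice, and then advances
// the data pointers odometer-style over all remaining dimensions until the
// whole tensor has been visited.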
#define DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \
  TYPE1 *TENSOR1##_data = TENSOR1.data<TYPE1>(); \
  auto TENSOR1##_size = TENSOR1.size(DIM); \
  auto TENSOR1##_stride = TENSOR1.stride(DIM); \
  \
  TYPE2 *TENSOR2##_data = TENSOR2.data<TYPE2>(); \
  auto TENSOR2##_size = TENSOR2.size(DIM); \
  auto TENSOR2##_stride = TENSOR2.stride(DIM); \
  \
  TYPE3 *TENSOR3##_data = TENSOR3.data<TYPE3>(); \
  auto TENSOR3##_size = TENSOR3.size(DIM); \
  auto TENSOR3##_stride = TENSOR3.stride(DIM); \
  \
  auto dims = TENSOR1.dim(); \
  auto zeros = at::zeros(torch::CPU(at::kLong), {dims}); \
  auto counter = zeros.data<int64_t>(); \
  bool has_finished = false; \
  \
  while (!has_finished) { \
    CODE; \
    if (dims == 1) \
      break; \
    \
    for (int64_t cur_dim = 0; cur_dim < dims; cur_dim++) { \
      if (cur_dim == DIM) { \
        if (cur_dim == dims - 1) { \
          has_finished = true; \
          break; \
        } \
        continue; \
      } \
      \
      counter[cur_dim]++; \
      TENSOR1##_data += TENSOR1.stride(cur_dim); \
      TENSOR2##_data += TENSOR2.stride(cur_dim); \
      TENSOR3##_data += TENSOR3.stride(cur_dim); \
      \
      if (counter[cur_dim] == TENSOR1.size(cur_dim)) { \
        if (cur_dim == dims - 1) { \
          has_finished = true; \
          break; \
        } else { \
          TENSOR1##_data -= counter[cur_dim] * TENSOR1.stride(cur_dim); \
          TENSOR2##_data -= counter[cur_dim] * TENSOR2.stride(cur_dim); \
          TENSOR3##_data -= counter[cur_dim] * TENSOR3.stride(cur_dim); \
          counter[cur_dim] = 0; \
        } \
      } else \
        break; \
    } \
  }
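// DIM_APPLY4 is the same odometer-style traversal extended to a fourth
// tensor; it is used below by the reductions that also track an
// argmax/argmin index.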
#define DIM_APPLY4(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, TYPE4, \
                   TENSOR4, DIM, CODE) \
  TYPE1 *TENSOR1##_data = TENSOR1.data<TYPE1>(); \
  auto TENSOR1##_size = TENSOR1.size(DIM); \
  auto TENSOR1##_stride = TENSOR1.stride(DIM); \
  \
  TYPE2 *TENSOR2##_data = TENSOR2.data<TYPE2>(); \
  auto TENSOR2##_size = TENSOR2.size(DIM); \
  auto TENSOR2##_stride = TENSOR2.stride(DIM); \
  \
  TYPE3 *TENSOR3##_data = TENSOR3.data<TYPE3>(); \
  auto TENSOR3##_size = TENSOR3.size(DIM); \
  auto TENSOR3##_stride = TENSOR3.stride(DIM); \
  \
  TYPE4 *TENSOR4##_data = TENSOR4.data<TYPE4>(); \
  auto TENSOR4##_size = TENSOR4.size(DIM); \
  auto TENSOR4##_stride = TENSOR4.stride(DIM); \
  \
  auto dims = TENSOR1.dim(); \
  auto zeros = at::zeros(torch::CPU(at::kLong), {dims}); \
  auto counter = zeros.data<int64_t>(); \
  bool has_finished = false; \
  \
  while (!has_finished) { \
    CODE; \
    if (dims == 1) \
      break; \
    \
    for (int64_t cur_dim = 0; cur_dim < dims; cur_dim++) { \
      if (cur_dim == DIM) { \
        if (cur_dim == dims - 1) { \
          has_finished = true; \
          break; \
        } \
        continue; \
      } \
      \
      counter[cur_dim]++; \
      TENSOR1##_data += TENSOR1.stride(cur_dim); \
      TENSOR2##_data += TENSOR2.stride(cur_dim); \
      TENSOR3##_data += TENSOR3.stride(cur_dim); \
      TENSOR4##_data += TENSOR4.stride(cur_dim); \
      \
      if (counter[cur_dim] == TENSOR1.size(cur_dim)) { \
        if (cur_dim == dims - 1) { \
          has_finished = true; \
          break; \
        } else { \
          TENSOR1##_data -= counter[cur_dim] * TENSOR1.stride(cur_dim); \
          TENSOR2##_data -= counter[cur_dim] * TENSOR2.stride(cur_dim); \
          TENSOR3##_data -= counter[cur_dim] * TENSOR3.stride(cur_dim); \
          TENSOR4##_data -= counter[cur_dim] * TENSOR4.stride(cur_dim); \
          counter[cur_dim] = 0; \
        } \
      } else \
        break; \
    } \
  }
#include <torch/torch.h>
#include "dim_apply.h"
void scatter_mul(at::Tensor src, at::Tensor index, at::Tensor out,
                 int64_t dim) {
  int64_t elems_per_row = index.size(dim), i, idx;
  AT_DISPATCH_ALL_TYPES(src.type(), "scatter_mul", [&] {
    DIM_APPLY3(scalar_t, src, int64_t, index, scalar_t, out, dim, {
      for (i = 0; i < elems_per_row; i++) {
        idx = index_data[i * index_stride];
        out_data[idx * out_stride] *= src_data[i * src_stride];
      }
    })
  });
}

void scatter_div(at::Tensor src, at::Tensor index, at::Tensor out,
                 int64_t dim) {
  int64_t elems_per_row = index.size(dim), i, idx;
  AT_DISPATCH_ALL_TYPES(src.type(), "scatter_div", [&] {
    DIM_APPLY3(scalar_t, src, int64_t, index, scalar_t, out, dim, {
      for (i = 0; i < elems_per_row; i++) {
        idx = index_data[i * index_stride];
        out_data[idx * out_stride] /= src_data[i * src_stride];
      }
    })
  });
}

void scatter_max(at::Tensor src, at::Tensor index, at::Tensor out,
                 at::Tensor arg, int64_t dim) {
  int64_t elems_per_row = index.size(dim), i, idx;
  AT_DISPATCH_ALL_TYPES(src.type(), "scatter_max", [&] {
    DIM_APPLY4(scalar_t, src, int64_t, index, scalar_t, out, int64_t, arg, dim, {
      for (i = 0; i < elems_per_row; i++) {
        idx = index_data[i * index_stride];
        if (src_data[i * src_stride] >= out_data[idx * out_stride]) {
          out_data[idx * out_stride] = src_data[i * src_stride];
          arg_data[idx * arg_stride] = i;
        }
      }
    })
  });
}

void scatter_min(at::Tensor src, at::Tensor index, at::Tensor out,
                 at::Tensor arg, int64_t dim) {
  int64_t elems_per_row = index.size(dim), i, idx;
  AT_DISPATCH_ALL_TYPES(src.type(), "scatter_min", [&] {
    DIM_APPLY4(scalar_t, src, int64_t, index, scalar_t, out, int64_t, arg, dim, {
      for (i = 0; i < elems_per_row; i++) {
        idx = index_data[i * index_stride];
        if (src_data[i * src_stride] <= out_data[idx * out_stride]) {
          out_data[idx * out_stride] = src_data[i * src_stride];
          arg_data[idx * arg_stride] = i;
        }
      }
    })
  });
}

void index_backward(at::Tensor grad, at::Tensor index, at::Tensor arg,
                    at::Tensor out, int64_t dim) {
  int64_t elems_per_row = index.size(dim), i, idx;
  AT_DISPATCH_ALL_TYPES(grad.type(), "index_backward", [&] {
    DIM_APPLY4(scalar_t, grad, int64_t, index, int64_t, arg, scalar_t, out, dim, {
      for (i = 0; i < elems_per_row; i++) {
        idx = index_data[i * index_stride];
        if (arg_data[idx * arg_stride] == i) {
          out_data[i * out_stride] = grad_data[idx * grad_stride];
        }
      }
    })
  });
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("scatter_mul", &scatter_mul, "Scatter Mul (CPU)");
  m.def("scatter_div", &scatter_div, "Scatter Div (CPU)");
  m.def("scatter_max", &scatter_max, "Scatter Max (CPU)");
  m.def("scatter_min", &scatter_min, "Scatter Min (CPU)");
  m.def("index_backward", &index_backward, "Index Backward (CPU)");
}
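For illustration, a rough sketch of calling the compiled `scatter_cpu` module directly from Python; the tensors and values are made up for this example and are not part of the commit, and the module must first have been built by the `setup.py` shown below:

```python
import torch
import scatter_cpu  # the CppExtension defined in setup.py

# Toy data: scatter-multiply three source values into two output slots along dim 1.
src = torch.tensor([[2.0, 1.0, 3.0]])
index = torch.tensor([[0, 1, 0]])
out = torch.ones(1, 2)

# New calling convention: tensors first, dimension last; `out` is updated in place.
scatter_cpu.scatter_mul(src, index, out, 1)
print(out)  # tensor([[6., 1.]]) -- 2 * 3 collected into slot 0, 1 into slot 1
```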
-from os import path as osp
+import glob
 from setuptools import setup, find_packages
+import torch.cuda
+from torch.utils.cpp_extension import CppExtension, CUDAExtension
+
+ext_modules = [CppExtension('scatter_cpu', ['cpu/scatter.cpp'])]
+cmdclass = {'build_ext': torch.utils.cpp_extension.BuildExtension}
+if torch.cuda.is_available():
+    ext_modules += [
+        CUDAExtension('scatter_cuda',
+                      ['cuda/scatter.cpp'] + glob.glob('cuda/*.cu'))
+    ]
-__version__ = '1.0.3'
+__version__ = '1.0.4'
 url = 'https://github.com/rusty1s/pytorch_scatter'
-install_requires = ['cffi']
+install_requires = []
 setup_requires = ['pytest-runner', 'cffi']
 tests_require = ['pytest', 'pytest-cov']
@@ -21,7 +32,7 @@ setup(
     install_requires=install_requires,
     setup_requires=setup_requires,
     tests_require=tests_require,
-    packages=find_packages(exclude=['build']),
-    ext_package='',
-    cffi_modules=[osp.join(osp.dirname(__file__), 'build.py:ffi')],
+    ext_modules=ext_modules,
+    cmdclass=cmdclass,
+    packages=find_packages(),
 )
@@ -5,8 +5,9 @@ import torch
 from torch.autograd import gradcheck
 
 import torch_scatter
 
-from .utils import dtypes, devices, tensor
+from .utils import devices, tensor
 
+dtypes = [torch.float, torch.double]
 funcs = ['add', 'sub', 'mul', 'div', 'mean']
 indices = [2, 0, 1, 1, 0]
......
@@ -3,9 +3,6 @@ from torch.testing import get_all_dtypes
 dtypes = get_all_dtypes()
 dtypes.remove(torch.half)
-dtypes.remove(torch.short)  # TODO: PyTorch `atomicAdd` bug with short type.
-dtypes.remove(torch.uint8)  # We cannot properly test unsigned values.
-dtypes.remove(torch.int8)  # Overflow on gradient computations :(
 devices = [torch.device('cpu')]
 if torch.cuda.is_available():  # pragma: no cover
......
@@ -6,7 +6,7 @@ from .mean import scatter_mean
 from .max import scatter_max
 from .min import scatter_min
-__version__ = '1.0.3'
+__version__ = '1.0.4'
 __all__ = [
     'scatter_add', 'scatter_sub', 'scatter_mul', 'scatter_div', 'scatter_mean',
......
 from torch.autograd import Function
-from .utils.ffi import get_func
+from .utils.ext import get_func
 from .utils.gen import gen
@@ -8,7 +8,7 @@ class ScatterDiv(Function):
     @staticmethod
     def forward(ctx, out, src, index, dim):
         func = get_func('scatter_div', src)
-        func(dim, out, index, src)
+        func(src, index, out, dim)
         ctx.mark_dirty(out)
         ctx.save_for_backward(out, src, index)
......
 from torch.autograd import Function
-from .utils.ffi import get_func
+from .utils.ext import get_func
 from .utils.gen import gen
@@ -9,7 +9,7 @@ class ScatterMax(Function):
     def forward(ctx, out, src, index, dim):
         arg = index.new_full(out.size(), -1)
         func = get_func('scatter_max', src)
-        func(dim, out, index, src, arg)
+        func(src, index, out, arg, dim)
         ctx.mark_dirty(out)
         ctx.dim = dim
@@ -25,7 +25,7 @@ class ScatterMax(Function):
         if ctx.needs_input_grad[1]:
             grad_src = grad_out.new_zeros(index.size())
             func = get_func('index_backward', grad_out)
-            func(ctx.dim, grad_src, index, grad_out, arg)
+            func(grad_out, index, arg, grad_src, ctx.dim)
         return None, grad_src, None, None
......
 from torch.autograd import Function
-from .utils.ffi import get_func
+from .utils.ext import get_func
 from .utils.gen import gen
@@ -9,7 +9,7 @@ class ScatterMin(Function):
     def forward(ctx, out, src, index, dim):
         arg = index.new_full(out.size(), -1)
         func = get_func('scatter_min', src)
-        func(dim, out, index, src, arg)
+        func(src, index, out, arg, dim)
         ctx.mark_dirty(out)
         ctx.dim = dim
@@ -25,7 +25,7 @@ class ScatterMin(Function):
         if ctx.needs_input_grad[1]:
             grad_src = grad_out.new_zeros(index.size())
             func = get_func('index_backward', grad_out)
-            func(ctx.dim, grad_src, index, grad_out, arg)
+            func(grad_out, index, arg, grad_src, ctx.dim)
         return None, grad_src, None, None
......
 from torch.autograd import Function
-from .utils.ffi import get_func
+from .utils.ext import get_func
 from .utils.gen import gen
@@ -8,7 +8,7 @@ class ScatterMul(Function):
     @staticmethod
     def forward(ctx, out, src, index, dim):
         func = get_func('scatter_mul', src)
-        func(dim, out, index, src)
+        func(src, index, out, dim)
         ctx.mark_dirty(out)
         ctx.save_for_backward(out, src, index)
......
import torch

import scatter_cpu

if torch.cuda.is_available():
    import scatter_cuda


def get_func(name, tensor):
    scatter = scatter_cuda if tensor.is_cuda else scatter_cpu
    return getattr(scatter, name)
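For reference, a minimal sketch of how this dispatcher is used, mirroring the `ScatterMax.forward` call shown above; the shapes and values are illustrative only:

```python
import torch
from torch_scatter.utils.ext import get_func

src = torch.tensor([[2.0, 1.0, 3.0]])
index = torch.tensor([[0, 1, 0]])
out = torch.zeros(1, 2)
arg = index.new_full(out.size(), -1)  # argmax positions, as in ScatterMax.forward

func = get_func('scatter_max', src)  # resolves to scatter_cuda.scatter_max for CUDA tensors
func(src, index, out, arg, 1)        # in place: out -> [[3., 1.]], arg -> [[2, 1]]
```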