"src/diffusers/models/controlnets/controlnet_flax.py" did not exist on "5e96333cb2637fd5fb1fe76b00946555b491fb6d"
Unverified Commit 80b99adb authored by Matthias Fey's avatar Matthias Fey Committed by GitHub
Browse files

Merge pull request #57 from rusty1s/wheel

[WIP] Python wheels
parents 0194ebb6 bc476876
...@@ -3,5 +3,7 @@ source=torch_cluster ...@@ -3,5 +3,7 @@ source=torch_cluster
[report] [report]
exclude_lines = exclude_lines =
pragma: no cover pragma: no cover
cuda torch.jit.script
raise raise
except
is_cuda
language: shell
os:
- linux
- osx
- windows
env:
global:
- CUDA_HOME=/usr/local/cuda
jobs:
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cpu
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
jobs: jobs:
include: exclude: # Exclude *all* macOS CUDA jobs and Windows CUDA 9.2/10.0 jobs.
- os: linux - os: osx
language: python env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
python: 3.7 - os: osx
addons: env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
apt: - os: osx
sources: env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
- ubuntu-toolchain-r-test - os: osx
packages: env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
- gcc-5 - os: osx
- g++-5 env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
env: - os: osx
- CC=gcc-5 env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
- CXX=g++-5 - os: osx
- os: osx env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
language: sh - os: osx
before_cache: env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
- brew cleanup - os: osx
cache: env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
directories: - os: windows
- $HOME/Library/Caches/Homebrew env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
- /usr/local/Homebrew - os: windows
addons: env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
homebrew: - os: windows
packages: python3 env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
before_install: - os: windows
- python3 -m pip install --upgrade virtualenv env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
- virtualenv -p python3 --system-site-packages "$HOME/venv" - os: windows
- source "$HOME/venv/bin/activate" env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
env: - os: windows
- CC=clang env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
- CXX=clang++ - os: windows
env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
install: install:
- pip install numpy - source script/cuda.sh
- pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - source script/conda.sh
- pip install pycodestyle - conda create --yes -n test python="${PYTHON_VERSION}"
- pip install flake8 - source activate test
- pip install codecov - conda install pytorch=${TORCH_VERSION} ${TOOLKIT} -c pytorch --yes
- source script/torch.sh
- pip install flake8 codecov
- python setup.py install
script: script:
- python -c "import torch; print(torch.__version__)"
- pycodestyle .
- flake8 . - flake8 .
- python setup.py install
- python setup.py test - python setup.py test
after_success: after_success:
- python setup.py bdist_wheel --dist-dir=dist/torch-${TORCH_VERSION}
- python script/rename_wheel.py ${IDX}
- codecov - codecov
deploy:
provider: s3
region: eu-central-1
edge: true
access_key_id: ${S3_ACCESS_KEY}
secret_access_key: ${S3_SECRET_ACCESS_KEY}
bucket: pytorch-geometric.com
local_dir: dist/torch-${TORCH_VERSION}
upload_dir: whl/torch-${TORCH_VERSION}
acl: public_read
on:
repo: rusty1s/pytorch_cluster
tags: true
notifications: notifications:
email: false email: false
Copyright (c) 2019 Matthias Fey <matthias.fey@tu-dortmund.de> Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
......
include README.md
include LICENSE include LICENSE
recursive-include cpu *
recursive-include cuda * recursive-exclude test *
recursive-include csrc *
...@@ -27,7 +27,30 @@ All included operations work on varying data types and are implemented both for ...@@ -27,7 +27,30 @@ All included operations work on varying data types and are implemented both for
## Installation ## Installation
Ensure that at least PyTorch 1.1.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*: ### Binaries
We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://s3.eu-central-1.amazonaws.com/pytorch-geometric.com/whl/index.html).
To install from binaries, simply run
```
pip install torch-cluster==latest+${CUDA} -f https://s3.eu-central-1.amazonaws.com/pytorch-geometric.com/whl/torch-1.4.0.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu100` or `cu101` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu100` | `cu101` |
|-------------|-------|--------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ❌ | ✅ |
| **macOS** | ✅ | | | |
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
``` ```
$ python -c "import torch; print(torch.__version__)" $ python -c "import torch; print(torch.__version__)"
...@@ -46,10 +69,16 @@ Then run: ...@@ -46,10 +69,16 @@ Then run:
pip install torch-cluster pip install torch-cluster
``` ```
If you are running into any installation problems, please create an [issue](https://github.com/rusty1s/pytorch_cluster/issues). When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
Be sure to import `torch` first before using this package to resolve symbols the dynamic linker must see. In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
## Graclus ### Graclus
A greedy clustering algorithm of picking an unmarked vertex and matching it with one its unmarked neighbors (that maximizes its edge weight). A greedy clustering algorithm of picking an unmarked vertex and matching it with one its unmarked neighbors (that maximizes its edge weight).
The GPU algorithm is adapted from Fagginger Auer and Bisseling: [A GPU Algorithm for Greedy Graph Matching](http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf) (LNCS 2012) The GPU algorithm is adapted from Fagginger Auer and Bisseling: [A GPU Algorithm for Greedy Graph Matching](http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf) (LNCS 2012)
...@@ -70,7 +99,7 @@ print(cluster) ...@@ -70,7 +99,7 @@ print(cluster)
tensor([0, 0, 1]) tensor([0, 0, 1])
``` ```
## VoxelGrid ### VoxelGrid
A clustering algorithm, which overlays a regular grid of user-defined size over a point cloud and clusters all points within a voxel. A clustering algorithm, which overlays a regular grid of user-defined size over a point cloud and clusters all points within a voxel.
...@@ -89,7 +118,7 @@ print(cluster) ...@@ -89,7 +118,7 @@ print(cluster)
tensor([0, 5, 3, 0, 1]) tensor([0, 5, 3, 0, 1])
``` ```
## FarthestPointSampling ### FarthestPointSampling
A sampling algorithm, which iteratively samples the most distant point with regard to the rest points. A sampling algorithm, which iteratively samples the most distant point with regard to the rest points.
...@@ -107,7 +136,7 @@ print(sample) ...@@ -107,7 +136,7 @@ print(sample)
tensor([0, 3]) tensor([0, 3])
``` ```
## kNN-Graph ### kNN-Graph
Computes graph edges to the nearest *k* points. Computes graph edges to the nearest *k* points.
...@@ -126,7 +155,7 @@ tensor([[1, 2, 0, 3, 0, 3, 1, 2], ...@@ -126,7 +155,7 @@ tensor([[1, 2, 0, 3, 0, 3, 1, 2],
[0, 0, 1, 1, 2, 2, 3, 3]]) [0, 0, 1, 1, 2, 2, 3, 3]])
``` ```
## Radius-Graph ### Radius-Graph
Computes graph edges to all points within a given distance. Computes graph edges to all points within a given distance.
...@@ -145,7 +174,7 @@ tensor([[1, 2, 0, 3, 0, 3, 1, 2], ...@@ -145,7 +174,7 @@ tensor([[1, 2, 0, 3, 0, 3, 1, 2],
[0, 0, 1, 1, 2, 2, 3, 3]]) [0, 0, 1, 1, 2, 2, 3, 3]])
``` ```
## Nearest ### Nearest
Clusters points in *x* together which are nearest to a given query point in *y*. Clusters points in *x* together which are nearest to a given query point in *y*.
...@@ -165,7 +194,7 @@ print(cluster) ...@@ -165,7 +194,7 @@ print(cluster)
tensor([0, 0, 1, 1]) tensor([0, 0, 1, 1])
``` ```
## RandomWalk-Sampling ### RandomWalk-Sampling
Samples random walks of length `walk_length` from all node indices in `start` in the graph given by `(row, col)`. Samples random walks of length `walk_length` from all node indices in `start` in the graph given by `(row, col)`.
......
// Compatibility shim: PyTorch >= 1.3 renamed `Tensor::data<T>()` to
// `Tensor::data_ptr<T>()`. VERSION_GE_1_3 is expected to be defined by the
// build system when compiling against such a version.
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
#include <torch/extension.h>
#include "compat.h"
#include "utils.h"
// Euclidean (L2) distance from every row of `x` to the row at `index`.
at::Tensor get_dist(at::Tensor x, ptrdiff_t index) {
  const auto delta = x - x[index];
  return delta.norm(2, 1);
}
// Farthest point sampling over a batched point set. `x` holds all points of
// all examples concatenated along dim 0; `batch` maps each point to its
// example id. For each example, ceil(deg * ratio) points are selected by
// repeatedly picking the point farthest from the already-selected set.
// Returns the selected indices into `x`, concatenated over examples.
at::Tensor fps(at::Tensor x, at::Tensor batch, float ratio, bool random) {
  // Number of examples = last batch id + 1 (assumes `batch` is sorted
  // ascending — TODO confirm at call sites).
  auto batch_size = batch[-1].DATA_PTR<int64_t>()[0] + 1;
  auto deg = degree(batch, batch_size);
  // cum_deg[b]..cum_deg[b+1] is the point range of example b;
  // cum_k[b]..cum_k[b+1] is its output range.
  auto cum_deg = at::cat({at::zeros(1, deg.options()), deg.cumsum(0)}, 0);
  auto k = (deg.toType(at::kFloat) * ratio).ceil().toType(at::kLong);
  auto cum_k = at::cat({at::zeros(1, k.options()), k.cumsum(0)}, 0);
  auto out = at::empty(cum_k[-1].DATA_PTR<int64_t>()[0], batch.options());

  auto cum_deg_d = cum_deg.DATA_PTR<int64_t>();
  auto k_d = k.DATA_PTR<int64_t>();
  auto cum_k_d = cum_k.DATA_PTR<int64_t>();
  auto out_d = out.DATA_PTR<int64_t>();

  for (ptrdiff_t b = 0; b < batch_size; b++) {
    // Slice out the points of example b.
    auto index = at::range(cum_deg_d[b], cum_deg_d[b + 1] - 1, out.options());
    auto y = x.index_select(0, index);

    ptrdiff_t start = 0;
    if (random) {
      // First entry of a random permutation = uniform random start point.
      start = at::randperm(y.size(0), batch.options()).DATA_PTR<int64_t>()[0];
    }

    out_d[cum_k_d[b]] = cum_deg_d[b] + start;
    auto dist = get_dist(y, start);
    for (ptrdiff_t i = 1; i < k_d[b]; i++) {
      // Greedy step: next sample is the point farthest from the current set.
      ptrdiff_t argmax = dist.argmax().DATA_PTR<int64_t>()[0];
      out_d[cum_k_d[b] + i] = cum_deg_d[b] + argmax;
      // Distance to the selected set is the min over its members.
      dist = at::min(dist, get_dist(y, argmax));
    }
  }

  return out;
}

// Python binding.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("fps", &fps, "Farthest Point Sampling (CPU)");
}
#include <torch/extension.h>
#include "compat.h"
#include "utils.h"
// Greedy graph matching (Graclus) on an unweighted edge list. Self-loops are
// removed, the edge order is shuffled, and the graph is converted to CSR;
// nodes are then visited in random order and each unmatched node is merged
// with its first unmatched neighbor. Returns, per node, its cluster id (the
// smaller id of the matched pair, or the node's own id if left unmatched).
at::Tensor graclus(at::Tensor row, at::Tensor col, int64_t num_nodes) {
  std::tie(row, col) = remove_self_loops(row, col);
  std::tie(row, col) = rand(row, col); // shuffle edges for a random matching
  std::tie(row, col) = to_csr(row, col, num_nodes); // `row` becomes rowptr

  auto row_data = row.DATA_PTR<int64_t>(), col_data = col.DATA_PTR<int64_t>();

  // Random node visiting order de-biases the greedy matching.
  auto perm = at::randperm(num_nodes, row.options());
  auto perm_data = perm.DATA_PTR<int64_t>();

  auto cluster = at::full(num_nodes, -1, row.options());
  auto cluster_data = cluster.DATA_PTR<int64_t>();

  for (int64_t i = 0; i < num_nodes; i++) {
    auto u = perm_data[i];

    if (cluster_data[u] >= 0) // already matched
      continue;

    cluster_data[u] = u; // default: u stays alone
    for (int64_t j = row_data[u]; j < row_data[u + 1]; j++) {
      auto v = col_data[j];

      if (cluster_data[v] >= 0) // neighbor already matched
        continue;

      cluster_data[u] = std::min(u, v);
      cluster_data[v] = std::min(u, v);
      break; // match at most one neighbor
    }
  }

  return cluster;
}

// Weighted variant: each unmatched node is merged with the unmatched
// neighbor of maximal edge weight instead of the first one found.
at::Tensor weighted_graclus(at::Tensor row, at::Tensor col, at::Tensor weight,
                            int64_t num_nodes) {
  std::tie(row, col, weight) = remove_self_loops(row, col, weight);
  std::tie(row, col, weight) = to_csr(row, col, weight, num_nodes);

  auto row_data = row.DATA_PTR<int64_t>(), col_data = col.DATA_PTR<int64_t>();

  auto perm = at::randperm(num_nodes, row.options());
  auto perm_data = perm.DATA_PTR<int64_t>();

  auto cluster = at::full(num_nodes, -1, row.options());
  auto cluster_data = cluster.DATA_PTR<int64_t>();

  AT_DISPATCH_ALL_TYPES(weight.scalar_type(), "weighted_graclus", [&] {
    auto weight_data = weight.DATA_PTR<scalar_t>();
    for (int64_t i = 0; i < num_nodes; i++) {
      auto u = perm_data[i];

      if (cluster_data[u] >= 0)
        continue;

      // NOTE(review): w_max starts at 0, so strictly negative edge weights
      // never win and u then matches with itself — confirm this is intended.
      int64_t v_max = u;
      scalar_t w_max = 0;
      for (int64_t j = row_data[u]; j < row_data[u + 1]; j++) {
        auto v = col_data[j];

        if (cluster_data[v] >= 0)
          continue;

        if (weight_data[j] >= w_max) {
          v_max = v;
          w_max = weight_data[j];
        }
      }

      cluster_data[u] = std::min(u, v_max);
      cluster_data[v_max] = std::min(u, v_max);
    }
  });

  return cluster;
}

// Python bindings.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("graclus", &graclus, "Graclus (CPU)");
  m.def("weighted_graclus", &weighted_graclus, "Weighted Graclus (CPU)");
}
#include <torch/extension.h>
// Voxel-grid clustering: maps each point in `pos` to the flattened index of
// the voxel of edge lengths `size` containing it, relative to the bounding
// box [start, end].
at::Tensor grid(at::Tensor pos, at::Tensor size, at::Tensor start,
                at::Tensor end) {
  pos = pos - start.view({1, -1}); // shift into the box's local frame

  // Voxel count per dimension; cumulative products give the stride of each
  // dimension in the flattened index (a leading 1 is prepended and the last
  // entry is dropped via the index_select below).
  auto num_voxels = ((end - start) / size).toType(at::kLong) + 1;
  num_voxels = num_voxels.cumprod(0);
  num_voxels = at::cat({at::ones(1, num_voxels.options()), num_voxels}, 0);
  auto index = at::empty(size.size(0), num_voxels.options());
  at::arange_out(index, size.size(0));
  num_voxels = num_voxels.index_select(0, index);

  // Per-dimension voxel coordinate, then dot product with the strides.
  auto cluster = (pos / size.view({1, -1})).toType(at::kLong);
  cluster *= num_voxels.view({1, -1});
  cluster = cluster.sum(1);
  return cluster;
}

// Python binding.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("grid", &grid, "Grid (CPU)"); }
#include <torch/extension.h>
#include "compat.h"
#include "utils.h"
// Uniform random walks. One walk of `walk_length` steps is generated per
// entry of `start`; the node2vec parameters `p` and `q` are accepted but
// unused here — every step picks a neighbor uniformly at random.
// Assumes `row` is sorted so the degree prefix sums index `col` correctly —
// TODO confirm at call sites.
// Returns a (num_starts, walk_length + 1) tensor of visited node ids.
at::Tensor rw(at::Tensor row, at::Tensor col, at::Tensor start,
              size_t walk_length, float p, float q, size_t num_nodes) {
  auto deg = degree(row, num_nodes);
  // Prefix sums of degrees = per-node offsets into `col`.
  auto cum_deg = at::cat({at::zeros(1, deg.options()), deg.cumsum(0)}, 0);
  // Pre-draw all step randomness at once: one float in [0, 1) per step.
  auto rand = at::rand({start.size(0), (int64_t)walk_length},
                       start.options().dtype(at::kFloat));
  auto out =
      at::full({start.size(0), (int64_t)walk_length + 1}, -1, start.options());

  auto deg_d = deg.DATA_PTR<int64_t>();
  auto cum_deg_d = cum_deg.DATA_PTR<int64_t>();
  auto col_d = col.DATA_PTR<int64_t>();
  auto start_d = start.DATA_PTR<int64_t>();
  auto rand_d = rand.DATA_PTR<float>();
  auto out_d = out.DATA_PTR<int64_t>();

  for (ptrdiff_t n = 0; n < start.size(0); n++) {
    int64_t cur = start_d[n];
    auto i = n * (walk_length + 1); // row offset into the flat output
    out_d[i] = cur;
    for (ptrdiff_t l = 1; l <= (int64_t)walk_length; l++) {
      // Uniform neighbor choice: scale the pre-drawn float by the degree.
      cur = col_d[cum_deg_d[cur] +
                  int64_t(rand_d[n * walk_length + (l - 1)] * deg_d[cur])];
      out_d[i + l] = cur;
    }
  }

  return out;
}

// Python binding.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("rw", &rw, "Random Walk Sampling (CPU)");
}
#pragma once
#include <torch/extension.h>
// Drop every edge whose endpoints coincide.
std::tuple<at::Tensor, at::Tensor> remove_self_loops(at::Tensor row,
                                                     at::Tensor col) {
  const auto keep = row != col;
  auto kept_row = row.masked_select(keep);
  auto kept_col = col.masked_select(keep);
  return std::make_tuple(kept_row, kept_col);
}

// Weighted overload: the per-edge weight vector is filtered alongside.
std::tuple<at::Tensor, at::Tensor, at::Tensor>
remove_self_loops(at::Tensor row, at::Tensor col, at::Tensor weight) {
  const auto keep = row != col;
  auto kept_row = row.masked_select(keep);
  auto kept_col = col.masked_select(keep);
  auto kept_weight = weight.masked_select(keep);
  return std::make_tuple(kept_row, kept_col, kept_weight);
}
// Shuffle the edge list into a uniformly random order.
std::tuple<at::Tensor, at::Tensor> rand(at::Tensor row, at::Tensor col) {
  const auto order = at::randperm(row.size(0), row.options());
  return std::make_tuple(row.index_select(0, order),
                         col.index_select(0, order));
}
// Sort edges by source node id; `col` is permuted with the same order.
std::tuple<at::Tensor, at::Tensor> sort_by_row(at::Tensor row, at::Tensor col) {
  at::Tensor order;
  std::tie(row, order) = row.sort();
  auto sorted_col = col.index_select(0, order);
  return std::make_tuple(row, sorted_col);
}

// Weighted overload: `weight` is permuted with the same order as `col`.
std::tuple<at::Tensor, at::Tensor, at::Tensor>
sort_by_row(at::Tensor row, at::Tensor col, at::Tensor weight) {
  at::Tensor order;
  std::tie(row, order) = row.sort();
  auto sorted_col = col.index_select(0, order);
  auto sorted_weight = weight.index_select(0, order);
  return std::make_tuple(row, sorted_col, sorted_weight);
}
// Degree of each node: counts the occurrences of every node id in `row`.
at::Tensor degree(at::Tensor row, int64_t num_nodes) {
  auto count = at::zeros(num_nodes, row.options());
  auto increment = at::ones(row.size(0), row.options());
  return count.scatter_add_(0, row, increment);
}
// Convert a COO edge list to CSR: returns (rowptr, col), where rowptr has
// num_nodes + 1 entries (zero prepended to the degree prefix sums).
std::tuple<at::Tensor, at::Tensor> to_csr(at::Tensor row, at::Tensor col,
                                          int64_t num_nodes) {
  std::tie(row, col) = sort_by_row(row, col);
  auto rowptr = degree(row, num_nodes).cumsum(0);
  rowptr = at::cat({at::zeros(1, rowptr.options()), rowptr}, 0);
  return std::make_tuple(rowptr, col);
}

// Weighted overload: `weight` is reordered alongside `col`.
std::tuple<at::Tensor, at::Tensor, at::Tensor>
to_csr(at::Tensor row, at::Tensor col, at::Tensor weight, int64_t num_nodes) {
  std::tie(row, col, weight) = sort_by_row(row, col, weight);
  auto rowptr = degree(row, num_nodes).cumsum(0);
  rowptr = at::cat({at::zeros(1, rowptr.options()), rowptr}, 0);
  return std::make_tuple(rowptr, col, weight);
}
#include "fps_cpu.h"
#include "utils.h"
// L2 distance (over dim 1) from every row of `x` to the row at `idx`.
inline torch::Tensor get_dist(torch::Tensor x, int64_t idx) {
  const auto delta = x - x[idx];
  return delta.norm(2, 1);
}
// Farthest point sampling (CPU). `src` holds all points of all examples
// concatenated along dim 0; `ptr` is the CSR-style offset vector so that
// example b owns points [ptr[b], ptr[b+1]). For each example,
// ceil(deg * ratio) points are selected by repeatedly picking the point
// farthest from the already-selected set. If `random_start`, the first pick
// per example is random; otherwise it is the segment's first point.
// Returns the selected indices into `src`, concatenated over examples.
torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, double ratio,
                      bool random_start) {
  CHECK_CPU(src);
  CHECK_CPU(ptr);
  CHECK_INPUT(ptr.dim() == 1);
  AT_ASSERTM(ratio > 0 && ratio < 1, "Invalid input");

  src = src.view({src.size(0), -1}).contiguous();
  ptr = ptr.contiguous();
  auto batch_size = ptr.size(0) - 1;

  // Per-example point counts and cumulative output offsets.
  auto deg = ptr.narrow(0, 1, batch_size) - ptr.narrow(0, 0, batch_size);
  auto out_ptr = deg.toType(torch::kFloat) * (float)ratio;
  out_ptr = out_ptr.ceil().toType(torch::kLong).cumsum(0);

  auto out = torch::empty(out_ptr[-1].data_ptr<int64_t>()[0], ptr.options());

  auto ptr_data = ptr.data_ptr<int64_t>();
  auto out_ptr_data = out_ptr.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  int64_t src_start = 0, out_start = 0, src_end, out_end;
  for (auto b = 0; b < batch_size; b++) {
    src_end = ptr_data[b + 1], out_end = out_ptr_data[b];
    auto y = src.narrow(0, src_start, src_end - src_start);

    int64_t start_idx = 0;
    if (random_start) {
      // Bug fix: draw the random start from this example's own point range
      // (y.size(0)), not from the total point count (src.size(0)); the
      // latter could produce an index past the end of `y` whenever more
      // than one example is present.
      start_idx = rand() % y.size(0);
    }

    out_data[out_start] = src_start + start_idx;
    auto dist = get_dist(y, start_idx);
    for (auto i = 1; i < out_end - out_start; i++) {
      // Greedy step: next sample is the point farthest from the current set;
      // distance to the set is the running min over its members.
      int64_t argmax = dist.argmax().data_ptr<int64_t>()[0];
      out_data[out_start + i] = src_start + argmax;
      dist = torch::min(dist, get_dist(y, argmax));
    }

    src_start = src_end, out_start = out_end;
  }

  return out;
}
#pragma once

#include <torch/extension.h>

// Farthest point sampling on CPU; see fps_cpu.cpp for semantics.
torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, double ratio,
                      bool random_start);
#include "graclus_cpu.h"
#include "utils.h"
// Greedy graph matching (Graclus) on a CSR graph (CPU). Nodes are visited in
// random order; each unmatched node is merged with an unmatched neighbor —
// the first one found in the unweighted case, the maximum-weight one when
// `optional_weight` is given. Returns, per node, its cluster id (the smaller
// id of the matched pair; every node is at least matched with itself, so no
// -1 entry survives).
torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col,
                          torch::optional<torch::Tensor> optional_weight) {
  CHECK_CPU(rowptr);
  CHECK_CPU(col);
  CHECK_INPUT(rowptr.dim() == 1 && col.dim() == 1);
  if (optional_weight.has_value()) {
    CHECK_CPU(optional_weight.value());
    CHECK_INPUT(optional_weight.value().dim() == 1);
    CHECK_INPUT(optional_weight.value().numel() == col.numel());
  }

  int64_t num_nodes = rowptr.numel() - 1;
  auto out = torch::full(num_nodes, -1, rowptr.options());
  // Random visiting order de-biases the greedy matching.
  auto node_perm = torch::randperm(num_nodes, rowptr.options());

  auto rowptr_data = rowptr.data_ptr<int64_t>();
  auto col_data = col.data_ptr<int64_t>();
  auto node_perm_data = node_perm.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();

  if (!optional_weight.has_value()) {
    for (int64_t n = 0; n < num_nodes; n++) {
      auto u = node_perm_data[n];

      if (out_data[u] >= 0) // already matched
        continue;

      out_data[u] = u; // default: u stays alone
      int64_t row_start = rowptr_data[u], row_end = rowptr_data[u + 1];
      for (auto e = 0; e < row_end - row_start; e++) {
        auto v = col_data[row_start + e];

        if (out_data[v] >= 0) // neighbor already matched
          continue;

        out_data[u] = std::min(u, v);
        out_data[v] = std::min(u, v);
        break; // match at most one neighbor
      }
    }
  } else {
    auto weight = optional_weight.value();
    AT_DISPATCH_ALL_TYPES(weight.scalar_type(), "weighted_graclus", [&] {
      auto weight_data = weight.data_ptr<scalar_t>();
      for (auto n = 0; n < num_nodes; n++) {
        auto u = node_perm_data[n];

        if (out_data[u] >= 0)
          continue;

        // NOTE(review): w_max starts at 0, so strictly negative weights
        // never win and u then matches itself — confirm this is intended.
        auto v_max = u;
        scalar_t w_max = (scalar_t)0.;
        for (auto e = rowptr_data[u]; e < rowptr_data[u + 1]; e++) {
          auto v = col_data[e];

          if (out_data[v] >= 0)
            continue;

          // `>=` keeps the last neighbor among equal-weight candidates.
          if (weight_data[e] >= w_max) {
            v_max = v;
            w_max = weight_data[e];
          }
        }

        out_data[u] = std::min(u, v_max);
        out_data[v_max] = std::min(u, v_max);
      }
    });
  }

  return out;
}
#pragma once

#include <torch/extension.h>

// Greedy (optionally weighted) graph matching on a CSR graph;
// see graclus_cpu.cpp for semantics.
torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col,
                          torch::optional<torch::Tensor> optional_weight);
#include "grid_cpu.h"
#include "utils.h"
torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end) {
CHECK_CPU(pos);
CHECK_CPU(size);
if (optional_start.has_value())
CHECK_CPU(optional_start.value());
if (optional_start.has_value())
CHECK_CPU(optional_start.value());
pos = pos.view({pos.size(0), -1});
CHECK_INPUT(size.numel() == pos.size(1));
if (!optional_start.has_value())
optional_start = std::get<0>(pos.min(0));
else
CHECK_INPUT(optional_start.value().numel() == pos.size(1));
if (!optional_end.has_value())
optional_end = std::get<0>(pos.max(0));
else
CHECK_INPUT(optional_start.value().numel() == pos.size(1));
auto start = optional_start.value();
auto end = optional_end.value();
pos = pos - start.unsqueeze(0);
auto num_voxels = ((end - start) / size).toType(torch::kLong) + 1;
num_voxels = num_voxels.cumprod(0);
num_voxels =
torch::cat({torch::ones(1, num_voxels.options()), num_voxels}, 0);
num_voxels = num_voxels.narrow(0, 0, size.size(0));
auto out = (pos / size.view({1, -1})).toType(torch::kLong);
out *= num_voxels.view({1, -1});
out = out.sum(1);
return out;
}
#pragma once

#include <torch/extension.h>

// Voxel-grid clustering on CPU; see grid_cpu.cpp for semantics.
torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size,
                       torch::optional<torch::Tensor> optional_start,
                       torch::optional<torch::Tensor> optional_end);
#include "rw_cpu.h"
#include "utils.h"
// Uniform random walks on a CSR graph (CPU). One walk of `walk_length` steps
// is generated per entry of `start`. The node2vec parameters `p` and `q` are
// accepted for interface parity but unused here — every step picks a
// neighbor uniformly at random.
// NOTE(review): a node with no neighbors makes row_end == row_start, which
// degenerates the col index to row_start (another node's neighbor) — confirm
// callers guarantee a minimum degree of 1.
// Returns a (num_starts, walk_length + 1) tensor of visited node ids.
torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
                              torch::Tensor start, int64_t walk_length,
                              double p, double q) {
  CHECK_CPU(rowptr);
  CHECK_CPU(col);
  CHECK_CPU(start);

  CHECK_INPUT(rowptr.dim() == 1);
  CHECK_INPUT(col.dim() == 1);
  CHECK_INPUT(start.dim() == 1);

  // Pre-draw all step randomness at once: one float in [0, 1) per step.
  auto rand = torch::rand({start.size(0), walk_length},
                          start.options().dtype(torch::kFloat));
  auto out = torch::full({start.size(0), walk_length + 1}, -1, start.options());

  auto rowptr_data = rowptr.data_ptr<int64_t>();
  auto col_data = col.data_ptr<int64_t>();
  auto start_data = start.data_ptr<int64_t>();
  auto rand_data = rand.data_ptr<float>();
  auto out_data = out.data_ptr<int64_t>();

  for (auto n = 0; n < start.size(0); n++) {
    auto cur = start_data[n];
    auto offset = n * (walk_length + 1); // row offset into the flat output
    out_data[offset] = cur;

    int64_t row_start, row_end;
    for (auto l = 1; l <= walk_length; l++) {
      row_start = rowptr_data[cur], row_end = rowptr_data[cur + 1];
      // Uniform neighbor choice: scale the pre-drawn float by the degree.
      cur = col_data[row_start + int64_t(rand_data[n * walk_length + (l - 1)] *
                                         (row_end - row_start))];
      out_data[offset + l] = cur;
    }
  }

  return out;
}
#pragma once

#include <torch/extension.h>

// Uniform random walks on a CSR graph; `p`/`q` accepted but currently unused.
// See rw_cpu.cpp for semantics.
torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
                              torch::Tensor start, int64_t walk_length,
                              double p, double q);
#include <torch/extension.h> #include "sampler_cpu.h"
#include "compat.h" #include "utils.h"
at::Tensor neighbor_sampler(at::Tensor start, at::Tensor cumdeg, size_t size, torch::Tensor neighbor_sampler_cpu(torch::Tensor start, torch::Tensor rowptr,
float factor) { int64_t count, double factor) {
auto start_ptr = start.DATA_PTR<int64_t>(); auto start_data = start.data_ptr<int64_t>();
auto cumdeg_ptr = cumdeg.DATA_PTR<int64_t>(); auto rowptr_data = rowptr.data_ptr<int64_t>();
std::vector<int64_t> e_ids; std::vector<int64_t> e_ids;
for (ptrdiff_t i = 0; i < start.size(0); i++) { for (auto i = 0; i < start.size(0); i++) {
int64_t low = cumdeg_ptr[start_ptr[i]]; auto row_start = rowptr_data[start_data[i]];
int64_t high = cumdeg_ptr[start_ptr[i] + 1]; auto row_end = rowptr_data[start_data[i] + 1];
size_t num_neighbors = high - low; auto num_neighbors = row_end - row_start;
size_t size_i = size_t(ceil(factor * float(num_neighbors))); int64_t size = count;
size_i = (size_i < size) ? size_i : size; if (count < 1) {
size = int64_t(ceil(factor * float(num_neighbors)));
}
// If the number of neighbors is approximately equal to the number of // If the number of neighbors is approximately equal to the number of
// neighbors which are requested, we use `randperm` to sample without // neighbors which are requested, we use `randperm` to sample without
// replacement, otherwise we sample random numbers into a set as long as // replacement, otherwise we sample random numbers into a set as long
// necessary. // as necessary.
std::unordered_set<int64_t> set; std::unordered_set<int64_t> set;
if (size_i < 0.7 * float(num_neighbors)) { if (size < 0.7 * float(num_neighbors)) {
while (set.size() < size_i) { while (int64_t(set.size()) < size) {
int64_t z = rand() % num_neighbors; int64_t sample = (rand() % num_neighbors) + row_start;
set.insert(z + low); set.insert(sample);
} }
std::vector<int64_t> v(set.begin(), set.end()); std::vector<int64_t> v(set.begin(), set.end());
e_ids.insert(e_ids.end(), v.begin(), v.end()); e_ids.insert(e_ids.end(), v.begin(), v.end());
} else { } else {
auto sample = at::randperm(num_neighbors, start.options()); auto sample = at::randperm(num_neighbors, start.options()) + row_start;
auto sample_ptr = sample.DATA_PTR<int64_t>(); auto sample_data = sample.data_ptr<int64_t>();
for (size_t j = 0; j < size_i; j++) { for (auto j = 0; j < size; j++) {
e_ids.push_back(sample_ptr[j] + low); e_ids.push_back(sample_data[j]);
} }
} }
} }
int64_t len = e_ids.size(); int64_t length = e_ids.size();
auto e_id = torch::from_blob(e_ids.data(), {len}, start.options()).clone(); return torch::from_blob(e_ids.data(), {length}, start.options()).clone();
return e_id;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("neighbor_sampler", &neighbor_sampler, "Neighbor Sampler (CPU)");
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment