Unverified commit 80b99adb, authored by Matthias Fey and committed by GitHub

Merge pull request #57 from rusty1s/wheel

[WIP] Python wheels
parents 0194ebb6 bc476876
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t>
__global__ void
radius_kernel(const scalar_t *__restrict__ x, const scalar_t *__restrict__ y,
const int64_t *__restrict__ batch_x,
const int64_t *__restrict__ batch_y, int64_t *__restrict__ row,
int64_t *__restrict__ col, scalar_t radius,
size_t max_num_neighbors, size_t dim) {
const ptrdiff_t batch_idx = blockIdx.x;
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx_x = batch_x[batch_idx];
const ptrdiff_t end_idx_x = batch_x[batch_idx + 1];
const ptrdiff_t start_idx_y = batch_y[batch_idx];
const ptrdiff_t end_idx_y = batch_y[batch_idx + 1];
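// One thread block per batch example; threads stride over that example's
// query points in y, each scanning all of its reference points in x.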
for (ptrdiff_t n_y = start_idx_y + idx; n_y < end_idx_y; n_y += THREADS) {
size_t count = 0;
for (ptrdiff_t n_x = start_idx_x; n_x < end_idx_x; n_x++) {
scalar_t dist = 0;
for (ptrdiff_t d = 0; d < dim; d++) {
dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
dist = sqrt(dist);
if (dist <= radius) {
row[n_y * max_num_neighbors + count] = n_y;
col[n_y * max_num_neighbors + count] = n_x;
count++;
}
if (count >= max_num_neighbors) {
break;
}
}
}
}
at::Tensor radius_cuda(at::Tensor x, at::Tensor y, float radius,
at::Tensor batch_x, at::Tensor batch_y,
size_t max_num_neighbors) {
cudaSetDevice(x.get_device());
  auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
  cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
             cudaMemcpyDeviceToHost);
  auto batch_size = batch_sizes[0] + 1;
  free(batch_sizes); // Avoid leaking the host staging buffer.
batch_x = degree(batch_x, batch_size);
batch_x = at::cat({at::zeros(1, batch_x.options()), batch_x.cumsum(0)}, 0);
batch_y = degree(batch_y, batch_size);
batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
auto row = at::full(y.size(0) * max_num_neighbors, -1, batch_y.options());
auto col = at::full(y.size(0) * max_num_neighbors, -1, batch_y.options());
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "radius_kernel", [&] {
radius_kernel<scalar_t><<<batch_size, THREADS>>>(
x.DATA_PTR<scalar_t>(), y.DATA_PTR<scalar_t>(),
batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(), radius,
max_num_neighbors, x.size(1));
});
auto mask = row != -1;
return at::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
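For reference, a minimal pure-PyTorch sketch of the brute-force search the kernel above performs; radius_reference is a hypothetical helper, not part of the extension:

import torch

def radius_reference(x, y, r, ptr_x, ptr_y, max_num_neighbors=32):
    rows, cols = [], []
    for b in range(ptr_x.numel() - 1):
        xs = x[ptr_x[b]:ptr_x[b + 1]]  # reference points of example b
        ys = y[ptr_y[b]:ptr_y[b + 1]]  # query points of example b
        dist = torch.cdist(ys, xs)     # pairwise distances, [num_y, num_x]
        for i in range(ys.size(0)):
            hits = (dist[i] <= r).nonzero().view(-1)[:max_num_neighbors]
            rows.append(torch.full_like(hits, int(ptr_y[b]) + i))
            cols.append(hits + int(ptr_x[b]))
    return torch.stack([torch.cat(rows), torch.cat(cols)], dim=0)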
#pragma once
#include <ATen/ATen.h>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
__global__ void respond_kernel(int64_t *__restrict__ cluster, int64_t *proposal,
                               int64_t *__restrict__ row,
                               int64_t *__restrict__ col, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] != -2)
    continue; // Only visit red nodes.
bool has_unmatched_neighbor = false;
for (int64_t i = row[u]; i < row[u + 1]; i++) {
auto v = col[i];
if (cluster[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (cluster[v] == -1 && proposal[v] == u) {
      // Match the first blue neighbor v which proposed to u.
cluster[u] = min(u, v);
cluster[v] = min(u, v);
break;
}
}
if (!has_unmatched_neighbor)
cluster[u] = u;
}
}
void respond(at::Tensor cluster, at::Tensor proposal, at::Tensor row,
at::Tensor col) {
respond_kernel<<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(), proposal.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(), cluster.numel());
}
template <typename scalar_t>
__global__ void respond_kernel(int64_t *__restrict__ cluster, int64_t *proposal,
                               int64_t *__restrict__ row,
                               int64_t *__restrict__ col,
                               scalar_t *__restrict__ weight, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] != -2)
    continue; // Only visit red nodes.
bool has_unmatched_neighbor = false;
int64_t v_max = -1;
scalar_t w_max = 0;
for (int64_t i = row[u]; i < row[u + 1]; i++) {
auto v = col[i];
if (cluster[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (cluster[v] == -1 && proposal[v] == u && weight[i] >= w_max) {
      // Find the maximum-weight blue neighbor v which proposed to u.
v_max = v;
w_max = weight[i];
}
}
if (v_max >= 0) {
cluster[u] = min(u, v_max); // Match neighbors.
cluster[v_max] = min(u, v_max);
}
if (!has_unmatched_neighbor)
cluster[u] = u;
}
}
void respond(at::Tensor cluster, at::Tensor proposal, at::Tensor row,
at::Tensor col, at::Tensor weight) {
AT_DISPATCH_ALL_TYPES(weight.scalar_type(), "respond_kernel", [&] {
respond_kernel<scalar_t><<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(), proposal.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(),
weight.DATA_PTR<scalar_t>(), cluster.numel());
});
}
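A hedged CPU reference for the weighted respond step, with the graph in CSR form; respond_reference is a hypothetical name, and cluster values follow the convention above (-2 unmatched red, -1 unmatched blue, >= 0 matched):

import torch

def respond_reference(cluster, proposal, rowptr, col, weight):
    for u in range(cluster.numel()):
        if cluster[u] != -2:  # only red nodes respond
            continue
        has_unmatched, v_max, w_max = False, -1, 0.0
        for i in range(int(rowptr[u]), int(rowptr[u + 1])):
            v = int(col[i])
            if cluster[v] < 0:
                has_unmatched = True
            if cluster[v] == -1 and proposal[v] == u and weight[i] >= w_max:
                v_max, w_max = v, float(weight[i])
        if v_max >= 0:
            cluster[u] = cluster[v_max] = min(u, v_max)  # match the pair
        if not has_unmatched:
            cluster[u] = u  # no candidates left; match with itself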
#include <torch/extension.h>
#define CHECK_CUDA(x) \
  AT_ASSERTM(x.device().is_cuda(), #x " must be a CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor rw_cuda(at::Tensor row, at::Tensor col, at::Tensor start,
size_t walk_length, float p, float q, size_t num_nodes);
at::Tensor rw(at::Tensor row, at::Tensor col, at::Tensor start,
size_t walk_length, float p, float q, size_t num_nodes) {
CHECK_CUDA(row);
CHECK_CUDA(col);
CHECK_CUDA(start);
return rw_cuda(row, col, start, walk_length, p, q, num_nodes);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("rw", &rw, "Random Walk Sampling (CUDA)");
}
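A hedged usage sketch, assuming this binding was built as the torch_cluster.rw_cuda extension module:

import torch
import torch_cluster.rw_cuda  # assumption: compiled as this module

row = torch.tensor([0, 0, 1, 2], device='cuda')   # sorted COO rows
col = torch.tensor([1, 2, 2, 0], device='cuda')
start = torch.tensor([0, 1], device='cuda')
walks = torch_cluster.rw_cuda.rw(row, col, start, 3, 1., 1., 3)
print(walks.size())  # torch.Size([2, 4]): walk_length + 1 nodes per walk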
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
__global__ void uniform_rw_kernel(
const int64_t *__restrict__ row, const int64_t *__restrict__ col,
const int64_t *__restrict__ deg, const int64_t *__restrict__ start,
const float *__restrict__ rand, int64_t *__restrict__ out,
const size_t walk_length, const size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (ptrdiff_t n = index; n < numel; n += stride) {
out[n] = start[n];
for (ptrdiff_t l = 1; l <= walk_length; l++) {
auto i = (l - 1) * numel + n;
auto cur = out[i];
out[l * numel + n] = col[row[cur] + int64_t(rand[i] * deg[cur])];
}
}
}
at::Tensor rw_cuda(at::Tensor row, at::Tensor col, at::Tensor start,
size_t walk_length, float p, float q, size_t num_nodes) {
cudaSetDevice(row.get_device());
auto deg = degree(row, num_nodes);
row = at::cat({at::zeros(1, deg.options()), deg.cumsum(0)}, 0);
auto rand = at::rand({(int64_t)walk_length, start.size(0)},
start.options().dtype(at::kFloat));
auto out =
at::full({(int64_t)walk_length + 1, start.size(0)}, -1, start.options());
uniform_rw_kernel<<<BLOCKS(start.numel()), THREADS>>>(
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(), deg.DATA_PTR<int64_t>(),
start.DATA_PTR<int64_t>(), rand.DATA_PTR<float>(),
out.DATA_PTR<int64_t>(), walk_length, start.numel());
return out.t().contiguous();
}
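The sampling step indexes each CSR neighborhood with a scaled uniform random number. A minimal PyTorch sketch of the same recurrence; walk_reference is a hypothetical name:

import torch

def walk_reference(rowptr, col, deg, start, walk_length):
    out = [start]
    for _ in range(walk_length):
        cur = out[-1]
        rand = torch.rand(cur.size(0))
        # Pick a uniformly random neighbor of each current node.
        out.append(col[rowptr[cur] + (rand * deg[cur].float()).long()])
    return torch.stack(out, dim=1)  # shape [num_walks, walk_length + 1]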
#pragma once
#include <ATen/ATen.h>
std::tuple<at::Tensor, at::Tensor> remove_self_loops(at::Tensor row,
at::Tensor col) {
auto mask = row != col;
return std::make_tuple(row.masked_select(mask), col.masked_select(mask));
}
std::tuple<at::Tensor, at::Tensor, at::Tensor>
remove_self_loops(at::Tensor row, at::Tensor col, at::Tensor weight) {
auto mask = row != col;
return std::make_tuple(row.masked_select(mask), col.masked_select(mask),
weight.masked_select(mask));
}
std::tuple<at::Tensor, at::Tensor> rand(at::Tensor row, at::Tensor col) {
auto perm = at::empty(row.size(0), row.options());
at::randperm_out(perm, row.size(0));
return std::make_tuple(row.index_select(0, perm), col.index_select(0, perm));
}
std::tuple<at::Tensor, at::Tensor> sort_by_row(at::Tensor row, at::Tensor col) {
at::Tensor perm;
std::tie(row, perm) = row.sort();
return std::make_tuple(row, col.index_select(0, perm));
}
std::tuple<at::Tensor, at::Tensor, at::Tensor>
sort_by_row(at::Tensor row, at::Tensor col, at::Tensor weight) {
at::Tensor perm;
std::tie(row, perm) = row.sort();
return std::make_tuple(row, col.index_select(0, perm),
weight.index_select(0, perm));
}
at::Tensor degree(at::Tensor row, int64_t num_nodes) {
auto zero = at::zeros(num_nodes, row.options());
auto one = at::ones(row.size(0), row.options());
return zero.scatter_add_(0, row, one);
}
std::tuple<at::Tensor, at::Tensor> to_csr(at::Tensor row, at::Tensor col,
int64_t num_nodes) {
std::tie(row, col) = sort_by_row(row, col);
row = degree(row, num_nodes).cumsum(0);
row = at::cat({at::zeros(1, row.options()), row}, 0); // Prepend zero.
return std::make_tuple(row, col);
}
std::tuple<at::Tensor, at::Tensor, at::Tensor>
to_csr(at::Tensor row, at::Tensor col, at::Tensor weight, int64_t num_nodes) {
std::tie(row, col, weight) = sort_by_row(row, col, weight);
row = degree(row, num_nodes).cumsum(0);
row = at::cat({at::zeros(1, row.options()), row}, 0); // Prepend zero.
return std::make_tuple(row, col, weight);
}
#!/bin/bash
if [ "${TRAVIS_OS_NAME}" = "linux" ]; then
wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
chmod +x miniconda.sh
./miniconda.sh -b
PATH=/home/travis/miniconda3/bin:${PATH}
fi
if [ "${TRAVIS_OS_NAME}" = "osx" ]; then
wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
chmod +x miniconda.sh
./miniconda.sh -b
PATH=/Users/travis/miniconda3/bin:${PATH}
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ]; then
choco install openssl.light
choco install miniconda3
PATH=/c/tools/miniconda3/Scripts:$PATH
fi
conda update --yes conda
conda create --yes -n test python="${PYTHON_VERSION}"
#!/bin/bash
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cpu" ]; then
export TOOLKIT=cpuonly
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu92" ]; then
export CUDA_SHORT=9.2
export CUDA=9.2.148-1
export UBUNTU_VERSION=ubuntu1604
export CUBLAS=cuda-cublas-dev-9-2
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu100" ]; then
export CUDA_SHORT=10.0
export CUDA=10.0.130-1
export UBUNTU_VERSION=ubuntu1804
export CUBLAS=cuda-cublas-dev-10-0
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu101" ]; then
export CUDA_SHORT=10.1
export CUDA=10.1.105-1
export UBUNTU_VERSION=ubuntu1804
export CUBLAS=libcublas-dev
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cpu" ]; then
export TOOLKIT=cpuonly
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu92" ]; then
export CUDA_SHORT=9.2
export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod2/local_installers2
export CUDA_FILE=cuda_${CUDA_SHORT}.148_win10
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu100" ]; then
export CUDA_SHORT=10.0
export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.130_411.31_win10
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu101" ]; then
export CUDA_SHORT=10.1
export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.105_418.96_win10.exe
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "$IDX" = "cpu" ]; then
export TOOLKIT=""
fi
if [ "${IDX}" = "cpu" ]; then
export FORCE_CPU=1
else
export FORCE_CUDA=1
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" != "cpu" ]; then
INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
wget -nv "http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}"
sudo dpkg -i "${INSTALLER}"
wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
sudo apt-key add 7fa2af80.pub
sudo apt update -qq
sudo apt install -y "cuda-core-${CUDA_SHORT/./-}" "cuda-cudart-dev-${CUDA_SHORT/./-}" "${CUBLAS}" "cuda-cusparse-dev-${CUDA_SHORT/./-}"
sudo apt clean
CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
nvcc --version
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" != "cpu" ]; then
wget -nv "${CUDA_URL}/${CUDA_FILE}"
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v${CUDA_SHORT}
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH
nvcc --version
fi
# Fix CUDA 9.2 on Windows: https://github.com/pytorch/pytorch/issues/6109
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" = "cu92" ]; then
sed -i.bak -e '129,141d' "${CUDA_HOME}/include/crt/host_config.h"
fi
import sys
import os
import os.path as osp
import glob
import shutil
idx = sys.argv[1]
assert idx in ['cpu', 'cu92', 'cu100', 'cu101']
dist_dir = osp.join(osp.dirname(osp.abspath(__file__)), '..', 'dist')
wheels = glob.glob(osp.join('dist', '**', '*.whl'), recursive=True)
for wheel in wheels:
if idx in wheel:
continue
paths = wheel.split(osp.sep)
names = paths[-1].split('-')
name = '-'.join(names[:-4] + ['latest+' + idx] + names[-3:])
shutil.copyfile(wheel, osp.join(*paths[:-1], name))
name = '-'.join(names[:-4] + [names[-4] + '+' + idx] + names[-3:])
os.rename(wheel, osp.join(*paths[:-1], name))
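A worked trace of the renaming logic above; the wheel file name is hypothetical:

names = 'torch_cluster-1.5.0-cp36-cp36m-linux_x86_64.whl'.split('-')
idx = 'cu101'
latest = '-'.join(names[:-4] + ['latest+' + idx] + names[-3:])
pinned = '-'.join(names[:-4] + [names[-4] + '+' + idx] + names[-3:])
assert latest == 'torch_cluster-latest+cu101-cp36-cp36m-linux_x86_64.whl'
assert pinned == 'torch_cluster-1.5.0+cu101-cp36-cp36m-linux_x86_64.whl'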
#!/bin/bash
# Fix "member may not be initialized" error on Windows: https://github.com/pytorch/pytorch/issues/27958
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" != "cpu" ]; then
sed -i.bak -e 's/constexpr/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/script/module.h
sed -i.bak -e 's/constexpr/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/argument_spec.h
sed -i.bak -e 's/return \*(this->value)/return \*((type\*)this->value)/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/pybind11/cast.h
fi
import os
import os.path as osp
import glob

from setuptools import setup, find_packages

import torch
from torch.utils.cpp_extension import BuildExtension
from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME

WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
if os.getenv('FORCE_CUDA', '0') == '1':
    WITH_CUDA = True
if os.getenv('FORCE_CPU', '0') == '1':
    WITH_CUDA = False

BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'


def get_extensions():
    Extension = CppExtension
    define_macros = []
    extra_compile_args = {'cxx': []}

    if WITH_CUDA:
        Extension = CUDAExtension
        define_macros += [('WITH_CUDA', None)]
        nvcc_flags = os.getenv('NVCC_FLAGS', '')
        nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
        nvcc_flags += ['-arch=sm_35', '--expt-relaxed-constexpr']
        extra_compile_args['nvcc'] = nvcc_flags

    extensions_dir = osp.join(osp.dirname(osp.abspath(__file__)), 'csrc')
    main_files = glob.glob(osp.join(extensions_dir, '*.cpp'))

    extensions = []
    for main in main_files:
        name = main.split(os.sep)[-1][:-4]

        sources = [main]

        path = osp.join(extensions_dir, 'cpu', f'{name}_cpu.cpp')
        if osp.exists(path):
            sources += [path]

        path = osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')
        if WITH_CUDA and osp.exists(path):
            sources += [path]

        extension = Extension(
            'torch_cluster._' + name,
            sources,
            include_dirs=[extensions_dir],
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
        extensions += [extension]

    return extensions
install_requires = ['scipy']
setup_requires = ['pytest-runner']
tests_require = ['pytest', 'pytest-cov']

setup(
    name='torch_cluster',
    version='1.5.0',
    author='Matthias Fey',
    author_email='matthias.fey@tu-dortmund.de',
    url='https://github.com/rusty1s/pytorch_cluster',
    description=('PyTorch Extension Library of Optimized Graph Cluster '
                 'Algorithms'),
    keywords=[
        'pytorch',
        'geometric-deep-learning',
        'graph-neural-networks',
        'cluster-algorithms',
    ],
    license='MIT',
    python_requires='>=3.6',
    install_requires=install_requires,
    setup_requires=setup_requires,
    tests_require=tests_require,
    ext_modules=get_extensions() if not BUILD_DOCS else [],
    cmdclass={
        'build_ext': BuildExtension.with_options(no_python_abi_suffix=True)
    },
    packages=find_packages(),
)
import torch

dtypes = [torch.float, torch.double, torch.int, torch.long]
grad_dtypes = [torch.float, torch.double]

devices = [torch.device('cpu')]
if torch.cuda.is_available():
    devices += [torch.device(f'cuda:{torch.cuda.current_device()}')]
def tensor(x, dtype, device):
......
import importlib
import os.path as osp

import torch

__version__ = '1.5.0'
expected_torch_version = (1, 4)
try:
for library in [
'_version', '_grid', '_graclus', '_fps', '_rw', '_sampler',
'_nearest', '_knn', '_radius'
]:
torch.ops.load_library(importlib.machinery.PathFinder().find_spec(
library, [osp.dirname(__file__)]).origin)
except OSError as e:
major, minor = [int(x) for x in torch.__version__.split('.')[:2]]
t_major, t_minor = expected_torch_version
if major != t_major or (major == t_major and minor != t_minor):
raise RuntimeError(
f'Expected PyTorch version {t_major}.{t_minor} but found '
f'version {major}.{minor}.')
raise OSError(e)
if torch.version.cuda is not None: # pragma: no cover
cuda_version = torch.ops.torch_cluster.cuda_version()
if cuda_version == -1:
major = minor = 0
elif cuda_version < 10000:
major, minor = int(str(cuda_version)[0]), int(str(cuda_version)[2])
else:
major, minor = int(str(cuda_version)[0:2]), int(str(cuda_version)[3])
t_major, t_minor = [int(x) for x in torch.version.cuda.split('.')]
if t_major != major or t_minor != minor:
raise RuntimeError(
f'Detected that PyTorch and torch_cluster were compiled with '
f'different CUDA versions. PyTorch has CUDA version '
f'{t_major}.{t_minor} and torch_cluster has CUDA version '
f'{major}.{minor}. Please reinstall the torch_cluster that '
f'matches your PyTorch install.')
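# The integer encoding above maps e.g. 9020 -> CUDA 9.2 and 10010 -> CUDA 10.1.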
from .graclus import graclus_cluster # noqa
from .grid import grid_cluster # noqa
from .fps import fps # noqa
from .nearest import nearest # noqa
from .knn import knn, knn_graph # noqa
from .radius import radius, radius_graph # noqa
from .rw import random_walk # noqa
from .sampler import neighbor_sampler # noqa
__all__ = [
'graclus_cluster',
......
from typing import Optional

import torch


@torch.jit.script
def fps(src: torch.Tensor, batch: Optional[torch.Tensor] = None,
        ratio: float = 0.5, random_start: bool = True) -> torch.Tensor:
    r"""A sampling algorithm from the `"PointNet++: Deep Hierarchical Feature
    Learning on Point Sets in a Metric Space"
    <https://arxiv.org/abs/1706.02413>`_ paper, which iteratively samples the
    most distant point with regard to the rest points.

    Args:
        src (Tensor): Point feature matrix
            :math:`\mathbf{X} \in \mathbb{R}^{N \times F}`.
        batch (LongTensor, optional): Batch vector
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^N`, which assigns each
......
    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import fps

        src = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch = torch.tensor([0, 0, 0, 0])
        index = fps(src, batch, ratio=0.5)
    """
    if batch is not None:
        assert src.size(0) == batch.numel()
        batch_size = int(batch.max()) + 1
        deg = src.new_zeros(batch_size, dtype=torch.long)
        deg.scatter_add_(0, batch, torch.ones_like(batch))
        ptr = deg.new_zeros(batch_size + 1)
        torch.cumsum(deg, 0, out=ptr[1:])
    else:
        ptr = torch.tensor([0, src.size(0)], device=src.device)
    return torch.ops.torch_cluster.fps(src, ptr, ratio, random_start)
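The batch-to-pointer conversion above reappears in knn, nearest, and radius below; a small worked example with hypothetical values:

import torch

batch = torch.tensor([0, 0, 1, 1, 1, 2])   # three examples of sizes 2, 3, 1
deg = torch.zeros(3, dtype=torch.long).scatter_add_(
    0, batch, torch.ones_like(batch))       # tensor([2, 3, 1])
ptr = torch.zeros(4, dtype=torch.long)
torch.cumsum(deg, 0, out=ptr[1:])           # ptr is now tensor([0, 2, 5, 6])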
from typing import Optional

import torch


@torch.jit.script
def graclus_cluster(row: torch.Tensor, col: torch.Tensor,
                    weight: Optional[torch.Tensor] = None,
                    num_nodes: Optional[int] = None) -> torch.Tensor:
    """A greedy clustering algorithm of picking an unmarked vertex and matching
    it with one of its unmarked neighbors (that maximizes its edge weight).
......
    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import graclus_cluster

        row = torch.tensor([0, 1, 1, 2])
        col = torch.tensor([1, 0, 2, 1])
        weight = torch.Tensor([1, 1, 1, 1])
        cluster = graclus_cluster(row, col, weight)
    """
    if num_nodes is None:
        num_nodes = max(int(row.max()), int(col.max())) + 1

    # Remove self-loops.
    mask = row != col
    row, col = row[mask], col[mask]
    if weight is not None:
        weight = weight[mask]

    # Randomly shuffle nodes.
    perm = torch.randperm(row.size(0), dtype=torch.long, device=row.device)
    row, col = row[perm], col[perm]
    if weight is not None:
        weight = weight[perm]  # Keep weights aligned with the shuffled edges.

    # To CSR.
    perm = torch.argsort(row)
    row, col = row[perm], col[perm]
    if weight is not None:
        weight = weight[perm]
    deg = row.new_zeros(num_nodes)
    deg.scatter_add_(0, row, torch.ones_like(row))
    rowptr = row.new_zeros(num_nodes + 1)
    torch.cumsum(deg, 0, out=rowptr[1:])

    return torch.ops.torch_cluster.graclus(rowptr, col, weight)
from typing import Optional

import torch


@torch.jit.script
def grid_cluster(pos: torch.Tensor, size: torch.Tensor,
                 start: Optional[torch.Tensor] = None,
                 end: Optional[torch.Tensor] = None) -> torch.Tensor:
    """A clustering algorithm, which overlays a regular grid of user-defined
    size over a point cloud and clusters all points within a voxel.
......
    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import grid_cluster

        pos = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]])
        size = torch.Tensor([5, 5])
        cluster = grid_cluster(pos, size)
    """
    return torch.ops.torch_cluster.grid(pos, size, start, end)
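A hedged sketch of the voxel indexing behind grid_cluster; the real op returns flat cluster indices, this only shows per-dimension voxel coordinates:

import torch

pos = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]])
size = torch.Tensor([5, 5])
voxel = ((pos - pos.min(dim=0)[0]) / size).long()
print(voxel.tolist())  # [[0, 0], [2, 1], [0, 1], [0, 0], [1, 0]]
# Points [0, 0] and [2, 2] share voxel (0, 0) and land in one cluster.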
from typing import Optional

import torch
import scipy.spatial


def knn(x: torch.Tensor, y: torch.Tensor, k: int,
        batch_x: Optional[torch.Tensor] = None,
        batch_y: Optional[torch.Tensor] = None,
        cosine: bool = False) -> torch.Tensor:
    r"""Finds for each element in :obj:`y` the :obj:`k` nearest points in
    :obj:`x`.
......
    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import knn

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch_x = torch.tensor([0, 0, 0, 0])
        y = torch.Tensor([[-1, 0], [1, 0]])
        batch_y = torch.tensor([0, 0])
        assign_index = knn(x, y, 2, batch_x, batch_y)
    """
    if x.is_cuda:
        if batch_x is not None:
            assert x.size(0) == batch_x.numel()
            batch_size = int(batch_x.max()) + 1
            deg = x.new_zeros(batch_size, dtype=torch.long)
            deg.scatter_add_(0, batch_x, torch.ones_like(batch_x))
            ptr_x = deg.new_zeros(batch_size + 1)
            torch.cumsum(deg, 0, out=ptr_x[1:])
        else:
            ptr_x = torch.tensor([0, x.size(0)], device=x.device)
        if batch_y is not None:
            assert y.size(0) == batch_y.numel()
            batch_size = int(batch_y.max()) + 1
            deg = y.new_zeros(batch_size, dtype=torch.long)
            deg.scatter_add_(0, batch_y, torch.ones_like(batch_y))
            ptr_y = deg.new_zeros(batch_size + 1)
            torch.cumsum(deg, 0, out=ptr_y[1:])
        else:
            ptr_y = torch.tensor([0, y.size(0)], device=y.device)
        return torch.ops.torch_cluster.knn(x, y, ptr_x, ptr_y, k, cosine)
    else:
        if batch_x is None:
            batch_x = x.new_zeros(x.size(0), dtype=torch.long)
        if batch_y is None:
            batch_y = y.new_zeros(y.size(0), dtype=torch.long)

        assert x.dim() == 2 and batch_x.dim() == 1
        assert y.dim() == 2 and batch_y.dim() == 1
        assert x.size(1) == y.size(1)
        assert x.size(0) == batch_x.size(0)
        assert y.size(0) == batch_y.size(0)

        if cosine:
            raise NotImplementedError('`cosine` argument not supported on CPU')

        # Translate and rescale x and y to [0, 1].
        min_xy = min(x.min().item(), y.min().item())
        x, y = x - min_xy, y - min_xy
        max_xy = max(x.max().item(), y.max().item())
        x.div_(max_xy)
        y.div_(max_xy)

        # Concat batch/features to ensure no cross-links between examples.
        x = torch.cat([x, 2 * x.size(1) * batch_x.view(-1, 1).to(x.dtype)], -1)
        y = torch.cat([y, 2 * y.size(1) * batch_y.view(-1, 1).to(y.dtype)], -1)

        tree = scipy.spatial.cKDTree(x.detach().numpy())
        dist, col = tree.query(y.detach().cpu(), k=k,
                               distance_upper_bound=x.size(1))
        dist = torch.from_numpy(dist).to(x.dtype)
        col = torch.from_numpy(col).to(torch.long)
        row = torch.arange(col.size(0), dtype=torch.long)
        row = row.view(-1, 1).repeat(1, k)
        mask = ~torch.isinf(dist).view(-1)
        row, col = row.view(-1)[mask], col.view(-1)[mask]
        return torch.stack([row, col], dim=0)
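# The `2 * x.size(1) * batch` column above keeps examples apart in the
# kd-tree: after rescaling to [0, 1], any intra-example distance is at most
# sqrt(dim) < 2 * dim. A toy check with hypothetical values:
#
#   x = torch.tensor([[0.0], [1.0]]); batch_x = torch.tensor([0, 1])
#   aug = torch.cat([x, 2 * x.size(1) * batch_x.view(-1, 1).float()], -1)
#   aug.tolist() == [[0.0, 0.0], [1.0, 2.0]]  # cross-example distance > 1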
def knn_graph(x: torch.Tensor, k: int, batch: Optional[torch.Tensor] = None,
loop: bool = False, flow: str = 'source_to_target',
cosine: bool = False) -> torch.Tensor:
r"""Computes graph edges to the nearest :obj:`k` points.
Args:
......
    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import knn_graph

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch = torch.tensor([0, 0, 0, 0])
        edge_index = knn_graph(x, k=2, batch=batch, loop=False)
    """
assert flow in ['source_to_target', 'target_to_source']
......
from typing import Optional

import torch
import scipy.cluster


def nearest(x: torch.Tensor, y: torch.Tensor,
            batch_x: Optional[torch.Tensor] = None,
            batch_y: Optional[torch.Tensor] = None) -> torch.Tensor:
    r"""Clusters points in :obj:`x` together which are nearest to a given query
    point in :obj:`y`.
......
            :math:`\mathbf{b} \in {\{ 0, \ldots, B-1\}}^M`, which assigns each
            node to a specific example. (default: :obj:`None`)

    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import nearest

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch_x = torch.tensor([0, 0, 0, 0])
        y = torch.Tensor([[-1, 0], [1, 0]])
        batch_y = torch.tensor([0, 0])
        cluster = nearest(x, y, batch_x, batch_y)
    """
    if x.is_cuda:
        if batch_x is not None:
            assert x.size(0) == batch_x.numel()
            batch_size = int(batch_x.max()) + 1
            deg = x.new_zeros(batch_size, dtype=torch.long)
            deg.scatter_add_(0, batch_x, torch.ones_like(batch_x))
            ptr_x = deg.new_zeros(batch_size + 1)
            torch.cumsum(deg, 0, out=ptr_x[1:])
        else:
            ptr_x = torch.tensor([0, x.size(0)], device=x.device)
        if batch_y is not None:
            assert y.size(0) == batch_y.numel()
            batch_size = int(batch_y.max()) + 1
            deg = y.new_zeros(batch_size, dtype=torch.long)
            deg.scatter_add_(0, batch_y, torch.ones_like(batch_y))
            ptr_y = deg.new_zeros(batch_size + 1)
            torch.cumsum(deg, 0, out=ptr_y[1:])
        else:
            ptr_y = torch.tensor([0, y.size(0)], device=y.device)
        return torch.ops.torch_cluster.nearest(x, y, ptr_x, ptr_y)
    else:
        if batch_x is None:
            batch_x = x.new_zeros(x.size(0), dtype=torch.long)
        if batch_y is None:
            batch_y = y.new_zeros(y.size(0), dtype=torch.long)

        assert x.dim() == 2 and batch_x.dim() == 1
        assert y.dim() == 2 and batch_y.dim() == 1
        assert x.size(1) == y.size(1)
        assert x.size(0) == batch_x.size(0)
        assert y.size(0) == batch_y.size(0)

        # Translate and rescale x and y to [0, 1].
        min_xy = min(x.min().item(), y.min().item())
        x, y = x - min_xy, y - min_xy
        max_xy = max(x.max().item(), y.max().item())
        x.div_(max_xy)
        y.div_(max_xy)

        # Concat batch/features to ensure no cross-links between examples.
        x = torch.cat([x, 2 * x.size(1) * batch_x.view(-1, 1).to(x.dtype)], -1)
        y = torch.cat([y, 2 * y.size(1) * batch_y.view(-1, 1).to(y.dtype)], -1)

        return torch.from_numpy(
            scipy.cluster.vq.vq(x.detach().cpu(),
                                y.detach().cpu())[0]).to(torch.long)
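For the docstring example, the expected assignment can be checked by hand:

import torch

x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
y = torch.Tensor([[-1, 0], [1, 0]])
print(torch.cdist(x, y).argmin(dim=1))  # tensor([0, 0, 1, 1])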
from typing import Optional

import torch
import scipy.spatial


@torch.jit.script
def sample(col: torch.Tensor, count: int) -> torch.Tensor:
    if col.size(0) > count:
        col = col[torch.randperm(col.size(0))][:count]
    return col


def radius(x: torch.Tensor, y: torch.Tensor, r: float,
           batch_x: Optional[torch.Tensor] = None,
           batch_y: Optional[torch.Tensor] = None,
           max_num_neighbors: int = 32) -> torch.Tensor:
    r"""Finds for each element in :obj:`y` all points in :obj:`x` within
    distance :obj:`r`.
......
        max_num_neighbors (int, optional): The maximum number of neighbors to
            return for each element in :obj:`y`. (default: :obj:`32`)

    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import radius

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch_x = torch.tensor([0, 0, 0, 0])
        y = torch.Tensor([[-1, 0], [1, 0]])
        batch_y = torch.tensor([0, 0])
        assign_index = radius(x, y, 1.5, batch_x, batch_y)
    """
    if x.is_cuda:
        if batch_x is not None:
            assert x.size(0) == batch_x.numel()
            batch_size = int(batch_x.max()) + 1
            deg = x.new_zeros(batch_size, dtype=torch.long)
            deg.scatter_add_(0, batch_x, torch.ones_like(batch_x))
            ptr_x = deg.new_zeros(batch_size + 1)
            torch.cumsum(deg, 0, out=ptr_x[1:])
        else:
            ptr_x = torch.tensor([0, x.size(0)], device=x.device)
        if batch_y is not None:
            assert y.size(0) == batch_y.numel()
            batch_size = int(batch_y.max()) + 1
            deg = y.new_zeros(batch_size, dtype=torch.long)
            deg.scatter_add_(0, batch_y, torch.ones_like(batch_y))
            ptr_y = deg.new_zeros(batch_size + 1)
            torch.cumsum(deg, 0, out=ptr_y[1:])
        else:
            ptr_y = torch.tensor([0, y.size(0)], device=y.device)
        return torch.ops.torch_cluster.radius(x, y, ptr_x, ptr_y, r,
                                              max_num_neighbors)
    else:
        if batch_x is None:
            batch_x = x.new_zeros(x.size(0), dtype=torch.long)
        if batch_y is None:
            batch_y = y.new_zeros(y.size(0), dtype=torch.long)

        assert x.dim() == 2 and batch_x.dim() == 1
        assert y.dim() == 2 and batch_y.dim() == 1
        assert x.size(1) == y.size(1)
        assert x.size(0) == batch_x.size(0)
        assert y.size(0) == batch_y.size(0)

        x = torch.cat([x, 2 * r * batch_x.view(-1, 1).to(x.dtype)], dim=-1)
        y = torch.cat([y, 2 * r * batch_y.view(-1, 1).to(y.dtype)], dim=-1)

        tree = scipy.spatial.cKDTree(x.detach().numpy())
        col = tree.query_ball_point(y.detach().numpy(), r)
        col = [sample(torch.tensor(c), max_num_neighbors) for c in col]
        row = [torch.full_like(c, i) for i, c in enumerate(col)]
        row, col = torch.cat(row, dim=0), torch.cat(col, dim=0)
        mask = col < int(tree.n)
        return torch.stack([row[mask], col[mask]], dim=0)
def radius_graph(x: torch.Tensor, r: float,
batch: Optional[torch.Tensor] = None, loop: bool = False,
max_num_neighbors: int = 32,
flow: str = 'source_to_target') -> torch.Tensor:
r"""Computes graph edges to all points within a given distance.
Args:
......
    :rtype: :class:`LongTensor`

    .. code-block:: python

        import torch
        from torch_cluster import radius_graph

        x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
        batch = torch.tensor([0, 0, 0, 0])
        edge_index = radius_graph(x, r=1.5, batch=batch, loop=False)
"""
assert flow in ['source_to_target', 'target_to_source']
......
import warnings
from typing import Optional

import torch


@torch.jit.script
def random_walk(row: torch.Tensor, col: torch.Tensor, start: torch.Tensor,
                walk_length: int, p: float = 1, q: float = 1,
                coalesced: bool = False, num_nodes: Optional[int] = None):
    """Samples random walks of length :obj:`walk_length` from all node indices
    in :obj:`start` in the graph given by :obj:`(row, col)` as described in the
    `"node2vec: Scalable Feature Learning for Networks"
......
    :rtype: :class:`LongTensor`
    """
    if num_nodes is None:
        num_nodes = max(int(row.max()), int(col.max())) + 1

    if coalesced:
        perm = torch.argsort(row * num_nodes + col)
        row, col = row[perm], col[perm]

    deg = row.new_zeros(num_nodes)
    deg.scatter_add_(0, row, torch.ones_like(row))
    rowptr = row.new_zeros(num_nodes + 1)
    torch.cumsum(deg, 0, out=rowptr[1:])

    if p != 1. or q != 1.:  # pragma: no cover
        warnings.warn('Parameters `p` and `q` are not supported yet and will '
                      'be restored to their default values `p=1` and `q=1`.')
        p = q = 1.

    return torch.ops.torch_cluster.random_walk(rowptr, col, start, walk_length,
                                               p, q)
import torch


@torch.jit.script
def neighbor_sampler(start: torch.Tensor, rowptr: torch.Tensor, size: float):
    assert not start.is_cuda

    factor: float = -1.
    count: int = -1
    if size <= 1:
        factor = size
        assert factor > 0
    else:
        count = int(size)

    return torch.ops.torch_cluster.neighbor_sampler(start, rowptr, count,
                                                    factor)
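Here size doubles as a ratio or a count: values at most 1 are treated as a fraction of each neighborhood, larger values as an absolute neighbor count. A hedged usage sketch with a hypothetical graph:

import torch

start = torch.tensor([0, 2])
rowptr = torch.tensor([0, 3, 5, 8])          # CSR offsets of a 3-node graph
half = neighbor_sampler(start, rowptr, 0.5)  # ~50% of each neighborhood
two = neighbor_sampler(start, rowptr, 2.)    # two neighbors per node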