Unverified Commit 80b99adb authored by Matthias Fey, committed by GitHub

Merge pull request #57 from rusty1s/wheel

[WIP] Python wheels
parents 0194ebb6 bc476876
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include "cuda/knn_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__knn(void) { return NULL; }
#endif
torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, int64_t k, bool cosine) {
if (x.device().is_cuda()) {
#ifdef WITH_CUDA
return knn_cuda(x, y, ptr_x, ptr_y, k, cosine);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
AT_ERROR("No CPU version supported");
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::knn", &knn);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include "cuda/nearest_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__nearest(void) { return NULL; }
#endif
torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y) {
if (x.device().is_cuda()) {
#ifdef WITH_CUDA
return nearest_cuda(x, y, ptr_x, ptr_y);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
AT_ERROR("No CPU version supported");
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::nearest", &nearest);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include "cuda/radius_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__radius(void) { return NULL; }
#endif
torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, double r, int64_t max_num_neighbors) {
if (x.device().is_cuda()) {
#ifdef WITH_CUDA
return radius_cuda(x, y, ptr_x, ptr_y, r, max_num_neighbors);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
AT_ERROR("No CPU version supported");
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::radius", &radius);
#include <Python.h>
#include <torch/script.h>
#include "cpu/rw_cpu.h"
#ifdef WITH_CUDA
#include "cuda/rw_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__rw(void) { return NULL; }
#endif
torch::Tensor random_walk(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor start, int64_t walk_length, double p,
double q) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_CUDA
return random_walk_cuda(rowptr, col, start, walk_length, p, q);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return random_walk_cpu(rowptr, col, start, walk_length, p, q);
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::random_walk", &random_walk);
#include <Python.h>
#include <torch/script.h>
#include "cpu/sampler_cpu.h"
#ifdef _WIN32
PyMODINIT_FUNC PyInit__sampler(void) { return NULL; }
#endif
torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return neighbor_sampler_cpu(start, rowptr, count, factor);
}
}
static auto registry = torch::RegisterOperators().op(
"torch_cluster::neighbor_sampler", &neighbor_sampler);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__version(void) { return NULL; }
#endif
int64_t cuda_version() {
#ifdef WITH_CUDA
return CUDA_VERSION;
#else
return -1;
#endif
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::cuda_version", &cuda_version);
#pragma once
#include <ATen/ATen.h>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
#define BLUE_PROB 0.53406
__device__ int64_t done;
__global__ void init_done_kernel() { done = 1; }
__global__ void colorize_kernel(int64_t *cluster, float *__restrict__ bernoulli,
size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] < 0) {
// Color the unmatched node: blue (-1) with probability BLUE_PROB, red (-2) otherwise.
cluster[u] = (int64_t)bernoulli[u] - 2;
done = 0;
}
}
}
int64_t colorize(at::Tensor cluster) {
init_done_kernel<<<1, 1>>>();
auto numel = cluster.size(0);
auto props = at::full(numel, BLUE_PROB, cluster.options().dtype(at::kFloat));
auto bernoulli = props.bernoulli();
colorize_kernel<<<BLOCKS(numel), THREADS>>>(
cluster.DATA_PTR<int64_t>(), bernoulli.DATA_PTR<float>(), numel);
int64_t out;
cudaMemcpyFromSymbol(&out, done, sizeof(out), 0, cudaMemcpyDeviceToHost);
return out;
}
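// colorize() re-colors every node that is still unmatched (cluster[u] < 0) and
// returns the device flag `done`, which stays 1 only if no unmatched node was
// left. Callers therefore loop until it returns non-zero, as graclus_cuda does
// later in this diff:
//
//   while (!colorize(cluster)) {
//     propose(cluster, proposal, row, col);
//     respond(cluster, proposal, row, col);
//   }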
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
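// Compatibility shim: PyTorch >= 1.3 renamed Tensor::data<T>() to
// Tensor::data_ptr<T>(), so `tensor.DATA_PTR<float>()` expands to whichever
// accessor the installed version provides (VERSION_GE_1_3 is assumed to be
// defined by the build script). Usage as seen throughout these kernels:
//
//   auto ptr = cluster.DATA_PTR<int64_t>();  // data_ptr<int64_t>() on >= 1.3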
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor fps_cuda(at::Tensor x, at::Tensor batch, float ratio, bool random);
at::Tensor fps(at::Tensor x, at::Tensor batch, float ratio, bool random) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(batch);
return fps_cuda(x, batch, ratio, random);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("fps", &fps, "Farthest Point Sampling (CUDA)");
}
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t, int64_t Dim> struct Dist;
template <typename scalar_t> struct Dist<scalar_t, 1> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
scalar_t d = x[old] - x[n];
dist[n] = min(dist[n], d * d);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t> struct Dist<scalar_t, 2> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
scalar_t a = x[2 * old + 0] - x[2 * n + 0];
scalar_t b = x[2 * old + 1] - x[2 * n + 1];
scalar_t d = a * a + b * b;
dist[n] = min(dist[n], d);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t> struct Dist<scalar_t, 3> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
scalar_t a = x[3 * old + 0] - x[3 * n + 0];
scalar_t b = x[3 * old + 1] - x[3 * n + 1];
scalar_t c = x[3 * old + 2] - x[3 * n + 2];
scalar_t d = a * a + b * b + c * c;
dist[n] = min(dist[n], d);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t> struct Dist<scalar_t, -1> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
tmp_dist[n] = 0;
}
__syncthreads();
for (ptrdiff_t i = start_idx * dim + idx; i < end_idx * dim; i += THREADS) {
scalar_t d = x[(old * dim) + (i % dim)] - x[i];
atomicAdd(&tmp_dist[i / dim], d * d);
}
__syncthreads();
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
dist[n] = min(dist[n], tmp_dist[n]);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t, int64_t Dim>
__global__ void
fps_kernel(const scalar_t *__restrict__ x, const int64_t *__restrict__ cum_deg,
const int64_t *__restrict__ cum_k, const int64_t *__restrict__ start,
scalar_t *__restrict__ dist, scalar_t *__restrict__ tmp_dist,
int64_t *__restrict__ out, size_t dim) {
const ptrdiff_t batch_idx = blockIdx.x;
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx = cum_deg[batch_idx];
const ptrdiff_t end_idx = cum_deg[batch_idx + 1];
__shared__ scalar_t best_dist[THREADS];
__shared__ int64_t best_dist_idx[THREADS];
if (idx == 0) {
out[cum_k[batch_idx]] = start_idx + start[batch_idx];
}
for (ptrdiff_t m = cum_k[batch_idx] + 1; m < cum_k[batch_idx + 1]; m++) {
scalar_t best = -1;
ptrdiff_t best_idx = 0;
__syncthreads();
Dist<scalar_t, Dim>::compute(idx, start_idx, end_idx, out[m - 1], &best,
&best_idx, x, dist, tmp_dist, dim);
best_dist[idx] = best;
best_dist_idx[idx] = best_idx;
for (int64_t u = 0; (1 << u) < THREADS; u++) {
__syncthreads();
if (idx < (THREADS >> (u + 1))) {
int64_t idx_1 = (idx * 2) << u;
int64_t idx_2 = (idx * 2 + 1) << u;
if (best_dist[idx_1] < best_dist[idx_2]) {
best_dist[idx_1] = best_dist[idx_2];
best_dist_idx[idx_1] = best_dist_idx[idx_2];
}
}
}
__syncthreads();
if (idx == 0) {
out[m] = best_dist_idx[0];
}
}
}
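// The loop over `u` above is a block-wide argmax reduction in shared memory:
// after the thread-local scan, THREADS candidate (distance, index) pairs are
// halved log2(THREADS) times until best_dist[0] / best_dist_idx[0] hold the
// farthest point of this round. A hedged sketch of the first halving step,
// assuming THREADS = 8 and u = 0: threads 0..3 compare the pairs (0,1), (2,3),
// (4,5), (6,7) and keep the larger distance in the left slot.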
#define FPS_KERNEL(DIM, ...) \
[&] { \
switch (DIM) { \
case 1: \
fps_kernel<scalar_t, 1><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
break; \
case 2: \
fps_kernel<scalar_t, 2><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
break; \
case 3: \
fps_kernel<scalar_t, 3><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
break; \
default: \
fps_kernel<scalar_t, -1><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
} \
}()
at::Tensor fps_cuda(at::Tensor x, at::Tensor batch, float ratio, bool random) {
cudaSetDevice(x.get_device());
auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(batch_sizes, batch[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto batch_size = batch_sizes[0] + 1;
free(batch_sizes); // Host staging buffer is no longer needed.
auto deg = degree(batch, batch_size);
auto cum_deg = at::cat({at::zeros(1, deg.options()), deg.cumsum(0)}, 0);
auto k = (deg.toType(at::kFloat) * ratio).ceil().toType(at::kLong);
auto cum_k = at::cat({at::zeros(1, k.options()), k.cumsum(0)}, 0);
at::Tensor start;
if (random) {
start = at::rand(batch_size, x.options());
start = (start * deg.toType(at::kFloat)).toType(at::kLong);
} else {
start = at::zeros(batch_size, k.options());
}
auto dist = at::full(x.size(0), 1e38, x.options());
auto tmp_dist = at::empty(x.size(0), x.options());
auto k_sum = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(k_sum, cum_k[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto out = at::empty(k_sum[0], k.options());
free(k_sum); // Host staging buffer is no longer needed.
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "fps_kernel", [&] {
FPS_KERNEL(x.size(1), x.DATA_PTR<scalar_t>(), cum_deg.DATA_PTR<int64_t>(),
cum_k.DATA_PTR<int64_t>(), start.DATA_PTR<int64_t>(),
dist.DATA_PTR<scalar_t>(), tmp_dist.DATA_PTR<scalar_t>(),
out.DATA_PTR<int64_t>());
});
return out;
}
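// Shape bookkeeping in fps_cuda above, as a small worked example (illustrative
// only): for two examples with deg = [4, 6] and ratio = 0.5,
//
//   cum_deg = [0, 4, 10]          // row ranges per example in x
//   k       = ceil(deg * ratio)   // = [2, 3] sampled points per example
//   cum_k   = [0, 2, 5]           // output ranges per example
//
// so `out` holds 5 point indices and each CUDA block handles one example.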
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
at::Tensor graclus_cuda(at::Tensor row, at::Tensor col, int64_t num_nodes);
at::Tensor weighted_graclus_cuda(at::Tensor row, at::Tensor col,
at::Tensor weight, int64_t num_nodes);
at::Tensor graclus(at::Tensor row, at::Tensor col, int64_t num_nodes) {
CHECK_CUDA(row);
CHECK_CUDA(col);
return graclus_cuda(row, col, num_nodes);
}
at::Tensor weighted_graclus(at::Tensor row, at::Tensor col, at::Tensor weight,
int64_t num_nodes) {
CHECK_CUDA(row);
CHECK_CUDA(col);
CHECK_CUDA(weight);
return weighted_graclus_cuda(row, col, weight, num_nodes);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("graclus", &graclus, "Graclus (CUDA)");
m.def("weighted_graclus", &weighted_graclus, "Weighted Graclus (CUDA)");
}
#include <ATen/ATen.h>
#include "coloring.cuh"
#include "proposal.cuh"
#include "response.cuh"
#include "utils.cuh"
at::Tensor graclus_cuda(at::Tensor row, at::Tensor col, int64_t num_nodes) {
cudaSetDevice(row.get_device());
std::tie(row, col) = remove_self_loops(row, col);
std::tie(row, col) = rand(row, col);
std::tie(row, col) = to_csr(row, col, num_nodes);
auto cluster = at::full(num_nodes, -1, row.options());
auto proposal = at::full(num_nodes, -1, row.options());
while (!colorize(cluster)) {
propose(cluster, proposal, row, col);
respond(cluster, proposal, row, col);
}
return cluster;
}
at::Tensor weighted_graclus_cuda(at::Tensor row, at::Tensor col,
at::Tensor weight, int64_t num_nodes) {
cudaSetDevice(row.get_device());
std::tie(row, col, weight) = remove_self_loops(row, col, weight);
std::tie(row, col, weight) = to_csr(row, col, weight, num_nodes);
auto cluster = at::full(num_nodes, -1, row.options());
auto proposal = at::full(num_nodes, -1, row.options());
while (!colorize(cluster)) {
propose(cluster, proposal, row, col, weight);
respond(cluster, proposal, row, col, weight);
}
return cluster;
}
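// Overall scheme of graclus_cuda / weighted_graclus_cuda (a summary added for
// readability, not part of the original sources): the edge list is cleaned up
// and converted to CSR, then a parallel "blue proposes, red responds" matching
// is run. Unmatched nodes are randomly colored blue (-1) or red (-2), every
// blue node proposes to a red neighbor (the heaviest one in the weighted
// variant), and the response step (respond.cuh, not shown here) pairs them up
// into clusters with non-negative ids. The loop repeats until colorize() finds
// no unmatched node.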
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
at::Tensor grid_cuda(at::Tensor pos, at::Tensor size, at::Tensor start,
at::Tensor end);
at::Tensor grid(at::Tensor pos, at::Tensor size, at::Tensor start,
at::Tensor end) {
CHECK_CUDA(pos);
CHECK_CUDA(size);
CHECK_CUDA(start);
CHECK_CUDA(end);
return grid_cuda(pos, size, start, end);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("grid", &grid, "Grid (CUDA)");
}
#include <ATen/ATen.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
template <typename scalar_t>
__global__ void grid_kernel(int64_t *cluster,
at::cuda::detail::TensorInfo<scalar_t, int64_t> pos,
scalar_t *__restrict__ size,
scalar_t *__restrict__ start,
scalar_t *__restrict__ end, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (ptrdiff_t i = index; i < numel; i += stride) {
int64_t c = 0, k = 1;
for (ptrdiff_t d = 0; d < pos.sizes[1]; d++) {
scalar_t p = pos.data[i * pos.strides[0] + d * pos.strides[1]] - start[d];
c += (int64_t)(p / size[d]) * k;
k *= (int64_t)((end[d] - start[d]) / size[d]) + 1;
}
cluster[i] = c;
}
}
at::Tensor grid_cuda(at::Tensor pos, at::Tensor size, at::Tensor start,
at::Tensor end) {
cudaSetDevice(pos.get_device());
auto cluster = at::empty(pos.size(0), pos.options().dtype(at::kLong));
AT_DISPATCH_ALL_TYPES(pos.scalar_type(), "grid_kernel", [&] {
grid_kernel<scalar_t><<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(),
at::cuda::detail::getTensorInfo<scalar_t, int64_t>(pos),
size.DATA_PTR<scalar_t>(), start.DATA_PTR<scalar_t>(),
end.DATA_PTR<scalar_t>(), cluster.numel());
});
return cluster;
}
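// The kernel above flattens per-dimension cell coordinates into a single id,
// essentially c = sum_d floor((pos[d] - start[d]) / size[d]) * prod_{d' < d} K_{d'},
// with K_d = floor((end[d] - start[d]) / size[d]) + 1 cells per dimension.
// Worked example (illustrative): pos = (2.5, 1.0), start = (0, 0), end = (4, 4),
// size = (1, 1) gives K = (5, 5) and c = 2 + 1 * 5 = 7.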
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor knn_cuda(at::Tensor x, at::Tensor y, size_t k, at::Tensor batch_x,
at::Tensor batch_y, bool cosine);
at::Tensor knn(at::Tensor x, at::Tensor y, size_t k, at::Tensor batch_x,
at::Tensor batch_y, bool cosine) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(y);
IS_CONTIGUOUS(y);
CHECK_CUDA(batch_x);
CHECK_CUDA(batch_y);
return knn_cuda(x, y, k, batch_x, batch_y, cosine);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("knn", &knn, "k-Nearest Neighbor (CUDA)");
}
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t> struct Cosine {
static inline __device__ scalar_t dot(const scalar_t *a, const scalar_t *b,
size_t size) {
scalar_t result = 0;
for (ptrdiff_t i = 0; i < size; i++) {
result += a[i] * b[i];
}
return result;
}
static inline __device__ scalar_t norm(const scalar_t *a, size_t size) {
scalar_t result = 0;
for (ptrdiff_t i = 0; i < size; i++) {
result += a[i] * a[i];
}
return sqrt(result);
}
};
template <typename scalar_t>
__global__ void
knn_kernel(const scalar_t *__restrict__ x, const scalar_t *__restrict__ y,
const int64_t *__restrict__ batch_x,
const int64_t *__restrict__ batch_y, scalar_t *__restrict__ dist,
int64_t *__restrict__ row, int64_t *__restrict__ col, size_t k,
size_t dim, bool cosine) {
const ptrdiff_t batch_idx = blockIdx.x;
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx_x = batch_x[batch_idx];
const ptrdiff_t end_idx_x = batch_x[batch_idx + 1];
const ptrdiff_t start_idx_y = batch_y[batch_idx];
const ptrdiff_t end_idx_y = batch_y[batch_idx + 1];
for (ptrdiff_t n_y = start_idx_y + idx; n_y < end_idx_y; n_y += THREADS) {
for (ptrdiff_t k_idx = 0; k_idx < k; k_idx++) {
row[n_y * k + k_idx] = n_y;
}
for (ptrdiff_t n_x = start_idx_x; n_x < end_idx_x; n_x++) {
scalar_t tmp_dist = 0;
if (cosine) {
// Measure the current pair (x[n_x], y[n_y]), not the tensor base pointers.
tmp_dist = Cosine<scalar_t>::norm(x + n_x * dim, dim) *
Cosine<scalar_t>::norm(y + n_y * dim, dim) -
Cosine<scalar_t>::dot(x + n_x * dim, y + n_y * dim, dim);
} else {
for (ptrdiff_t d = 0; d < dim; d++) {
tmp_dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
}
for (ptrdiff_t k_idx_1 = 0; k_idx_1 < k; k_idx_1++) {
if (dist[n_y * k + k_idx_1] > tmp_dist) {
for (ptrdiff_t k_idx_2 = k - 1; k_idx_2 > k_idx_1; k_idx_2--) {
dist[n_y * k + k_idx_2] = dist[n_y * k + k_idx_2 - 1];
col[n_y * k + k_idx_2] = col[n_y * k + k_idx_2 - 1];
}
dist[n_y * k + k_idx_1] = tmp_dist;
col[n_y * k + k_idx_1] = n_x;
break;
}
}
}
}
}
at::Tensor knn_cuda(at::Tensor x, at::Tensor y, size_t k, at::Tensor batch_x,
at::Tensor batch_y, bool cosine) {
cudaSetDevice(x.get_device());
auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto batch_size = batch_sizes[0] + 1;
free(batch_sizes); // Host staging buffer is no longer needed.
batch_x = degree(batch_x, batch_size);
batch_x = at::cat({at::zeros(1, batch_x.options()), batch_x.cumsum(0)}, 0);
batch_y = degree(batch_y, batch_size);
batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
auto dist = at::full(y.size(0) * k, 1e38, y.options());
auto row = at::empty(y.size(0) * k, batch_y.options());
auto col = at::full(y.size(0) * k, -1, batch_y.options());
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "knn_kernel", [&] {
knn_kernel<scalar_t><<<batch_size, THREADS>>>(
x.DATA_PTR<scalar_t>(), y.DATA_PTR<scalar_t>(),
batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(),
dist.DATA_PTR<scalar_t>(), row.DATA_PTR<int64_t>(),
col.DATA_PTR<int64_t>(), k, x.size(1), cosine);
});
auto mask = col != -1;
return at::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
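// Output format of knn_cuda above: `row` holds the query index into y (repeated
// k times per query) and `col` the matched index into x, kept sorted by
// distance by the insertion loop in the kernel. Slots that never found a
// neighbor (e.g. when an example in x has fewer than k points) keep col = -1
// and are masked out, so the result is a 2 x E edge index with
// E <= y.size(0) * k.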
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor nearest_cuda(at::Tensor x, at::Tensor y, at::Tensor batch_x,
at::Tensor batch_y);
at::Tensor nearest(at::Tensor x, at::Tensor y, at::Tensor batch_x,
at::Tensor batch_y) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(y);
IS_CONTIGUOUS(y);
CHECK_CUDA(batch_x);
CHECK_CUDA(batch_y);
return nearest_cuda(x, y, batch_x, batch_y);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nearest", &nearest, "Nearest Neighbor (CUDA)");
}
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t>
__global__ void nearest_kernel(const scalar_t *__restrict__ x,
const scalar_t *__restrict__ y,
const int64_t *__restrict__ batch_x,
const int64_t *__restrict__ batch_y,
int64_t *__restrict__ out, const size_t dim) {
const ptrdiff_t n_x = blockIdx.x;
const ptrdiff_t batch_idx = batch_x[n_x];
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx = batch_y[batch_idx];
const ptrdiff_t end_idx = batch_y[batch_idx + 1];
__shared__ scalar_t best_dist[THREADS];
__shared__ int64_t best_dist_idx[THREADS];
scalar_t best = 1e38;
ptrdiff_t best_idx = 0;
for (ptrdiff_t n_y = start_idx + idx; n_y < end_idx; n_y += THREADS) {
scalar_t dist = 0;
for (ptrdiff_t d = 0; d < dim; d++) {
dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
if (dist < best) {
best = dist;
best_idx = n_y;
}
}
best_dist[idx] = best;
best_dist_idx[idx] = best_idx;
for (int64_t u = 0; (1 << u) < THREADS; u++) {
__syncthreads();
if (idx < (THREADS >> (u + 1))) {
int64_t idx_1 = (idx * 2) << u;
int64_t idx_2 = (idx * 2 + 1) << u;
if (best_dist[idx_1] > best_dist[idx_2]) {
best_dist[idx_1] = best_dist[idx_2];
best_dist_idx[idx_1] = best_dist_idx[idx_2];
}
}
}
__syncthreads();
if (idx == 0) {
out[n_x] = best_dist_idx[0];
}
}
at::Tensor nearest_cuda(at::Tensor x, at::Tensor y, at::Tensor batch_x,
at::Tensor batch_y) {
cudaSetDevice(x.get_device());
auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto batch_size = batch_sizes[0] + 1;
free(batch_sizes); // Host staging buffer is no longer needed.
batch_y = degree(batch_y, batch_size);
batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
auto out = at::empty_like(batch_x);
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "nearest_kernel", [&] {
nearest_kernel<scalar_t><<<x.size(0), THREADS>>>(
x.DATA_PTR<scalar_t>(), y.DATA_PTR<scalar_t>(),
batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(),
out.DATA_PTR<int64_t>(), x.size(1));
});
return out;
}
#pragma once
#include <ATen/ATen.h>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
__global__ void propose_kernel(int64_t *__restrict__ cluster, int64_t *proposal,
int64_t *__restrict__ row,
int64_t *__restrict__ col, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] != -1)
continue; // Only visit blue nodes.
bool has_unmatched_neighbor = false;
for (int64_t i = row[u]; i < row[u + 1]; i++) {
auto v = col[i];
if (cluster[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (cluster[v] == -2) {
proposal[u] = v; // Propose to first red neighbor.
break;
}
}
if (!has_unmatched_neighbor)
cluster[u] = u;
}
}
void propose(at::Tensor cluster, at::Tensor proposal, at::Tensor row,
at::Tensor col) {
propose_kernel<<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(), proposal.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(), cluster.numel());
}
template <typename scalar_t>
__global__ void propose_kernel(int64_t *__restrict__ cluster, int64_t *proposal,
int64_t *__restrict__ row,
int64_t *__restrict__ col,
scalar_t *__restrict__ weight, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] != -1)
continue; // Only visit blue nodes.
bool has_unmatched_neighbor = false;
int64_t v_max = -1;
scalar_t w_max = 0;
for (int64_t i = row[u]; i < row[u + 1]; i++) {
auto v = col[i];
if (cluster[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
// Find maximum weighted red neighbor.
if (cluster[v] == -2 && weight[i] >= w_max) {
v_max = v;
w_max = weight[i];
}
}
proposal[u] = v_max; // Propose.
if (!has_unmatched_neighbor)
cluster[u] = u;
}
}
void propose(at::Tensor cluster, at::Tensor proposal, at::Tensor row,
at::Tensor col, at::Tensor weight) {
AT_DISPATCH_ALL_TYPES(weight.scalar_type(), "propose_kernel", [&] {
propose_kernel<scalar_t><<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(), proposal.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(),
weight.DATA_PTR<scalar_t>(), cluster.numel());
});
}
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor radius_cuda(at::Tensor x, at::Tensor y, float radius,
at::Tensor batch_x, at::Tensor batch_y,
size_t max_num_neighbors);
at::Tensor radius(at::Tensor x, at::Tensor y, float radius, at::Tensor batch_x,
at::Tensor batch_y, size_t max_num_neighbors) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(y);
IS_CONTIGUOUS(y);
CHECK_CUDA(batch_x);
CHECK_CUDA(batch_y);
return radius_cuda(x, y, radius, batch_x, batch_y, max_num_neighbors);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("radius", &radius, "Radius (CUDA)");
}