Commit 1abbba60 authored by rusty1s's avatar rusty1s
Browse files

add nvcc flags [ci skip]

parents f127bd3a c8e167a7
...@@ -59,7 +59,8 @@ install: ...@@ -59,7 +59,8 @@ install:
- conda install pytorch=${TORCH_VERSION} ${TOOLKIT} -c pytorch --yes - conda install pytorch=${TORCH_VERSION} ${TOOLKIT} -c pytorch --yes
- source script/torch.sh - source script/torch.sh
- pip install flake8 codecov - pip install flake8 codecov
- python setup.py install - pip install scipy==1.4.1
- pip install .[test]
script: script:
- flake8 . - flake8 .
- python setup.py test - python setup.py test
......
cmake_minimum_required(VERSION 3.0) cmake_minimum_required(VERSION 3.0)
project(torchcluster) project(torchcluster)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
set(TORCHCLUSTER_VERSION 1.5.4) set(TORCHCLUSTER_VERSION 1.5.5)
option(WITH_CUDA "Enable CUDA support" OFF) option(WITH_CUDA "Enable CUDA support" OFF)
...@@ -74,3 +74,8 @@ if(WITH_CUDA) ...@@ -74,3 +74,8 @@ if(WITH_CUDA)
csrc/cuda/rw_cuda.h csrc/cuda/rw_cuda.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda) DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
endif() endif()
if(WITH_CUDA)
set_property(TARGET torch_cuda PROPERTY INTERFACE_COMPILE_OPTIONS "")
set_property(TARGET torch_cpu PROPERTY INTERFACE_COMPILE_OPTIONS "")
endif()
...@@ -107,7 +107,7 @@ from torch_cluster import graclus_cluster ...@@ -107,7 +107,7 @@ from torch_cluster import graclus_cluster
row = torch.tensor([0, 1, 1, 2]) row = torch.tensor([0, 1, 1, 2])
col = torch.tensor([1, 0, 2, 1]) col = torch.tensor([1, 0, 2, 1])
weight = torch.Tensor([1, 1, 1, 1]) # Optional edge weights. weight = torch.tensor([1., 1., 1., 1.]) # Optional edge weights.
cluster = graclus_cluster(row, col, weight) cluster = graclus_cluster(row, col, weight)
``` ```
...@@ -125,7 +125,7 @@ A clustering algorithm, which overlays a regular grid of user-defined size over ...@@ -125,7 +125,7 @@ A clustering algorithm, which overlays a regular grid of user-defined size over
import torch import torch
from torch_cluster import grid_cluster from torch_cluster import grid_cluster
pos = torch.Tensor([[0, 0], [11, 9], [2, 8], [2, 2], [8, 3]]) pos = torch.tensor([[0., 0.], [11., 9.], [2., 8.], [2., 2.], [8., 3.]])
size = torch.Tensor([5, 5]) size = torch.Tensor([5, 5])
cluster = grid_cluster(pos, size) cluster = grid_cluster(pos, size)
...@@ -144,7 +144,7 @@ A sampling algorithm, which iteratively samples the most distant point with rega ...@@ -144,7 +144,7 @@ A sampling algorithm, which iteratively samples the most distant point with rega
import torch import torch
from torch_cluster import fps from torch_cluster import fps
x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0]) batch = torch.tensor([0, 0, 0, 0])
index = fps(x, batch, ratio=0.5, random_start=False) index = fps(x, batch, ratio=0.5, random_start=False)
``` ```
...@@ -158,11 +158,21 @@ tensor([0, 3]) ...@@ -158,11 +158,21 @@ tensor([0, 3])
Computes graph edges to the nearest *k* points. Computes graph edges to the nearest *k* points.
**Args:**
* **x** *(Tensor)*: Node feature matrix of shape `[N, F]`.
* **r** *(float)*: The radius.
* **batch** *(LongTensor, optional)*: Batch vector of shape `[N]`, which assigns each node to a specific example. `batch` needs to be sorted. (default: `None`)
* **loop** *(bool, optional)*: If `True`, the graph will contain self-loops. (default: `False`)
* **flow** *(string, optional)*: The flow direction when using in combination with message passing (`"source_to_target"` or `"target_to_source"`). (default: `"source_to_target"`)
* **cosine** *(boolean, optional)*: If `True`, will use the Cosine distance instead of Euclidean distance to find nearest neighbors. (default: `False`)
* **num_workers** *(int)*: Number of workers to use for computation. Has no effect in case `batch` is not `None`, or the input lies on the GPU. (default: `1`)
```python ```python
import torch import torch
from torch_cluster import knn_graph from torch_cluster import knn_graph
x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0]) batch = torch.tensor([0, 0, 0, 0])
edge_index = knn_graph(x, k=2, batch=batch, loop=False) edge_index = knn_graph(x, k=2, batch=batch, loop=False)
``` ```
...@@ -177,11 +187,21 @@ tensor([[1, 2, 0, 3, 0, 3, 1, 2], ...@@ -177,11 +187,21 @@ tensor([[1, 2, 0, 3, 0, 3, 1, 2],
Computes graph edges to all points within a given distance. Computes graph edges to all points within a given distance.
**Args:**
* **x** *(Tensor)*: Node feature matrix of shape `[N, F]`.
* **r** *(float)*: The radius.
* **batch** *(LongTensor, optional)*: Batch vector of shape `[N]`, which assigns each node to a specific example. `batch` needs to be sorted. (default: `None`)
* **loop** *(bool, optional)*: If `True`, the graph will contain self-loops. (default: `False`)
* **max_num_neighbors** *(int, optional)*: The maximum number of neighbors to return for each element. (default: `32`)
* **flow** *(string, optional)*: The flow direction when using in combination with message passing (`"source_to_target"` or `"target_to_source"`). (default: `"source_to_target"`)
* **num_workers** *(int)*: Number of workers to use for computation. Has no effect in case `batch` is not `None`, or the input lies on the GPU. (default: `1`)
```python ```python
import torch import torch
from torch_cluster import radius_graph from torch_cluster import radius_graph
x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]]) x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0]) batch = torch.tensor([0, 0, 0, 0])
edge_index = radius_graph(x, r=1.5, batch=batch, loop=False) edge_index = radius_graph(x, r=1.5, batch=batch, loop=False)
``` ```
......
#include "knn_cpu.h"
#include "utils.h"
#include "utils/neighbors.cpp"
torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y,
                      torch::optional<torch::Tensor> ptr_x,
                      torch::optional<torch::Tensor> ptr_y, int64_t k,
                      int64_t num_workers) {
  // For every point in `y`, find the indices of its `k` nearest points in
  // `x` and return them as a [2, E] LongTensor of (y_index, x_index) pairs.
  //
  // `ptr_x`/`ptr_y` are optional CSR-style batch pointers; when given, the
  // search is restricted to points of the same batch element.
  // `num_workers` is only forwarded in the non-batched code path.
  CHECK_CPU(x);
  CHECK_INPUT(x.dim() == 2);
  CHECK_CPU(y);
  CHECK_INPUT(y.dim() == 2);

  if (ptr_x.has_value()) {
    CHECK_CPU(ptr_x.value());
    CHECK_INPUT(ptr_x.value().dim() == 1);
  }
  if (ptr_y.has_value()) {
    CHECK_CPU(ptr_y.value());
    CHECK_INPUT(ptr_y.value().dim() == 1);
  }

  // Stack-allocated output buffer (the original `new`-ed it and leaked).
  std::vector<size_t> result_vec;
  std::vector<size_t> *out_vec = &result_vec;

  // Dispatch name fixed: this is the knn (not radius) operator.
  AT_DISPATCH_ALL_TYPES(x.scalar_type(), "knn_cpu", [&] {
    auto x_data = x.data_ptr<scalar_t>();
    auto y_data = y.data_ptr<scalar_t>();
    auto x_vec = std::vector<scalar_t>(x_data, x_data + x.numel());
    auto y_vec = std::vector<scalar_t>(y_data, y_data + y.numel());

    if (!ptr_x.has_value()) { // Single example.
      nanoflann_neighbors<scalar_t>(y_vec, x_vec, out_vec, 0, x.size(-1), 0,
                                    num_workers, k, 0);
    } else { // Batched: derive per-element sizes from the CSR pointers.
      auto sx = (ptr_x.value().narrow(0, 1, ptr_x.value().numel() - 1) -
                 ptr_x.value().narrow(0, 0, ptr_x.value().numel() - 1));
      auto sy = (ptr_y.value().narrow(0, 1, ptr_y.value().numel() - 1) -
                 ptr_y.value().narrow(0, 0, ptr_y.value().numel() - 1));
      auto sx_data = sx.data_ptr<int64_t>();
      auto sy_data = sy.data_ptr<int64_t>();
      auto sx_vec = std::vector<long>(sx_data, sx_data + sx.numel());
      auto sy_vec = std::vector<long>(sy_data, sy_data + sy.numel());
      batch_nanoflann_neighbors<scalar_t>(y_vec, x_vec, sy_vec, sx_vec,
                                          out_vec, k, x.size(-1), 0, k, 0);
    }
  });

  const int64_t size = out_vec->size() / 2;
  // `from_blob` does not take ownership; `index_select` below copies the
  // data into a fresh tensor before `result_vec` goes out of scope.
  auto out = torch::from_blob(out_vec->data(), {size, 2},
                              x.options().dtype(torch::kLong));
  return out.t().index_select(0, torch::tensor({1, 0}));
}
#pragma once
#include <torch/extension.h>
// CPU k-nearest-neighbor search: for every point in `y`, find the indices
// of its `k` nearest points in `x`, returned as a [2, E] LongTensor of
// (y_index, x_index) pairs. `ptr_x`/`ptr_y` are optional CSR-style batch
// pointers restricting the search to the same batch element; `num_workers`
// is the number of search threads (non-batched path only).
torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y,
                      torch::optional<torch::Tensor> ptr_x,
                      torch::optional<torch::Tensor> ptr_y, int64_t k,
                      int64_t num_workers);
#include "radius_cpu.h"
#include "utils.h"
#include "utils/neighbors.cpp"
torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y,
                         torch::optional<torch::Tensor> ptr_x,
                         torch::optional<torch::Tensor> ptr_y, double r,
                         int64_t max_num_neighbors, int64_t num_workers) {
  // For every point in `y`, find the indices of all points in `x` within
  // radius `r` (capped at `max_num_neighbors` per point) and return them
  // as a [2, E] LongTensor of (y_index, x_index) pairs.
  //
  // `ptr_x`/`ptr_y` are optional CSR-style batch pointers; when given, the
  // search is restricted to points of the same batch element.
  // `num_workers` is only forwarded in the non-batched code path.
  CHECK_CPU(x);
  CHECK_INPUT(x.dim() == 2);
  CHECK_CPU(y);
  CHECK_INPUT(y.dim() == 2);

  if (ptr_x.has_value()) {
    CHECK_CPU(ptr_x.value());
    CHECK_INPUT(ptr_x.value().dim() == 1);
  }
  if (ptr_y.has_value()) {
    CHECK_CPU(ptr_y.value());
    CHECK_INPUT(ptr_y.value().dim() == 1);
  }

  // Stack-allocated output buffer (the original `new`-ed it and leaked).
  std::vector<size_t> result_vec;
  std::vector<size_t> *out_vec = &result_vec;

  AT_DISPATCH_ALL_TYPES(x.scalar_type(), "radius_cpu", [&] {
    auto x_data = x.data_ptr<scalar_t>();
    auto y_data = y.data_ptr<scalar_t>();
    auto x_vec = std::vector<scalar_t>(x_data, x_data + x.numel());
    auto y_vec = std::vector<scalar_t>(y_data, y_data + y.numel());

    if (!ptr_x.has_value()) { // Single example.
      nanoflann_neighbors<scalar_t>(y_vec, x_vec, out_vec, r, x.size(-1),
                                    max_num_neighbors, num_workers, 0, 1);
    } else { // Batched: derive per-element sizes from the CSR pointers.
      auto sx = (ptr_x.value().narrow(0, 1, ptr_x.value().numel() - 1) -
                 ptr_x.value().narrow(0, 0, ptr_x.value().numel() - 1));
      auto sy = (ptr_y.value().narrow(0, 1, ptr_y.value().numel() - 1) -
                 ptr_y.value().narrow(0, 0, ptr_y.value().numel() - 1));
      auto sx_data = sx.data_ptr<int64_t>();
      auto sy_data = sy.data_ptr<int64_t>();
      auto sx_vec = std::vector<long>(sx_data, sx_data + sx.numel());
      auto sy_vec = std::vector<long>(sy_data, sy_data + sy.numel());
      batch_nanoflann_neighbors<scalar_t>(y_vec, x_vec, sy_vec, sx_vec,
                                          out_vec, r, x.size(-1),
                                          max_num_neighbors, 0, 1);
    }
  });

  const int64_t size = out_vec->size() / 2;
  // `from_blob` does not take ownership; `index_select` below copies the
  // data into a fresh tensor before `result_vec` goes out of scope.
  auto out = torch::from_blob(out_vec->data(), {size, 2},
                              x.options().dtype(torch::kLong));
  return out.t().index_select(0, torch::tensor({1, 0}));
}
#pragma once
#include <torch/extension.h>
// CPU fixed-radius neighbor search: for every point in `y`, find the
// indices of all points in `x` within radius `r` (at most
// `max_num_neighbors` per point), returned as a [2, E] LongTensor of
// (y_index, x_index) pairs. `ptr_x`/`ptr_y` are optional CSR-style batch
// pointers restricting the search to the same batch element; `num_workers`
// is the number of search threads (non-batched path only).
torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y,
                         torch::optional<torch::Tensor> ptr_x,
                         torch::optional<torch::Tensor> ptr_y, double r,
                         int64_t max_num_neighbors, int64_t num_workers);
#pragma once
#include <ATen/ATen.h>
#include <algorithm>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <map>
#include <numeric>
#include <unordered_map>
#include <vector>
#include <time.h>
// Adaptor exposing a flat, row-major coordinate buffer as a
// nanoflann-compatible point cloud. Points are stored as individually
// heap-allocated coordinate vectors which this struct owns; the original
// version leaked them on every reload and on destruction.
template <typename scalar_t> struct PointCloud {
  std::vector<std::vector<scalar_t> *> pts;

  PointCloud() = default;
  // Owning semantics: copying would double-free the point vectors.
  PointCloud(const PointCloud &) = delete;
  PointCloud &operator=(const PointCloud &) = delete;
  ~PointCloud() { release(); }

  // Free all owned points.
  void release() {
    for (auto *point : pts)
      delete point;
    pts.clear();
  }

  // Load all `new_pts.size() / dim` points from `new_pts` (`dim`
  // coordinates per point). Taking the buffer by const reference avoids
  // copying the whole coordinate array on every call.
  void set(const std::vector<scalar_t> &new_pts, int dim) {
    release(); // the original leaked previously loaded points
    std::vector<std::vector<scalar_t> *> temp(new_pts.size() / dim);
    for (size_t i = 0; i + (size_t)dim <= new_pts.size(); i += (size_t)dim) {
      auto *point = new std::vector<scalar_t>(dim);
      for (size_t j = 0; j < (size_t)dim; j++)
        (*point)[j] = new_pts[i + j];
      temp[i / dim] = point;
    }
    pts = temp;
  }

  // Load `size` points starting at point offset `begin` from `new_pts`
  // (used to select one batch element out of a concatenated buffer).
  void set_batch(const std::vector<scalar_t> &new_pts, size_t begin,
                 long size, int dim) {
    release(); // the original leaked previously loaded points
    std::vector<std::vector<scalar_t> *> temp(size);
    for (size_t i = 0; i < (size_t)size; i++) {
      auto *point = new std::vector<scalar_t>(dim);
      for (size_t j = 0; j < (size_t)dim; j++)
        (*point)[j] = new_pts[dim * (begin + i) + j];
      temp[i] = point;
    }
    pts = temp;
  }

  // Must return the number of data points.
  inline size_t kdtree_get_point_count() const { return pts.size(); }

  // Returns the dim'th component of the idx'th point in the class:
  inline scalar_t kdtree_get_pt(const size_t idx, const size_t dim) const {
    return (*pts[idx])[dim];
  }

  // Optional bounding-box computation: return false to default to a standard
  // bbox computation loop.
  // Return true if the BBOX was already computed by the class and returned in
  // "bb" so it can be avoided to redo it again. Look at bb.size() to find out
  // the expected dimensionality (e.g. 2 or 3 for point clouds)
  template <class BBOX> bool kdtree_get_bbox(BBOX & /* bb */) const {
    return false;
  }
};
This diff is collapsed.
#include "cloud.h"
#include "nanoflann.hpp"
#include <cstdint>
#include <iostream>
#include <set>
#include <thread>
// Argument bundle handed to each worker thread of the multi-threaded
// neighbor search (see `thread_routine`). The `kd_tree`, `matches` and
// `queries` pointers are type-erased (`void *`) so this struct does not
// need to be templated on the scalar type.
typedef struct thread_struct {
  void *kd_tree;        // my_kd_tree_t*: shared kd-tree over the supports
  void *matches;        // kd_pair*: per-query result slots
  void *queries;        // PointCloud*: the query points
  size_t *max_count;    // shared running maximum of matches per query
  std::mutex *ct_m;     // guards updates to *max_count
  std::mutex *tree_m;   // guards kd-tree searches
  size_t start;         // first query index handled by this thread (incl.)
  size_t end;           // one past the last query index (excl.)
  double search_radius; // squared search radius (radius mode)
  bool small;           // tiny support cloud: inflate radius by an epsilon
  bool option;          // true: radius search; false: knn search
  size_t k;             // number of neighbors (knn mode)
} thread_args;
// Worker body for the multi-threaded neighbor search: processes queries in
// [targs->start, targs->end) against the shared kd-tree, stores the matches
// of query i into (*matches)[i] and folds the per-query match count into the
// shared *max_count. The original leaked three heap allocations per query
// (the copied query point, the knn index/distance buffers and a temporary
// pair vector); all of them now live on the stack.
template <typename scalar_t> void thread_routine(thread_args *targs) {
  typedef nanoflann::KDTreeSingleIndexAdaptor<
      nanoflann::L2_Adaptor<scalar_t, PointCloud<scalar_t>>,
      PointCloud<scalar_t>>
      my_kd_tree_t;
  typedef std::vector<std::vector<std::pair<size_t, scalar_t>>> kd_pair;

  auto *index = (my_kd_tree_t *)targs->kd_tree;
  auto *matches = (kd_pair *)targs->matches;
  auto *pcd_query = (PointCloud<scalar_t> *)targs->queries;
  size_t *max_count = targs->max_count;
  std::mutex *ct_m = targs->ct_m;
  std::mutex *tree_m = targs->tree_m;

  // Tiny support clouds get a small radius inflation to absorb rounding
  // error near the boundary.
  const double eps = targs->small ? 0.000001 : 0;
  const double search_radius = targs->search_radius;
  const size_t k = targs->k;

  for (size_t i = targs->start; i < targs->end; i++) {
    // Query coordinates can be read in place; no per-query copy needed.
    const std::vector<scalar_t> &p0 = *((pcd_query->pts)[i]);
    const scalar_t *query_pt = p0.data();

    (*matches)[i].reserve(*max_count);
    std::vector<std::pair<size_t, scalar_t>> ret_matches;
    std::vector<size_t> knn_ret_matches(k);
    std::vector<scalar_t> knn_dist_matches(k);

    size_t nMatches;
    tree_m->lock();
    if (targs->option) { // radius search
      nMatches = index->radiusSearch(query_pt, (scalar_t)(search_radius + eps),
                                     ret_matches, nanoflann::SearchParams());
    } else { // knn search
      nMatches = index->knnSearch(query_pt, k, knn_ret_matches.data(),
                                  knn_dist_matches.data());
      ret_matches.resize(knn_dist_matches.size());
      for (size_t j = 0; j < knn_ret_matches.size(); j++)
        ret_matches[j] =
            std::make_pair(knn_ret_matches[j], knn_dist_matches[j]);
    }
    tree_m->unlock();

    (*matches)[i] = ret_matches;

    // Track the largest match count seen by any thread.
    ct_m->lock();
    if (*max_count < nMatches)
      *max_count = nMatches;
    ct_m->unlock();
  }
}
template <typename scalar_t>
size_t nanoflann_neighbors(std::vector<scalar_t> &queries,
std::vector<scalar_t> &supports,
std::vector<size_t> *&neighbors_indices,
double radius, int dim, int64_t max_num,
int64_t n_threads, int64_t k, int option) {
const scalar_t search_radius = static_cast<scalar_t>(radius * radius);
// Counting vector
size_t *max_count = new size_t();
*max_count = 1;
size_t ssize = supports.size();
// CLoud variable
PointCloud<scalar_t> pcd;
pcd.set(supports, dim);
// Cloud query
PointCloud<scalar_t> *pcd_query = new PointCloud<scalar_t>();
(*pcd_query).set(queries, dim);
// Tree parameters
nanoflann::KDTreeSingleIndexAdaptorParams tree_params(15 /* max leaf */);
// KDTree type definition
typedef nanoflann::KDTreeSingleIndexAdaptor<
nanoflann::L2_Adaptor<scalar_t, PointCloud<scalar_t>>,
PointCloud<scalar_t>>
my_kd_tree_t;
typedef std::vector<std::vector<std::pair<size_t, scalar_t>>> kd_pair;
// Pointer to trees
my_kd_tree_t *index;
index = new my_kd_tree_t(dim, pcd, tree_params);
index->buildIndex();
// Search neigbors indices
// Search params
nanoflann::SearchParams search_params;
// search_params.sorted = true;
kd_pair *list_matches = new kd_pair((*pcd_query).pts.size());
// single threaded routine
if (n_threads == 1) {
size_t i0 = 0;
double eps;
if (ssize < 10) {
eps = 0.000001;
} else {
eps = 0;
}
for (auto &p : (*pcd_query).pts) {
auto p0 = *p;
// Find neighbors
scalar_t *query_pt = new scalar_t[dim];
std::copy(p0.begin(), p0.end(), query_pt);
(*list_matches)[i0].reserve(*max_count);
std::vector<std::pair<size_t, scalar_t>> ret_matches;
std::vector<size_t> *knn_ret_matches = new std::vector<size_t>(k);
std::vector<scalar_t> *knn_dist_matches = new std::vector<scalar_t>(k);
size_t nMatches;
if (!!(option)) {
nMatches =
index->radiusSearch(query_pt, (scalar_t)(search_radius + eps),
ret_matches, search_params);
} else {
nMatches = index->knnSearch(query_pt, (size_t)k, &(*knn_ret_matches)[0],
&(*knn_dist_matches)[0]);
auto temp = new std::vector<std::pair<size_t, scalar_t>>(
(*knn_dist_matches).size());
for (size_t j = 0; j < (*knn_ret_matches).size(); j++) {
(*temp)[j] =
std::make_pair((*knn_ret_matches)[j], (*knn_dist_matches)[j]);
}
ret_matches = *temp;
}
(*list_matches)[i0] = ret_matches;
if (*max_count < nMatches)
*max_count = nMatches;
i0++;
}
} else { // Multi-threaded routine
std::mutex *mtx = new std::mutex();
std::mutex *mtx_tree = new std::mutex();
size_t n_queries = (*pcd_query).pts.size();
size_t actual_threads =
std::min((long long)n_threads, (long long)n_queries);
std::vector<std::thread *> tid(actual_threads);
size_t start, end;
size_t length;
if (n_queries) {
length = 1;
} else {
auto res = std::lldiv((long long)n_queries, (long long)n_threads);
length = (size_t)res.quot;
}
for (size_t t = 0; t < actual_threads; t++) {
start = t * length;
if (t == actual_threads - 1) {
end = n_queries;
} else {
end = (t + 1) * length;
}
thread_args *targs = new thread_args();
targs->kd_tree = index;
targs->matches = list_matches;
targs->max_count = max_count;
targs->ct_m = mtx;
targs->tree_m = mtx_tree;
targs->search_radius = search_radius;
targs->queries = pcd_query;
targs->start = start;
targs->end = end;
if (ssize < 10) {
targs->small = true;
} else {
targs->small = false;
}
targs->option = !!(option);
targs->k = k;
std::thread *temp = new std::thread(thread_routine<scalar_t>, targs);
tid[t] = temp;
}
for (size_t t = 0; t < actual_threads; t++) {
tid[t]->join();
}
}
// Reserve the memory
if (max_num > 0) {
*max_count = max_num;
}
size_t size = 0; // total number of edges
for (auto &inds : *list_matches) {
if (inds.size() <= *max_count)
size += inds.size();
else
size += *max_count;
}
neighbors_indices->resize(size * 2);
size_t i1 = 0; // index of the query points
size_t u = 0; // curent index of the neighbors_indices
for (auto &inds : *list_matches) {
for (size_t j = 0; j < *max_count; j++) {
if (j < inds.size()) {
(*neighbors_indices)[u] = inds[j].first;
(*neighbors_indices)[u + 1] = i1;
u += 2;
}
}
i1++;
}
return *max_count;
}
// Batched neighbor search: `q_batches[b]` / `s_batches[b]` give the number
// of query / support points of batch element `b` (both buffers are the
// concatenation over all elements). A fresh kd-tree is built per batch
// element so neighbors are only searched within the same element. Support
// indices written to `neighbors_indices` are global (offset by the batch
// start `sum_sb`). Returns the per-query neighbor cap applied.
//
// Fixes over the original: the final kd-tree is now deleted (it leaked),
// and the per-query heap buffers (query point copy, knn index/distance
// vectors, temporary pair vector) are replaced by stack storage.
template <typename scalar_t>
size_t batch_nanoflann_neighbors(std::vector<scalar_t> &queries,
                                 std::vector<scalar_t> &supports,
                                 std::vector<long> &q_batches,
                                 std::vector<long> &s_batches,
                                 std::vector<size_t> *&neighbors_indices,
                                 double radius, int dim, int64_t max_num,
                                 int64_t k, int option) {
  size_t i0 = 0; // running query index
  // nanoflann's L2 adaptor works on squared distances.
  const scalar_t r2 = static_cast<scalar_t>(radius * radius);
  size_t max_count = 0; // running maximum of matches per query
  size_t b = 0;         // current batch element
  size_t sum_qb = 0;    // #queries consumed by previous elements
  size_t sum_sb = 0;    // #supports consumed by previous elements

  // Tiny clouds get a small radius inflation to absorb rounding error.
  const double eps = (supports.size() < 10) ? 0.000001 : 0;

  PointCloud<scalar_t> current_cloud;
  PointCloud<scalar_t> query_pcd;
  query_pcd.set(queries, dim);
  std::vector<std::vector<std::pair<size_t, scalar_t>>> all_inds_dists(
      query_pcd.pts.size());

  nanoflann::KDTreeSingleIndexAdaptorParams tree_params(10 /* max leaf */);
  typedef nanoflann::KDTreeSingleIndexAdaptor<
      nanoflann::L2_Adaptor<scalar_t, PointCloud<scalar_t>>,
      PointCloud<scalar_t>>
      my_kd_tree_t;

  // Build the kd-tree for the first batch element.
  current_cloud.set_batch(supports, sum_sb, s_batches[b], dim);
  my_kd_tree_t *index = new my_kd_tree_t(dim, current_cloud, tree_params);
  index->buildIndex();

  nanoflann::SearchParams search_params;
  search_params.sorted = true;

  for (auto &p : query_pcd.pts) {
    // Query coordinates can be read in place; no per-query copy needed.
    const scalar_t *query_pt = p->data();

    // Rebuild the kd-tree whenever we cross into the next batch element.
    if (i0 == sum_qb + q_batches[b]) {
      sum_qb += q_batches[b];
      sum_sb += s_batches[b];
      b++;
      current_cloud.set_batch(supports, sum_sb, s_batches[b], dim);
      delete index;
      index = new my_kd_tree_t(dim, current_cloud, tree_params);
      index->buildIndex();
    }

    all_inds_dists[i0].reserve(max_count); // initial capacity guess
    size_t nMatches;
    if (option) { // radius search
      nMatches = index->radiusSearch(query_pt, r2 + eps, all_inds_dists[i0],
                                     search_params);
    } else { // knn search
      std::vector<size_t> knn_ret_matches(k);
      std::vector<scalar_t> knn_dist_matches(k);
      nMatches = index->knnSearch(query_pt, (size_t)k, knn_ret_matches.data(),
                                  knn_dist_matches.data());
      // NOTE(review): when a batch element has fewer than k supports, the
      // entries past nMatches stay zero-initialized — preserved from the
      // original; confirm this is the intended semantics upstream.
      all_inds_dists[i0].resize(knn_dist_matches.size());
      for (size_t j = 0; j < knn_ret_matches.size(); j++)
        all_inds_dists[i0][j] =
            std::make_pair(knn_ret_matches[j], knn_dist_matches[j]);
    }
    if (nMatches > max_count)
      max_count = nMatches;
    i0++;
  }
  delete index; // the original leaked the last tree

  // Cap the per-query neighbor count if requested.
  if (max_num > 0)
    max_count = (size_t)max_num;

  size_t size = 0; // total number of edges
  for (auto &inds_dists : all_inds_dists)
    size += std::min(inds_dists.size(), max_count);
  neighbors_indices->resize(size * 2);

  // Second pass: emit (global_support_index, query_index) pairs.
  i0 = 0;
  sum_qb = 0;
  sum_sb = 0;
  b = 0;
  size_t u = 0; // write offset into neighbors_indices
  for (auto &inds_dists : all_inds_dists) {
    if (i0 == sum_qb + q_batches[b]) {
      sum_qb += q_batches[b];
      sum_sb += s_batches[b];
      b++;
    }
    const size_t take = std::min(inds_dists.size(), max_count);
    for (size_t j = 0; j < take; j++) {
      (*neighbors_indices)[u] = inds_dists[j].first + sum_sb;
      (*neighbors_indices)[u + 1] = i0;
      u += 2;
    }
    i0++;
  }
  return max_count;
}
...@@ -75,26 +75,42 @@ __global__ void knn_kernel(const scalar_t *x, const scalar_t *y, ...@@ -75,26 +75,42 @@ __global__ void knn_kernel(const scalar_t *x, const scalar_t *y,
} }
} }
torch::Tensor knn_cuda(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, torch::Tensor knn_cuda(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_y, int64_t k, bool cosine) { torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, int64_t k,
bool cosine) {
CHECK_CUDA(x); CHECK_CUDA(x);
CHECK_INPUT(x.dim() == 2);
CHECK_CUDA(y); CHECK_CUDA(y);
CHECK_CUDA(ptr_x); CHECK_INPUT(y.dim() == 2);
CHECK_CUDA(ptr_y);
cudaSetDevice(x.get_device()); cudaSetDevice(x.get_device());
x = x.view({x.size(0), -1}).contiguous(); if (ptr_x.has_value()) {
y = y.view({y.size(0), -1}).contiguous(); CHECK_CUDA(ptr_x.value());
CHECK_INPUT(ptr_x.value().dim() == 1);
} else {
ptr_x = torch::arange(0, x.size(0) + 1, x.size(0),
x.options().dtype(torch::kLong));
}
if (ptr_y.has_value()) {
CHECK_CUDA(ptr_y.value());
CHECK_INPUT(ptr_y.value().dim() == 1);
} else {
ptr_y = torch::arange(0, y.size(0) + 1, y.size(0),
y.options().dtype(torch::kLong));
}
CHECK_INPUT(ptr_x.value().numel() == ptr_y.value().numel());
auto dist = torch::full(y.size(0) * k, 1e38, y.options()); auto dist = torch::full(y.size(0) * k, 1e38, y.options());
auto row = torch::empty(y.size(0) * k, ptr_y.options()); auto row = torch::empty(y.size(0) * k, ptr_y.value().options());
auto col = torch::full(y.size(0) * k, -1, ptr_y.options()); auto col = torch::full(y.size(0) * k, -1, ptr_y.value().options());
auto stream = at::cuda::getCurrentCUDAStream(); auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "knn_kernel", [&] { AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "knn_kernel", [&] {
knn_kernel<scalar_t><<<ptr_x.size(0) - 1, THREADS, 0, stream>>>( knn_kernel<scalar_t><<<ptr_x.value().size(0) - 1, THREADS, 0, stream>>>(
x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
ptr_x.data_ptr<int64_t>(), ptr_y.data_ptr<int64_t>(), ptr_x.value().data_ptr<int64_t>(), ptr_y.value().data_ptr<int64_t>(),
dist.data_ptr<scalar_t>(), row.data_ptr<int64_t>(), dist.data_ptr<scalar_t>(), row.data_ptr<int64_t>(),
col.data_ptr<int64_t>(), k, x.size(1), cosine); col.data_ptr<int64_t>(), k, x.size(1), cosine);
}); });
......
...@@ -2,5 +2,7 @@ ...@@ -2,5 +2,7 @@
#include <torch/extension.h> #include <torch/extension.h>
torch::Tensor knn_cuda(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, torch::Tensor knn_cuda(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_y, int64_t k, bool cosine); torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, int64_t k,
bool cosine);
...@@ -44,26 +44,42 @@ __global__ void radius_kernel(const scalar_t *x, const scalar_t *y, ...@@ -44,26 +44,42 @@ __global__ void radius_kernel(const scalar_t *x, const scalar_t *y,
} }
} }
torch::Tensor radius_cuda(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, torch::Tensor radius_cuda(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_y, double r, torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, double r,
int64_t max_num_neighbors) { int64_t max_num_neighbors) {
CHECK_CUDA(x); CHECK_CUDA(x);
CHECK_INPUT(x.dim() == 2);
CHECK_CUDA(y); CHECK_CUDA(y);
CHECK_CUDA(ptr_x); CHECK_INPUT(y.dim() == 2);
CHECK_CUDA(ptr_y);
cudaSetDevice(x.get_device()); cudaSetDevice(x.get_device());
x = x.view({x.size(0), -1}).contiguous(); if (ptr_x.has_value()) {
y = y.view({y.size(0), -1}).contiguous(); CHECK_CUDA(ptr_x.value());
CHECK_INPUT(ptr_x.value().dim() == 1);
} else {
ptr_x = torch::arange(0, x.size(0) + 1, x.size(0),
x.options().dtype(torch::kLong));
}
if (ptr_y.has_value()) {
CHECK_CUDA(ptr_y.value());
CHECK_INPUT(ptr_y.value().dim() == 1);
} else {
ptr_y = torch::arange(0, y.size(0) + 1, y.size(0),
y.options().dtype(torch::kLong));
}
CHECK_INPUT(ptr_x.value().numel() == ptr_y.value().numel());
auto row = torch::full(y.size(0) * max_num_neighbors, -1, ptr_y.options()); auto row =
auto col = torch::full(y.size(0) * max_num_neighbors, -1, ptr_y.options()); torch::full(y.size(0) * max_num_neighbors, -1, ptr_y.value().options());
auto col =
torch::full(y.size(0) * max_num_neighbors, -1, ptr_y.value().options());
auto stream = at::cuda::getCurrentCUDAStream(); auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "radius_kernel", [&] { AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "radius_kernel", [&] {
radius_kernel<scalar_t><<<ptr_x.size(0) - 1, THREADS, 0, stream>>>( radius_kernel<scalar_t><<<ptr_x.value().size(0) - 1, THREADS, 0, stream>>>(
x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(), x.data_ptr<scalar_t>(), y.data_ptr<scalar_t>(),
ptr_x.data_ptr<int64_t>(), ptr_y.data_ptr<int64_t>(), ptr_x.value().data_ptr<int64_t>(), ptr_y.value().data_ptr<int64_t>(),
row.data_ptr<int64_t>(), col.data_ptr<int64_t>(), r, max_num_neighbors, row.data_ptr<int64_t>(), col.data_ptr<int64_t>(), r, max_num_neighbors,
x.size(1)); x.size(1));
}); });
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include <torch/extension.h> #include <torch/extension.h>
torch::Tensor radius_cuda(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, torch::Tensor radius_cuda(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_y, double r, torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, double r,
int64_t max_num_neighbors); int64_t max_num_neighbors);
#include <Python.h> #include <Python.h>
#include <torch/script.h> #include <torch/script.h>
#include "cpu/knn_cpu.h"
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include "cuda/knn_cuda.h" #include "cuda/knn_cuda.h"
#endif #endif
...@@ -9,8 +11,10 @@ ...@@ -9,8 +11,10 @@
PyMODINIT_FUNC PyInit__knn(void) { return NULL; } PyMODINIT_FUNC PyInit__knn(void) { return NULL; }
#endif #endif
torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, torch::Tensor knn(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_y, int64_t k, bool cosine) { torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, int64_t k, bool cosine,
int64_t num_workers) {
if (x.device().is_cuda()) { if (x.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return knn_cuda(x, y, ptr_x, ptr_y, k, cosine); return knn_cuda(x, y, ptr_x, ptr_y, k, cosine);
...@@ -18,7 +22,9 @@ torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, ...@@ -18,7 +22,9 @@ torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
AT_ERROR("Not compiled with CUDA support"); AT_ERROR("Not compiled with CUDA support");
#endif #endif
} else { } else {
AT_ERROR("No CPU version supported"); if (cosine)
AT_ERROR("`cosine` argument not supported on CPU");
return knn_cpu(x, y, ptr_x, ptr_y, k, num_workers);
} }
} }
......
#include <Python.h> #include <Python.h>
#include <torch/script.h> #include <torch/script.h>
#include "cpu/radius_cpu.h"
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include "cuda/radius_cuda.h" #include "cuda/radius_cuda.h"
#endif #endif
...@@ -9,8 +11,10 @@ ...@@ -9,8 +11,10 @@
PyMODINIT_FUNC PyInit__radius(void) { return NULL; } PyMODINIT_FUNC PyInit__radius(void) { return NULL; }
#endif #endif
torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, torch::Tensor radius(torch::Tensor x, torch::Tensor y,
torch::Tensor ptr_y, double r, int64_t max_num_neighbors) { torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, double r,
int64_t max_num_neighbors, int64_t num_workers) {
if (x.device().is_cuda()) { if (x.device().is_cuda()) {
#ifdef WITH_CUDA #ifdef WITH_CUDA
return radius_cuda(x, y, ptr_x, ptr_y, r, max_num_neighbors); return radius_cuda(x, y, ptr_x, ptr_y, r, max_num_neighbors);
...@@ -18,7 +22,7 @@ torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, ...@@ -18,7 +22,7 @@ torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
AT_ERROR("Not compiled with CUDA support"); AT_ERROR("Not compiled with CUDA support");
#endif #endif
} else { } else {
AT_ERROR("No CPU version supported"); return radius_cpu(x, y, ptr_x, ptr_y, r, max_num_neighbors, num_workers);
} }
} }
......
...@@ -57,13 +57,13 @@ def get_extensions(): ...@@ -57,13 +57,13 @@ def get_extensions():
return extensions return extensions
install_requires = ['scipy'] install_requires = []
setup_requires = ['pytest-runner'] setup_requires = ['pytest-runner']
tests_require = ['pytest', 'pytest-cov'] tests_require = ['pytest', 'pytest-cov', 'scipy']
setup( setup(
name='torch_cluster', name='torch_cluster',
version='1.5.4', version='1.5.5',
author='Matthias Fey', author='Matthias Fey',
author_email='matthias.fey@tu-dortmund.de', author_email='matthias.fey@tu-dortmund.de',
url='https://github.com/rusty1s/pytorch_cluster', url='https://github.com/rusty1s/pytorch_cluster',
...@@ -80,6 +80,7 @@ setup( ...@@ -80,6 +80,7 @@ setup(
install_requires=install_requires, install_requires=install_requires,
setup_requires=setup_requires, setup_requires=setup_requires,
tests_require=tests_require, tests_require=tests_require,
extras_require={'test': tests_require},
ext_modules=get_extensions() if not BUILD_DOCS else [], ext_modules=get_extensions() if not BUILD_DOCS else [],
cmdclass={ cmdclass={
'build_ext': 'build_ext':
......
...@@ -2,11 +2,16 @@ from itertools import product ...@@ -2,11 +2,16 @@ from itertools import product
import pytest import pytest
import torch import torch
import scipy.spatial
from torch_cluster import knn, knn_graph from torch_cluster import knn, knn_graph
from .utils import grad_dtypes, devices, tensor from .utils import grad_dtypes, devices, tensor
def to_set(edge_index):
    """Convert a ``[2, E]`` edge-index tensor into a set of (row, col) pairs."""
    pairs = edge_index.t().tolist()
    return {(row, col) for row, col in pairs}
@pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices)) @pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices))
def test_knn(dtype, device): def test_knn(dtype, device):
x = tensor([ x = tensor([
...@@ -27,16 +32,15 @@ def test_knn(dtype, device): ...@@ -27,16 +32,15 @@ def test_knn(dtype, device):
batch_x = tensor([0, 0, 0, 0, 1, 1, 1, 1], torch.long, device) batch_x = tensor([0, 0, 0, 0, 1, 1, 1, 1], torch.long, device)
batch_y = tensor([0, 1], torch.long, device) batch_y = tensor([0, 1], torch.long, device)
row, col = knn(x, y, 2, batch_x, batch_y) edge_index = knn(x, y, 2)
col = col.view(-1, 2).sort(dim=-1)[0].view(-1) assert to_set(edge_index) == set([(0, 2), (0, 3), (1, 0), (1, 1)])
assert row.tolist() == [0, 0, 1, 1] edge_index = knn(x, y, 2, batch_x, batch_y)
assert col.tolist() == [2, 3, 4, 5] assert to_set(edge_index) == set([(0, 2), (0, 3), (1, 4), (1, 5)])
if x.is_cuda: if x.is_cuda:
row, col = knn(x, y, 2, batch_x, batch_y, cosine=True) edge_index = knn(x, y, 2, batch_x, batch_y, cosine=True)
assert row.tolist() == [0, 0, 1, 1] assert to_set(edge_index) == set([(0, 0), (0, 1), (1, 4), (1, 5)])
assert col.tolist() == [0, 1, 4, 5]
@pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices)) @pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices))
...@@ -48,12 +52,24 @@ def test_knn_graph(dtype, device): ...@@ -48,12 +52,24 @@ def test_knn_graph(dtype, device):
[+1, -1], [+1, -1],
], dtype, device) ], dtype, device)
row, col = knn_graph(x, k=2, flow='target_to_source') edge_index = knn_graph(x, k=2, flow='target_to_source')
col = col.view(-1, 2).sort(dim=-1)[0].view(-1) assert to_set(edge_index) == set([(0, 1), (0, 3), (1, 0), (1, 2), (2, 1),
assert row.tolist() == [0, 0, 1, 1, 2, 2, 3, 3] (2, 3), (3, 0), (3, 2)])
assert col.tolist() == [1, 3, 0, 2, 1, 3, 0, 2]
edge_index = knn_graph(x, k=2, flow='source_to_target')
assert to_set(edge_index) == set([(1, 0), (3, 0), (0, 1), (2, 1), (1, 2),
(3, 2), (0, 3), (2, 3)])
@pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices))
def test_knn_graph_large(dtype, device):
x = torch.randn(1000, 3)
edge_index = knn_graph(x, k=5, flow='target_to_source', loop=True,
num_workers=6)
tree = scipy.spatial.cKDTree(x.numpy())
_, col = tree.query(x.cpu(), k=5)
truth = set([(i, j) for i, ns in enumerate(col) for j in ns])
row, col = knn_graph(x, k=2, flow='source_to_target') assert to_set(edge_index) == truth
row = row.view(-1, 2).sort(dim=-1)[0].view(-1)
assert row.tolist() == [1, 3, 0, 2, 1, 3, 0, 2]
assert col.tolist() == [0, 0, 1, 1, 2, 2, 3, 3]
...@@ -2,16 +2,14 @@ from itertools import product ...@@ -2,16 +2,14 @@ from itertools import product
import pytest import pytest
import torch import torch
import scipy.spatial
from torch_cluster import radius, radius_graph from torch_cluster import radius, radius_graph
from .utils import grad_dtypes, devices, tensor from .utils import grad_dtypes, devices, tensor
def coalesce(index): def to_set(edge_index):
N = index.max().item() + 1 return set([(i, j) for i, j in edge_index.t().tolist()])
tensor = torch.sparse_coo_tensor(index, index.new_ones(index.size(1)),
torch.Size([N, N]))
return tensor.coalesce().indices()
@pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices)) @pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices))
...@@ -34,8 +32,13 @@ def test_radius(dtype, device): ...@@ -34,8 +32,13 @@ def test_radius(dtype, device):
batch_x = tensor([0, 0, 0, 0, 1, 1, 1, 1], torch.long, device) batch_x = tensor([0, 0, 0, 0, 1, 1, 1, 1], torch.long, device)
batch_y = tensor([0, 1], torch.long, device) batch_y = tensor([0, 1], torch.long, device)
out = radius(x, y, 2, batch_x, batch_y, max_num_neighbors=4) edge_index = radius(x, y, 2, max_num_neighbors=4)
assert coalesce(out).tolist() == [[0, 0, 0, 0, 1, 1], [0, 1, 2, 3, 5, 6]] assert to_set(edge_index) == set([(0, 0), (0, 1), (0, 2), (0, 3), (1, 1),
(1, 2), (1, 5), (1, 6)])
edge_index = radius(x, y, 2, batch_x, batch_y, max_num_neighbors=4)
assert to_set(edge_index) == set([(0, 0), (0, 1), (0, 2), (0, 3), (1, 5),
(1, 6)])
@pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices)) @pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices))
...@@ -47,12 +50,24 @@ def test_radius_graph(dtype, device): ...@@ -47,12 +50,24 @@ def test_radius_graph(dtype, device):
[+1, -1], [+1, -1],
], dtype, device) ], dtype, device)
row, col = radius_graph(x, r=2, flow='target_to_source') edge_index = radius_graph(x, r=2, flow='target_to_source')
col = col.view(-1, 2).sort(dim=-1)[0].view(-1) assert to_set(edge_index) == set([(0, 1), (0, 3), (1, 0), (1, 2), (2, 1),
assert row.tolist() == [0, 0, 1, 1, 2, 2, 3, 3] (2, 3), (3, 0), (3, 2)])
assert col.tolist() == [1, 3, 0, 2, 1, 3, 0, 2]
edge_index = radius_graph(x, r=2, flow='source_to_target')
assert to_set(edge_index) == set([(1, 0), (3, 0), (0, 1), (2, 1), (1, 2),
(3, 2), (0, 3), (2, 3)])
@pytest.mark.parametrize('dtype,device', product(grad_dtypes, devices))
def test_radius_graph_large(dtype, device):
x = torch.randn(1000, 3)
edge_index = radius_graph(x, r=0.5, flow='target_to_source', loop=True,
max_num_neighbors=1000, num_workers=6)
tree = scipy.spatial.cKDTree(x.numpy())
col = tree.query_ball_point(x.cpu(), r=0.5)
truth = set([(i, j) for i, ns in enumerate(col) for j in ns])
row, col = radius_graph(x, r=2, flow='source_to_target') assert to_set(edge_index) == truth
row = row.view(-1, 2).sort(dim=-1)[0].view(-1)
assert row.tolist() == [1, 3, 0, 2, 1, 3, 0, 2]
assert col.tolist() == [0, 0, 1, 1, 2, 2, 3, 3]
...@@ -17,6 +17,6 @@ def test_rw(device): ...@@ -17,6 +17,6 @@ def test_rw(device):
for n in range(start.size(0)): for n in range(start.size(0)):
cur = start[n].item() cur = start[n].item()
for l in range(1, walk_length): for i in range(1, walk_length):
assert out[n, l].item() in col[row == cur].tolist() assert out[n, i].item() in col[row == cur].tolist()
cur = out[n, l].item() cur = out[n, i].item()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment