Merge branch 'master' of https://github.com/rusty1s/pytorch_cluster

01a51d47 · Duc · be87cc98 · 817b767e · 01a51d47 · 01a51d47
Commit 01a51d47 authored Dec 05, 2020 by Duc
17 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,38 +10,51 @@ env:
    - CUDA_HOME=/usr/local/cuda
  jobs:
    # Python 3.6
+    - PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cpu
+    - PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu92
+    - PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu101
+    - PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu102
+    - PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu110
    - PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cpu
    - PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cu92
    - PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cu101
    - PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cu102
-    - PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cpu
-    - PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cu92
-    - PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cu101
-    - PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cu102
    # Python 3.7
+    - PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cpu
+    - PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu92
+    - PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu101
+    - PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu102
+    - PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu110
    - PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cpu
    - PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cu92
    - PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cu101
    - PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cu102
-    - PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cpu
-    - PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cu92
-    - PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cu101
-    - PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cu102
    # Python 3.8
+    - PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cpu
+    - PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu92
+    - PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu101
+    - PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu102
+    - PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu110
    - PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cpu
    - PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cu92
    - PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cu101
    - PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cu102
-    - PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cpu
-    - PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cu92
-    - PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cu101
-    - PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cu102
 jobs:
-  # For daily builds, we only test on *Python 3.6* and *CUDA 10.2*.
+  # For daily builds, we only test on Python 3.6 with CUDA 10.2/11.0.
  # For deployment, we exclude *all* macOS CUDA and Windows CUDA 9.2 jobs.
  exclude:
    # Python 3.6
+    - if: not (tag is present or commit_message =~ ci-deploy) and os != osx
+      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cpu
+    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
+      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu92
+    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
+      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu101
+    - if: os = osx
+      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu102
+    - if: os = osx
+      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.7.0 IDX=cu110
    - if: not (tag is present or commit_message =~ ci-deploy) and os != osx
      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cpu
    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
@@ -50,15 +63,17 @@ jobs:
      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cu101
    - if: os = osx
      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.6.0 IDX=cu102
-    - if: not (tag is present or commit_message =~ ci-deploy) and os != osx
+    # Python 3.7
-      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cpu
+    - if: not (tag is present or commit_message =~ ci-deploy)
+      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cpu
    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
-      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cu92
+      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu92
    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
-      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cu101
+      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu101
-    - if: os = osx
+    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
-      env: PYTHON_VERSION=3.6 TORCH_VERSION=1.5.0 IDX=cu102
+      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu102
-    # Python 3.7
+    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
+      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.7.0 IDX=cu110
    - if: not (tag is present or commit_message =~ ci-deploy)
      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cpu
    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
@@ -67,15 +82,17 @@ jobs:
      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cu101
    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.6.0 IDX=cu102
+    # Python 3.8
    - if: not (tag is present or commit_message =~ ci-deploy)
-      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cpu
+      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cpu
    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
-      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cu92
+      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu92
    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
-      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cu101
+      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu101
    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
-      env: PYTHON_VERSION=3.7 TORCH_VERSION=1.5.0 IDX=cu102
+      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu102
-    # Python 3.8
+    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
+      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.7.0 IDX=cu110
    - if: not (tag is present or commit_message =~ ci-deploy)
      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cpu
    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
@@ -84,14 +101,6 @@ jobs:
      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cu101
    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.6.0 IDX=cu102
-    - if: not (tag is present or commit_message =~ ci-deploy)
-      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cpu
-    - if: not (tag is present or commit_message =~ ci-deploy) or os != linux
-      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cu92
-    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
-      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cu101
-    - if: not (tag is present or commit_message =~ ci-deploy) or os = osx
-      env: PYTHON_VERSION=3.8 TORCH_VERSION=1.5.0 IDX=cu102
 install:
  - source script/gcc.sh

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.0)
 project(torchcluster)
 set(CMAKE_CXX_STANDARD 14)
-set(TORCHCLUSTER_VERSION 1.5.7)
+set(TORCHCLUSTER_VERSION 1.5.8)
 option(WITH_CUDA "Enable CUDA support" OFF)

--- a/README.md
+++ b/README.md
@@ -31,28 +31,29 @@ All included operations work on varying data types and are implemented both for
 We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://s3.eu-central-1.amazonaws.com/pytorch-geometric.com/whl/index.html).
-#### PyTorch 1.6.0
+#### PyTorch 1.7.0
-To install the binaries for PyTorch 1.6.0, simply run
+To install the binaries for PyTorch 1.7.0, simply run
 ```
-pip install torch-cluster==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.6.0.html
+pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.7.0+${CUDA}.html
 ```
-where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` depending on your PyTorch installation.
+where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, `cu102`, or `cu110` depending on your PyTorch installation.
-|             | `cpu` | `cu92` | `cu101` | `cu102` |
+|             | `cpu` | `cu92` | `cu101` | `cu102` | `cu110` |
-|-------------|-------|--------|---------|---------|
+|-------------|-------|--------|---------|---------|---------|
-| **Linux**   | ✅    | ✅     | ✅      | ✅      |
+| **Linux**   | ✅    | ✅     | ✅      | ✅      | ✅      |
-| **Windows** | ✅    | ❌     | ✅      | ✅      |
+| **Windows** | ✅    | ❌     | ✅      | ✅      | ✅      |
-| **macOS**   | ✅    |        |         |         |
+| **macOS**   | ✅    |        |         |         |         |
-#### PyTorch 1.5.0
-To install the binaries for PyTorch 1.5.0, simply run
+#### PyTorch 1.6.0
+To install the binaries for PyTorch 1.6.0, simply run
 ```
-pip install torch-cluster==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.5.0.html
+pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.6.0+${CUDA}.html
 ```
 where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` depending on your PyTorch installation.
@@ -63,21 +64,7 @@ where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` d
 | **Windows** | ✅    | ❌     | ✅      | ✅      |
 | **macOS**   | ✅    |        |         |         |
-#### PyTorch 1.4.0
+**Note:** Binaries of older versions are also provided for PyTorch 1.4.0 and PyTorch 1.5.0 (following the same procedure).
-To install the binaries for PyTorch 1.4.0, simply run
-```
-pip install torch-cluster==latest+${CUDA} -f https://pytorch-geometric.com/whl/torch-1.4.0.html
-```
-where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu100` or `cu101` depending on your PyTorch installation.
-|             | `cpu` | `cu92` | `cu100` | `cu101` |
-|-------------|-------|--------|---------|---------|
-| **Linux**   | ✅    | ✅     | ✅      | ✅      |
-| **Windows** | ✅    | ❌     | ❌      | ✅      |
-| **macOS**   | ✅    |        |         |         |
 ### From source

--- a/csrc/cluster.h
+++ b/csrc/cluster.h
@@ -23,9 +23,9 @@ torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
 torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
                     torch::Tensor ptr_y, double r, int64_t max_num_neighbors);
-torch::Tensor random_walk(torch::Tensor rowptr, torch::Tensor col,
+std::tuple<torch::Tensor, torch::Tensor>
-                          torch::Tensor start, int64_t walk_length, double p,
+random_walk(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
-                          double q);
+            int64_t walk_length, double p, double q);
 torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr,
                               int64_t count, double factor);
--- a/csrc/cpu/fps_cpu.cpp
+++ b/csrc/cpu/fps_cpu.cpp
 #include "fps_cpu.h"
+#include <ATen/Parallel.h>
 #include "utils.h"
 inline torch::Tensor get_dist(torch::Tensor x, int64_t idx) {
-  return (x - x[idx]).norm(2, 1);
+  // Squared Euclidean distance from x[idx] to every row of x (sum of squared
+  // differences along dim 1). No square root is taken: fps_cpu only feeds
+  // these values to argmax()/min(), which order them the same way.
+  return (x - x[idx]).pow_(2).sum(1);
 }
 torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
@@ -28,27 +30,29 @@ torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
  auto out_ptr_data = out_ptr.data_ptr<int64_t>();
  auto out_data = out.data_ptr<int64_t>();
-  int64_t src_start = 0, out_start = 0, src_end, out_end;
+  int64_t grain_size = 1; // Always parallelize over batch dimension.
-  for (auto b = 0; b < batch_size; b++) {
+  at::parallel_for(0, batch_size, grain_size, [&](int64_t begin, int64_t end) {
-    src_end = ptr_data[b + 1], out_end = out_ptr_data[b];
+    int64_t src_start, src_end, out_start, out_end;
+    for (int64_t b = begin; b < end; b++) {
+      src_start = ptr_data[b], src_end = ptr_data[b + 1];
+      out_start = b == 0 ? 0 : out_ptr_data[b - 1], out_end = out_ptr_data[b];
      auto y = src.narrow(0, src_start, src_end - src_start);
      int64_t start_idx = 0;
-    if (random_start) {
+      if (random_start)
        start_idx = rand() % y.size(0);
-    }
      out_data[out_start] = src_start + start_idx;
      auto dist = get_dist(y, start_idx);
-    for (auto i = 1; i < out_end - out_start; i++) {
+      for (int64_t i = 1; i < out_end - out_start; i++) {
        int64_t argmax = dist.argmax().data_ptr<int64_t>()[0];
        out_data[out_start + i] = src_start + argmax;
        dist = torch::min(dist, get_dist(y, argmax));
      }
-    src_start = src_end, out_start = out_end;
    }
+  });
  return out;
 }
--- a/csrc/cpu/rw_cpu.cpp
+++ b/csrc/cpu/rw_cpu.cpp
 #include "rw_cpu.h"
+#include <ATen/Parallel.h>
 #include "utils.h"
-torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
+// Uniform random walks over a graph in CSR form (rowptr/col).
+//
+// For each of the `numel` entries of `start`, writes the visited nodes to
+// n_out[n * (walk_length + 1) + step] and the traversed edge indices to
+// e_out[n * walk_length + step]. A node without outgoing edges is repeated
+// and its edge slot is set to -1.
+void uniform_sampling(const int64_t *rowptr, const int64_t *col,
-                              torch::Tensor start, int64_t walk_length,
+                      const int64_t *start, int64_t *n_out, int64_t *e_out,
-                              double p, double q) {
+                      const int64_t numel, const int64_t walk_length) {
+  // Pin the dtype explicitly: the buffer is read through data_ptr<float>(),
+  // which throws if the global default dtype is anything but float32.
+  auto rand = torch::rand({numel, walk_length}, torch::dtype(torch::kFloat));
+  auto rand_data = rand.data_ptr<float>();
+
+  // Guard the divisor so that a zero-length walk does not divide by zero.
+  int64_t grain_size =
+      at::internal::GRAIN_SIZE / (walk_length > 0 ? walk_length : 1);
+  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
+    for (auto n = begin; n < end; n++) {
+      int64_t n_cur = start[n], e_cur, row_start, row_end, idx;
+      n_out[n * (walk_length + 1)] = n_cur;
+      for (auto l = 0; l < walk_length; l++) {
+        row_start = rowptr[n_cur], row_end = rowptr[n_cur + 1];
+        if (row_end - row_start == 0) {
+          // Dead end: stay on the current node and mark the edge as missing.
+          e_cur = -1;
+        } else {
+          // Scale the pre-drawn uniform sample to an offset into the row.
+          idx = int64_t(rand_data[n * walk_length + l] * (row_end - row_start));
+          e_cur = row_start + idx;
+          n_cur = col[e_cur];
+        }
+        n_out[n * (walk_length + 1) + (l + 1)] = n_cur;
+        e_out[n * walk_length + l] = e_cur;
+      }
+    }
+  });
+}
+bool inline is_neighbor(const int64_t *rowptr, const int64_t *col, int64_t v,
+                        int64_t w) {
+  // Linear scan of v's CSR slice col[rowptr[v] .. rowptr[v + 1]) for w.
+  const int64_t *cur = col + rowptr[v];
+  const int64_t *const stop = col + rowptr[v + 1];
+  while (cur < stop) {
+    if (*cur == w)
+      return true;
+    ++cur;
+  }
+  return false;
+}
+// See: https://louisabraham.github.io/articles/node2vec-sampling.html
+void rejection_sampling(const int64_t *rowptr, const int64_t *col,
+                        int64_t *start, int64_t *n_out, int64_t *e_out,
+                        const int64_t numel, const int64_t walk_length,
+                        const double p, const double q) {
+  // Biased second-order (node2vec-style) random walks on a CSR graph.
+  //
+  // For each of the `numel` entries of `start`, writes the visited nodes to
+  // n_out[n * (walk_length + 1) + step] and the traversed edge indices to
+  // e_out[n * walk_length + step]. A node without outgoing edges is repeated
+  // and its edge slot is set to -1.
+  //
+  // With t the previous node and v the current one, a uniformly drawn
+  // candidate x is accepted with probability
+  //   prob_0 if x == t, prob_1 if t appears in x's row, prob_2 otherwise,
+  // i.e. the weights 1/p, 1 and 1/q scaled by their maximum.
+  //
+  // NOTE(review): rand() is called from inside at::parallel_for and is not
+  // guaranteed to be thread-safe -- consider a per-thread generator.
+
+  if (walk_length == 0) {
+    // Only the start column exists. The general path below would divide by
+    // zero for the grain size and write the first step out of bounds.
+    for (int64_t n = 0; n < numel; n++)
+      n_out[n] = start[n];
+    return;
+  }
+
+  double max_prob = fmax(fmax(1. / p, 1.), 1. / q);
+  double prob_0 = 1. / p / max_prob;
+  double prob_1 = 1. / max_prob;
+  double prob_2 = 1. / q / max_prob;
+
+  int64_t grain_size = at::internal::GRAIN_SIZE / walk_length;
+  at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
+    for (auto n = begin; n < end; n++) {
+      int64_t t = start[n], v, x, e_cur, row_start, row_end;
+      n_out[n * (walk_length + 1)] = t;
+
+      // First step: there is no previous node yet, so sample uniformly.
+      row_start = rowptr[t], row_end = rowptr[t + 1];
+      if (row_end - row_start == 0) {
+        e_cur = -1;
+        v = t;
+      } else {
+        e_cur = row_start + (rand() % (row_end - row_start));
+        v = col[e_cur];
+      }
+      n_out[n * (walk_length + 1) + 1] = v;
+      e_out[n * walk_length] = e_cur;
+
+      for (auto l = 1; l < walk_length; l++) {
+        row_start = rowptr[v], row_end = rowptr[v + 1];
+        if (row_end - row_start == 0) {
+          e_cur = -1;
+          x = v;
+        } else if (row_end - row_start == 1) {
+          // A single outgoing edge leaves nothing to reject.
+          e_cur = row_start;
+          x = col[e_cur];
+        } else {
+          while (true) {
+            e_cur = row_start + (rand() % (row_end - row_start));
+            x = col[e_cur];
+
+            // Dividing by RAND_MAX + 1 keeps r strictly inside [0, 1).
+            const double r = (double)rand() / ((double)RAND_MAX + 1.);
+
+            // Each candidate belongs to exactly one category and is tested
+            // against that category's probability only. Falling through to
+            // the next category after a failed test would skew the bias.
+            bool accept;
+            if (x == t)
+              accept = r < prob_0;
+            else if (is_neighbor(rowptr, col, x, t))
+              accept = r < prob_1;
+            else
+              accept = r < prob_2;
+
+            if (accept)
+              break;
+          }
+        }
+        n_out[n * (walk_length + 1) + (l + 1)] = x;
+        e_out[n * walk_length + l] = e_cur;
+        t = v;
+        v = x;
+      }
+    }
+  });
+}
+std::tuple<torch::Tensor, torch::Tensor>
+random_walk_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
+                int64_t walk_length, double p, double q) {
  CHECK_CPU(rowptr);
  CHECK_CPU(col);
  CHECK_CPU(start);
@@ -13,35 +118,22 @@ torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
  CHECK_INPUT(col.dim() == 1);
  CHECK_INPUT(start.dim() == 1);
-  auto rand = torch::rand({start.size(0), walk_length},
+  auto n_out = torch::empty({start.size(0), walk_length + 1}, start.options());
-                          start.options().dtype(torch::kFloat));
+  auto e_out = torch::empty({start.size(0), walk_length}, start.options());
-  auto out = torch::empty({start.size(0), walk_length + 1}, start.options());
  auto rowptr_data = rowptr.data_ptr<int64_t>();
  auto col_data = col.data_ptr<int64_t>();
  auto start_data = start.data_ptr<int64_t>();
-  auto rand_data = rand.data_ptr<float>();
+  auto n_out_data = n_out.data_ptr<int64_t>();
-  auto out_data = out.data_ptr<int64_t>();
+  auto e_out_data = e_out.data_ptr<int64_t>();
-  for (auto n = 0; n < start.size(0); n++) {
-    auto cur = start_data[n];
-    auto offset = n * (walk_length + 1);
-    out_data[offset] = cur;
-    int64_t row_start, row_end, rnd;
+  if (p == 1. && q == 1.) {
-    for (auto l = 1; l <= walk_length; l++) {
+    uniform_sampling(rowptr_data, col_data, start_data, n_out_data, e_out_data,
-      row_start = rowptr_data[cur], row_end = rowptr_data[cur + 1];
+                     start.numel(), walk_length);
-      if (row_end - row_start == 0) {
-        cur = n;
  } else {
-        rnd = int64_t(rand_data[n * walk_length + (l - 1)] *
+    rejection_sampling(rowptr_data, col_data, start_data, n_out_data,
-                      (row_end - row_start));
+                       e_out_data, start.numel(), walk_length, p, q);
-        cur = col_data[row_start + rnd];
-      }
-      out_data[offset + l] = cur;
-    }
  }
-  return out;
+  return std::make_tuple(n_out, e_out);
 }
--- a/csrc/cpu/rw_cpu.h
+++ b/csrc/cpu/rw_cpu.h
@@ -2,6 +2,6 @@
 #include <torch/extension.h>
-torch::Tensor random_walk_cpu(torch::Tensor rowptr, torch::Tensor col,
+std::tuple<torch::Tensor, torch::Tensor>
-                              torch::Tensor start, int64_t walk_length,
+random_walk_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
-                              double p, double q);
+                int64_t walk_length, double p, double q);
--- a/csrc/cuda/rw_cuda.cu
+++ b/csrc/cuda/rw_cuda.cu
 #include "rw_cuda.h"
 #include <ATen/cuda/CUDAContext.h>
+#include <curand.h>
+#include <curand_kernel.h>
 #include "utils.cuh"
 #define THREADS 1024
 #define BLOCKS(N) (N + THREADS - 1) / THREADS
-__global__ void uniform_random_walk_kernel(const int64_t *rowptr,
+// Uniform random walks, one thread per entry of `start`.
+//
+// Outputs are written step-major: n_out[step * numel + thread_idx] holds the
+// node visited at `step` and e_out[step * numel + thread_idx] the edge index
+// taken to leave it. `rand` supplies one pre-drawn float per (step, thread),
+// read at rand[step * numel + thread_idx].
+__global__ void uniform_sampling_kernel(const int64_t *rowptr,
                                        const int64_t *col,
-                                           const int64_t *start,
+                                        const int64_t *start, const float *rand,
-                                           const float *rand, int64_t *out,
+                                        int64_t *n_out, int64_t *e_out,
-                                           int64_t walk_length, int64_t numel) {
+                                        const int64_t walk_length,
+                                        const int64_t numel) {
+  const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (thread_idx < numel) {
+    int64_t n_cur = start[thread_idx], e_cur, row_start, row_end, rnd;
+    n_out[thread_idx] = n_cur;
+    for (int64_t l = 0; l < walk_length; l++) {
+      row_start = rowptr[n_cur], row_end = rowptr[n_cur + 1];
+      if (row_end - row_start == 0) {
+        // Dead end: n_cur is left unchanged and the edge is marked as -1.
+        e_cur = -1;
+      } else {
+        // Scale the uniform sample to an offset into n_cur's row.
+        rnd = int64_t(rand[l * numel + thread_idx] * (row_end - row_start));
+        e_cur = row_start + rnd;
+        n_cur = col[e_cur];
+      }
+      n_out[(l + 1) * numel + thread_idx] = n_cur;
+      e_out[l * numel + thread_idx] = e_cur;
+    }
+  }
+}
+// Biased (node2vec-style) random walks via rejection sampling, one thread per
+// entry of `start`.
+//
+// Outputs are written step-major: n_out[step * numel + thread_idx] and
+// e_out[step * numel + thread_idx]. A node without outgoing edges is repeated
+// and its edge slot is set to -1.
+//
+// With t the previous node and v the current one, a uniformly drawn candidate
+// x is accepted with probability prob_0 if x == t, prob_1 if t appears in x's
+// row, and prob_2 otherwise.
+__global__ void
+rejection_sampling_kernel(unsigned int seed, const int64_t *rowptr,
+                          const int64_t *col, const int64_t *start,
+                          int64_t *n_out, int64_t *e_out,
+                          const int64_t walk_length, const int64_t numel,
+                          const double p, const double q) {
+  // Acceptance probabilities: the weights 1/p, 1 and 1/q scaled by their max.
+  double max_prob = fmax(fmax(1. / p, 1.), 1. / q);
+  double prob_0 = 1. / p / max_prob;
+  double prob_1 = 1. / max_prob;
+  double prob_2 = 1. / q / max_prob;
  const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (thread_idx < numel) {
-    out[thread_idx] = start[thread_idx];
+    // Use the thread index as the curand subsequence so that every walk draws
+    // from its own random stream. With a shared subsequence all threads
+    // would generate exactly the same numbers.
+    curandState_t state;
+    curand_init(seed, thread_idx, 0, &state);
+    int64_t t = start[thread_idx], v, x, e_cur, row_start, row_end;
+    n_out[thread_idx] = t;
+    if (walk_length == 0)
+      return; // Only the start row exists; nothing else may be written.
+    // First step: there is no previous node yet, so sample uniformly.
+    row_start = rowptr[t], row_end = rowptr[t + 1];
+    if (row_end - row_start == 0) {
+      e_cur = -1;
+      v = t;
+    } else {
+      e_cur = row_start + (curand(&state) % (row_end - row_start));
+      v = col[e_cur];
+    }
-    int64_t row_start, row_end, i, cur;
+    n_out[numel + thread_idx] = v;
-    for (int64_t l = 1; l <= walk_length; l++) {
+    e_out[thread_idx] = e_cur;
-      i = (l - 1) * numel + thread_idx;
-      cur = out[i];
+    for (int64_t l = 1; l < walk_length; l++) {
-      row_start = rowptr[cur], row_end = rowptr[cur + 1];
+      row_start = rowptr[v], row_end = rowptr[v + 1];
      if (row_end - row_start == 0) {
-        out[l * numel + thread_idx] = cur;
+        e_cur = -1;
+        x = v;
+      } else if (row_end - row_start == 1) {
+        // A single outgoing edge leaves nothing to reject.
+        e_cur = row_start;
+        x = col[e_cur];
      } else {
-        out[l * numel + thread_idx] =
+        while (true) {
-            col[row_start + int64_t(rand[i] * (row_end - row_start))];
+          e_cur = row_start + (curand(&state) % (row_end - row_start));
+          x = col[e_cur];
+          double r = curand_uniform(&state); // (0, 1]
+          // Each candidate belongs to exactly one category and is tested
+          // against that category's probability only.
+          bool accept;
+          if (x == t) {
+            accept = r < prob_0;
+          } else {
+            // Scan x's row with its own bounds: row_start/row_end must keep
+            // describing v's row, since a rejected candidate is redrawn from
+            // it on the next iteration.
+            bool is_neighbor = false;
+            const int64_t x_end = rowptr[x + 1];
+            for (int64_t i = rowptr[x]; i < x_end; i++) {
+              if (col[i] == t) {
+                is_neighbor = true;
+                break;
+              }
+            }
+            accept = r < (is_neighbor ? prob_1 : prob_2);
+          }
+          if (accept)
+            break;
+        }
      }
+      n_out[(l + 1) * numel + thread_idx] = x;
+      e_out[l * numel + thread_idx] = e_cur;
+      t = v;
+      v = x;
    }
  }
 }
-torch::Tensor random_walk_cuda(torch::Tensor rowptr, torch::Tensor col,
+std::tuple<torch::Tensor, torch::Tensor>
-                               torch::Tensor start, int64_t walk_length,
+random_walk_cuda(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
-                               double p, double q) {
+                 int64_t walk_length, double p, double q) {
  CHECK_CUDA(rowptr);
  CHECK_CUDA(col);
  CHECK_CUDA(start);
@@ -45,15 +127,26 @@ torch::Tensor random_walk_cuda(torch::Tensor rowptr, torch::Tensor col,
  CHECK_INPUT(col.dim() == 1);
  CHECK_INPUT(start.dim() == 1);
+  auto n_out = torch::empty({walk_length + 1, start.size(0)}, start.options());
+  auto e_out = torch::empty({walk_length, start.size(0)}, start.options());
+  auto stream = at::cuda::getCurrentCUDAStream();
+  if (p == 1. && q == 1.) {
    auto rand = torch::rand({start.size(0), walk_length},
                            start.options().dtype(torch::kFloat));
-  auto out = torch::empty({walk_length + 1, start.size(0)}, start.options());
-  auto stream = at::cuda::getCurrentCUDAStream();
+    uniform_sampling_kernel<<<BLOCKS(start.numel()), THREADS, 0, stream>>>(
-  uniform_random_walk_kernel<<<BLOCKS(start.numel()), THREADS, 0, stream>>>(
        rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
        start.data_ptr<int64_t>(), rand.data_ptr<float>(),
-      out.data_ptr<int64_t>(), walk_length, start.numel());
+        n_out.data_ptr<int64_t>(), e_out.data_ptr<int64_t>(), walk_length,
+        start.numel());
+  } else {
+    rejection_sampling_kernel<<<BLOCKS(start.numel()), THREADS, 0, stream>>>(
+        time(NULL), rowptr.data_ptr<int64_t>(), col.data_ptr<int64_t>(),
+        start.data_ptr<int64_t>(), n_out.data_ptr<int64_t>(),
+        e_out.data_ptr<int64_t>(), walk_length, start.numel(), p, q);
+  }
-  return out.t().contiguous();
+  return std::make_tuple(n_out.t().contiguous(), e_out.t().contiguous());
 }
--- a/csrc/cuda/rw_cuda.h
+++ b/csrc/cuda/rw_cuda.h
@@ -2,6 +2,6 @@
 #include <torch/extension.h>
-torch::Tensor random_walk_cuda(torch::Tensor rowptr, torch::Tensor col,
+std::tuple<torch::Tensor, torch::Tensor>
-                               torch::Tensor start, int64_t walk_length,
+random_walk_cuda(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
-                               double p, double q);
+                 int64_t walk_length, double p, double q);
--- a/csrc/rw.cpp
+++ b/csrc/rw.cpp
@@ -11,9 +11,9 @@
 PyMODINIT_FUNC PyInit__rw(void) { return NULL; }
 #endif
-torch::Tensor random_walk(torch::Tensor rowptr, torch::Tensor col,
+std::tuple<torch::Tensor, torch::Tensor>
-                          torch::Tensor start, int64_t walk_length, double p,
+random_walk(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
-                          double q) {
+            int64_t walk_length, double p, double q) {
  if (rowptr.device().is_cuda()) {
 #ifdef WITH_CUDA
    return random_walk_cuda(rowptr, col, start, walk_length, p, q);

--- a/script/cuda.sh
+++ b/script/cuda.sh
@@ -8,16 +8,14 @@ if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu92" ]; then
  export CUDA_SHORT=9.2
  export CUDA=9.2.148-1
  export UBUNTU_VERSION=ubuntu1604
-  export CUBLAS=cuda-cublas-dev-9-2
  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
 fi
 if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu101" ]; then
  export IDX=cu101
  export CUDA_SHORT=10.1
-  export CUDA=10.1.105-1
+  export CUDA=10.1.243-1
  export UBUNTU_VERSION=ubuntu1804
-  export CUBLAS=libcublas-dev
  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
 fi
@@ -26,7 +24,11 @@ if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu102" ]; then
  export CUDA_SHORT=10.2
  export CUDA=10.2.89-1
  export UBUNTU_VERSION=ubuntu1804
-  export CUBLAS=libcublas-dev
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu110" ]; then
+  export CUDA_SHORT=11.0
  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
 fi
@@ -55,6 +57,13 @@ if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu102" ]; then
  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
 fi
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu110" ]; then
+  export CUDA_SHORT=11.0
+  export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.2/local_installers
+  export CUDA_FILE=cuda_${CUDA_SHORT}.2_451.48_win10.exe
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
 if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "$IDX" = "cpu" ]; then
  export TOOLKIT=""
 fi
@@ -65,14 +74,37 @@ else
  export FORCE_CUDA=1
 fi
-if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" != "cpu" ]; then
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" != "cpu" ] && [ "${IDX}" != "cu110" ]; then
-  INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
+  INSTALLER="cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb"
  wget -nv "http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}"
  sudo dpkg -i "${INSTALLER}"
  wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
  sudo apt-key add 7fa2af80.pub
  sudo apt update -qq
-  sudo apt install -y "cuda-core-${CUDA_SHORT/./-}" "cuda-cudart-dev-${CUDA_SHORT/./-}" "${CUBLAS}" "cuda-cusparse-dev-${CUDA_SHORT/./-}"
+  sudo apt install "cuda-core-${CUDA_SHORT/./-}" "cuda-nvcc-${CUDA_SHORT/./-}" "cuda-libraries-dev-${CUDA_SHORT/./-}"
+  sudo apt clean
+  CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
+  LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+  PATH=${CUDA_HOME}/bin:${PATH}
+  nvcc --version
+  # Fix cublas on CUDA 10.1:
+  if [ -d "/usr/local/cuda-10.2/targets/x86_64-linux/include" ]; then
+    sudo cp -r /usr/local/cuda-10.2/targets/x86_64-linux/include/* "${CUDA_HOME}/include/"
+  fi
+  if [ -d "/usr/local/cuda-10.2/targets/x86_64-linux/lib" ]; then
+    sudo cp -r /usr/local/cuda-10.2/targets/x86_64-linux/lib/* "${CUDA_HOME}/lib/"
+  fi
+fi
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" = "cu110" ]; then
+  wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
+  sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
+  wget -nv https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda-repo-ubuntu1804-11-0-local_11.0.3-450.51.06-1_amd64.deb
+  sudo dpkg -i cuda-repo-ubuntu1804-11-0-local_11.0.3-450.51.06-1_amd64.deb
+  sudo apt-key add /var/cuda-repo-ubuntu1804-11-0-local/7fa2af80.pub
+  sudo apt update -qq
+  sudo apt install cuda-nvcc-11-0 cuda-libraries-dev-11-0
  sudo apt clean
  CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
  LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
@@ -86,16 +118,11 @@ if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" != "cpu" ]; then
  curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
  7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"
-  # Install CUDA
+  # Install CUDA:
  wget -nv "${CUDA_URL}/${CUDA_FILE}"
-  PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
+  PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
  CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v${CUDA_SHORT}
  PATH=${CUDA_HOME}/bin:$PATH
  PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH
  nvcc --version
 fi
-# Fix Cuda9.2 on Windows: https://github.com/pytorch/pytorch/issues/6109
-if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" = "cu92" ]; then
-  sed -i.bak -e '129,141d' "${CUDA_HOME}/include/crt/host_config.h"
-fi
--- a/script/rename_wheel.py
+++ b/script/rename_wheel.py
@@ -5,7 +5,7 @@ import glob
 import shutil
 idx = sys.argv[1]
-assert idx in ['cpu', 'cu92', 'cu101', 'cu102']
+assert idx in ['cpu', 'cu92', 'cu101', 'cu102', 'cu110']
 dist_dir = osp.join(osp.dirname(osp.abspath(__file__)), '..', 'dist')
 wheels = glob.glob(osp.join('dist', '**', '*.whl'), recursive=True)

--- a/script/torch.sh
+++ b/script/torch.sh
 #!/bin/bash
-# Fix "member may not be initialized" error on Windows: https://github.com/pytorch/pytorch/issues/27958
-if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${TORCH_VERSION}" = "1.5.0" ]; then
-  echo "Fix nvcc for PyTorch 1.5.0"
-  sed -i.bak -e 's/constexpr/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/api/module.h
-  sed -i.bak -e 's/constexpr/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/runtime/argument_spec.h
-  sed -i.bak -e 's/return \*(this->value)/return \*((type\*)this->value)/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/pybind11/cast.h
-fi
 # https://github.com/pytorch/pytorch/commit/d2e16dd888a9b5fd55bd475d4fcffb70f388d4f0
-if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${TORCH_VERSION}" = "1.6.0" ]; then
+if [ "${TRAVIS_OS_NAME}" = "windows" ]; then
-  echo "Fix nvcc for PyTorch 1.6.0"
+  echo "Fix nvcc for PyTorch"
  sed -i.bak -e 's/CONSTEXPR_EXCEPT_WIN_CUDA/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/api/module.h
  sed -i.bak -e 's/return \*(this->value)/return \*((type\*)this->value)/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/pybind11/cast.h
 fi
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${TORCH_VERSION}" = "1.7.0" ]; then
+  echo "Fix nvcc for PyTorch 1.7.0"
+  sed -i.bak '/static constexpr Symbol Kind/d' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/ir/ir.h
+fi
--- a/setup.py
+++ b/setup.py
 import os
 import os.path as osp
+import sys
 import glob
 from setuptools import setup, find_packages
 import torch
+from torch.__config__ import parallel_info
 from torch.utils.cpp_extension import BuildExtension
 from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
@@ -20,6 +22,17 @@ def get_extensions():
    Extension = CppExtension
    define_macros = []
    extra_compile_args = {'cxx': []}
+    extra_link_args = []
+    info = parallel_info()
+    if 'parallel backend: OpenMP' in info and 'OpenMP not found' not in info:
+        extra_compile_args['cxx'] += ['-DAT_PARALLEL_OPENMP']
+        if sys.platform == 'win32':
+            extra_compile_args['cxx'] += ['/openmp']
+        else:
+            extra_compile_args['cxx'] += ['-fopenmp']
+    else:
+        print('Compiling without OpenMP...')
    if WITH_CUDA:
        Extension = CUDAExtension
@@ -51,6 +64,7 @@ def get_extensions():
            include_dirs=[extensions_dir],
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
+            extra_link_args=extra_link_args,
        )
        extensions += [extension]
@@ -63,7 +77,7 @@ tests_require = ['pytest', 'pytest-cov', 'scipy']
 setup(
    name='torch_cluster',
-    version='1.5.7',
+    version='1.5.8',
    author='Matthias Fey',
    author_email='matthias.fey@tu-dortmund.de',
    url='https://github.com/rusty1s/pytorch_cluster',

--- a/torch_cluster/__init__.py
+++ b/torch_cluster/__init__.py
@@ -3,7 +3,7 @@ import os.path as osp
 import torch
-__version__ = '1.5.7'
+__version__ = '1.5.8'
 for library in [
        '_version', '_grid', '_graclus', '_fps', '_rw', '_sampler', '_nearest',

--- a/torch_cluster/nearest.py
+++ b/torch_cluster/nearest.py
@@ -38,6 +38,7 @@ def nearest(x: torch.Tensor, y: torch.Tensor,
    x = x.view(-1, 1) if x.dim() == 1 else x
    y = y.view(-1, 1) if y.dim() == 1 else y
+    assert x.size(1) == y.size(1)
    if x.is_cuda:
        if batch_x is not None:
@@ -66,19 +67,13 @@ def nearest(x: torch.Tensor, y: torch.Tensor,
        return torch.ops.torch_cluster.nearest(x, y, ptr_x, ptr_y)
    else:
-        if batch_x is None:
+        # Translate and rescale x and y to [0, 1].
-            batch_x = x.new_zeros(x.size(0), dtype=torch.long)
+        if batch_x is not None and batch_y is not None:
-        if batch_y is None:
-            batch_y = y.new_zeros(y.size(0), dtype=torch.long)
            assert x.dim() == 2 and batch_x.dim() == 1
            assert y.dim() == 2 and batch_y.dim() == 1
-        assert x.size(1) == y.size(1)
            assert x.size(0) == batch_x.size(0)
            assert y.size(0) == batch_y.size(0)
-        # Translate and rescale x and y to [0, 1].
            min_xy = min(x.min().item(), y.min().item())
            x, y = x - min_xy, y - min_xy
@@ -87,8 +82,9 @@ def nearest(x: torch.Tensor, y: torch.Tensor,
            y.div_(max_xy)
            # Concat batch/features to ensure no cross-links between examples.
-        x = torch.cat([x, 2 * x.size(1) * batch_x.view(-1, 1).to(x.dtype)], -1)
+            D = x.size(-1)
-        y = torch.cat([y, 2 * y.size(1) * batch_y.view(-1, 1).to(y.dtype)], -1)
+            x = torch.cat([x, 2 * D * batch_x.view(-1, 1).to(x.dtype)], -1)
+            y = torch.cat([y, 2 * D * batch_y.view(-1, 1).to(y.dtype)], -1)
        return torch.from_numpy(
            scipy.cluster.vq.vq(x.detach().cpu(),

--- a/torch_cluster/rw.py
+++ b/torch_cluster/rw.py
-import warnings
 from typing import Optional
 import torch
+from torch import Tensor
 @torch.jit.script
-def random_walk(row: torch.Tensor, col: torch.Tensor, start: torch.Tensor,
+def random_walk(row: Tensor, col: Tensor, start: Tensor, walk_length: int,
-                walk_length: int, p: float = 1, q: float = 1,
+                p: float = 1, q: float = 1, coalesced: bool = True,
-                coalesced: bool = True, num_nodes: Optional[int] = None):
+                num_nodes: Optional[int] = None) -> Tensor:
    """Samples random walks of length :obj:`walk_length` from all node indices
    in :obj:`start` in the graph given by :obj:`(row, col)` as described in the
    `"node2vec: Scalable Feature Learning for Networks"
@@ -43,10 +43,5 @@ def random_walk(row: torch.Tensor, col: torch.Tensor, start: torch.Tensor,
    rowptr = row.new_zeros(num_nodes + 1)
    torch.cumsum(deg, 0, out=rowptr[1:])
-    if p != 1. or q != 1.:  # pragma: no cover
-        warnings.warn('Parameters `p` and `q` are not supported yet and will'
-                      'be restored to their default values `p=1` and `q=1`.')
-        p = q = 1.
    return torch.ops.torch_cluster.random_walk(rowptr, col, start, walk_length,
-                                               p, q)
+                                               p, q)[0]