Commit c67425b0 authored by quyuanhao123

Initial commit

Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
include README.md
include LICENSE
recursive-exclude test *
recursive-include csrc *
Metadata-Version: 2.1
Name: torch_cluster
Version: 1.6.0
Summary: PyTorch Extension Library of Optimized Graph Cluster Algorithms
Home-page: https://github.com/rusty1s/pytorch_cluster
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Download-URL: https://github.com/rusty1s/pytorch_cluster/archive/1.6.0.tar.gz
Description: [pypi-image]: https://badge.fury.io/py/torch-cluster.svg
[pypi-url]: https://pypi.python.org/pypi/torch-cluster
[testing-image]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_cluster/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_cluster?branch=master
# PyTorch Cluster
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of highly optimized graph cluster algorithms for use in [PyTorch](http://pytorch.org/).
The package consists of the following clustering algorithms:
* **[Graclus](#graclus)** from Dhillon *et al.*: [Weighted Graph Cuts without Eigenvectors: A Multilevel Approach](http://www.cs.utexas.edu/users/inderjit/public_papers/multilevel_pami.pdf) (PAMI 2007)
* **[Voxel Grid Pooling](#voxelgrid)** from, *e.g.*, Simonovsky and Komodakis: [Dynamic Edge-Conditioned Filters in Convolutional Neural Networks on Graphs](https://arxiv.org/abs/1704.02901) (CVPR 2017)
* **[Iterative Farthest Point Sampling](#farthestpointsampling)** from, *e.g.* Qi *et al.*: [PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space](https://arxiv.org/abs/1706.02413) (NIPS 2017)
* **[k-NN](#knn-graph)** and **[Radius](#radius-graph)** graph generation
* Clustering based on **[Nearest](#nearest)** points
* **[Random Walk Sampling](#randomwalk-sampling)** from, *e.g.*, Grover and Leskovec: [node2vec: Scalable Feature Learning for Networks](https://arxiv.org/abs/1607.00653) (KDD 2016)
All included operations work on varying data types and are implemented both for CPU and GPU.
## Installation
### Anaconda
**Update:** You can now install `pytorch-cluster` via [Anaconda](https://anaconda.org/pyg/pytorch-cluster) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-cluster -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations; see [here](https://data.pyg.org/whl).
#### PyTorch 1.11
To install the binaries for PyTorch 1.11.0, simply run
```
pip install torch-cluster -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu113`, or `cu115` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` | `cu115` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | | ✅ | ✅ |
| **macOS** | ✅ | | | |
#### PyTorch 1.10
To install the binaries for PyTorch 1.10.0, PyTorch 1.10.1 and PyTorch 1.10.2, simply run
```
pip install torch-cluster -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu111`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` | `cu113` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1 and PyTorch 1.9.0 (following the same procedure).
For older versions, you might need to explicitly specify the latest supported version number in order to prevent pip from falling back to an installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-cluster
```
When running inside a Docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
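Once the build finishes, a quick sanity check (a minimal sketch; it assumes the package exposes a `__version__` attribute, as recent releases do) is to import the extension:
```python
import torch
import torch_cluster  # fails here if the compiled extension could not be loaded

print(torch.__version__)
print(torch_cluster.__version__)  # assumed attribute, e.g. '1.6.0'
```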
## Functions
### Graclus
A greedy clustering algorithm that picks an unmarked vertex and matches it with one of its unmarked neighbors (the one that maximizes its edge weight).
The GPU algorithm is adapted from Fagginger Auer and Bisseling: [A GPU Algorithm for Greedy Graph Matching](http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf) (LNCS 2012).
```python
import torch
from torch_cluster import graclus_cluster
row = torch.tensor([0, 1, 1, 2])
col = torch.tensor([1, 0, 2, 1])
weight = torch.tensor([1., 1., 1., 1.]) # Optional edge weights.
cluster = graclus_cluster(row, col, weight)
```
```
print(cluster)
tensor([0, 0, 1])
```
### VoxelGrid
A clustering algorithm that overlays a regular grid of user-defined size over a point cloud and clusters all points falling into the same voxel.
```python
import torch
from torch_cluster import grid_cluster
pos = torch.tensor([[0., 0.], [11., 9.], [2., 8.], [2., 2.], [8., 3.]])
size = torch.Tensor([5, 5])
cluster = grid_cluster(pos, size)
```
```
print(cluster)
tensor([0, 5, 3, 0, 1])
```
### FarthestPointSampling
A sampling algorithm that iteratively samples the point most distant from the points already sampled.
```python
import torch
from torch_cluster import fps
x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0])
index = fps(x, batch, ratio=0.5, random_start=False)
```
```
print(index)
tensor([0, 3])
```
### kNN-Graph
Computes graph edges to the nearest *k* points.
**Args:**
* **x** *(Tensor)*: Node feature matrix of shape `[N, F]`.
* **k** *(int)*: The number of neighbors.
* **batch** *(LongTensor, optional)*: Batch vector of shape `[N]`, which assigns each node to a specific example. `batch` needs to be sorted. (default: `None`)
* **loop** *(bool, optional)*: If `True`, the graph will contain self-loops. (default: `False`)
* **flow** *(string, optional)*: The flow direction when using in combination with message passing (`"source_to_target"` or `"target_to_source"`). (default: `"source_to_target"`)
* **cosine** *(boolean, optional)*: If `True`, will use the Cosine distance instead of Euclidean distance to find nearest neighbors. (default: `False`)
* **num_workers** *(int)*: Number of workers to use for computation. Has no effect in case `batch` is not `None`, or the input lies on the GPU. (default: `1`)
```python
import torch
from torch_cluster import knn_graph
x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0])
edge_index = knn_graph(x, k=2, batch=batch, loop=False)
```
```
print(edge_index)
tensor([[1, 2, 0, 3, 0, 3, 1, 2],
[0, 0, 1, 1, 2, 2, 3, 3]])
```
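In the output above, the first row of `edge_index` holds the neighbor indices and the second row the query nodes, matching the default `flow="source_to_target"`. A minimal sketch of the opposite setting (an assumption: the other flow direction is expected to swap the roles of the two rows):
```python
# Hedged example: with flow="target_to_source", the query nodes are assumed to
# appear in edge_index[0] and their neighbors in edge_index[1].
edge_index = knn_graph(x, k=2, batch=batch, loop=False, flow="target_to_source")
```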
### Radius-Graph
Computes graph edges to all points within a given distance.
**Args:**
* **x** *(Tensor)*: Node feature matrix of shape `[N, F]`.
* **r** *(float)*: The radius.
* **batch** *(LongTensor, optional)*: Batch vector of shape `[N]`, which assigns each node to a specific example. `batch` needs to be sorted. (default: `None`)
* **loop** *(bool, optional)*: If `True`, the graph will contain self-loops. (default: `False`)
* **max_num_neighbors** *(int, optional)*: The maximum number of neighbors to return for each element. If the number of actual neighbors is greater than `max_num_neighbors`, returned neighbors are picked randomly. (default: `32`)
* **flow** *(string, optional)*: The flow direction when using in combination with message passing (`"source_to_target"` or `"target_to_source"`). (default: `"source_to_target"`)
* **num_workers** *(int)*: Number of workers to use for computation. Has no effect in case `batch` is not `None`, or the input lies on the GPU. (default: `1`)
```python
import torch
from torch_cluster import radius_graph
x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0])
edge_index = radius_graph(x, r=2.5, batch=batch, loop=False)
```
```
print(edge_index)
tensor([[1, 2, 0, 3, 0, 3, 1, 2],
[0, 0, 1, 1, 2, 2, 3, 3]])
```
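The `max_num_neighbors` argument caps how many of these edges are kept per query point; a small sketch using the same toy data:
```python
# Keep at most one neighbor per node inside the radius; if more candidates
# exist, the returned neighbors are picked randomly (see the argument above).
edge_index = radius_graph(x, r=2.5, batch=batch, loop=False, max_num_neighbors=1)
```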
### Nearest
Clusters points in *x* together which are nearest to a given query point in *y*.
`batch_{x,y}` vectors need to be sorted.
```python
import torch
from torch_cluster import nearest
x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
batch_x = torch.tensor([0, 0, 0, 0])
y = torch.Tensor([[-1, 0], [1, 0]])
batch_y = torch.tensor([0, 0])
cluster = nearest(x, y, batch_x, batch_y)
```
```
print(cluster)
tensor([0, 0, 1, 1])
```
### RandomWalk-Sampling
Samples random walks of length `walk_length` from all node indices in `start` in the graph given by `(row, col)`.
```python
import torch
from torch_cluster import random_walk
row = torch.tensor([0, 1, 1, 1, 2, 2, 3, 3, 4, 4])
col = torch.tensor([1, 0, 2, 3, 1, 4, 1, 4, 2, 3])
start = torch.tensor([0, 1, 2, 3, 4])
walk = random_walk(row, col, start, walk_length=3)
```
```
print(walk)
tensor([[0, 1, 2, 4],
[1, 3, 4, 2],
[2, 4, 2, 1],
[3, 4, 2, 4],
[4, 3, 1, 0]])
```
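The C++ signature listed further below also takes node2vec return and in-out parameters `p` and `q`; a hedged sketch, assuming the Python wrapper forwards them as keyword arguments:
```python
# Biased (node2vec-style) walks: a smaller return parameter p makes revisiting
# the previous node more likely, while q > 1 keeps the walk local and q < 1
# pushes it outward. The keyword names are assumed from the C++ signature.
walk = random_walk(row, col, start, walk_length=3, p=0.25, q=4.0)
```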
## Running tests
```
pytest
```
## C++ API
`torch-cluster` also offers a C++ API that contains C++ equivalents of the Python functions.
```
mkdir build
cd build
# Add -DWITH_CUDA=on to enable CUDA support if needed
cmake ..
make
make install
```
Keywords: pytorch,geometric-deep-learning,graph-neural-networks,cluster-algorithms
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: test
#pragma once
#include <torch/extension.h>
int64_t cuda_version();
torch::Tensor fps(torch::Tensor src, torch::Tensor ptr, double ratio,
bool random_start);
torch::Tensor graclus(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight);
torch::Tensor grid(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end);
torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, int64_t k, bool cosine);
torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y);
torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, double r, int64_t max_num_neighbors);
std::tuple<torch::Tensor, torch::Tensor>
random_walk(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
int64_t walk_length, double p, double q);
torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor);
#include "fps_cpu.h"
#include <ATen/Parallel.h>
#include "utils.h"
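// Squared Euclidean distance from every point in `x` to the point at `idx`.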
inline torch::Tensor get_dist(torch::Tensor x, int64_t idx) {
return (x - x[idx]).pow_(2).sum(1);
}
torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
bool random_start) {
CHECK_CPU(src);
CHECK_CPU(ptr);
CHECK_CPU(ratio);
CHECK_INPUT(ptr.dim() == 1);
src = src.view({src.size(0), -1}).contiguous();
ptr = ptr.contiguous();
auto batch_size = ptr.numel() - 1;
auto deg = ptr.narrow(0, 1, batch_size) - ptr.narrow(0, 0, batch_size);
auto out_ptr = deg.toType(torch::kFloat) * ratio;
out_ptr = out_ptr.ceil().toType(torch::kLong).cumsum(0);
auto out = torch::empty(out_ptr[-1].data_ptr<int64_t>()[0], ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>();
auto out_ptr_data = out_ptr.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
int64_t grain_size = 1; // Always parallelize over batch dimension.
at::parallel_for(0, batch_size, grain_size, [&](int64_t begin, int64_t end) {
int64_t src_start, src_end, out_start, out_end;
for (int64_t b = begin; b < end; b++) {
src_start = ptr_data[b], src_end = ptr_data[b + 1];
out_start = b == 0 ? 0 : out_ptr_data[b - 1], out_end = out_ptr_data[b];
auto y = src.narrow(0, src_start, src_end - src_start);
int64_t start_idx = 0;
if (random_start)
start_idx = rand() % y.size(0);
out_data[out_start] = src_start + start_idx;
auto dist = get_dist(y, start_idx);
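// Iteratively pick the farthest remaining point and fold its distances into
// the running minimum-distance vector.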
for (int64_t i = 1; i < out_end - out_start; i++) {
int64_t argmax = dist.argmax().data_ptr<int64_t>()[0];
out_data[out_start + i] = src_start + argmax;
dist = torch::min(dist, get_dist(y, argmax));
}
}
});
return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
bool random_start);
#include "graclus_cpu.h"
#include "utils.h"
torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight) {
CHECK_CPU(rowptr);
CHECK_CPU(col);
CHECK_INPUT(rowptr.dim() == 1 && col.dim() == 1);
if (optional_weight.has_value()) {
CHECK_CPU(optional_weight.value());
CHECK_INPUT(optional_weight.value().dim() == 1);
CHECK_INPUT(optional_weight.value().numel() == col.numel());
}
int64_t num_nodes = rowptr.numel() - 1;
auto out = torch::full(num_nodes, -1, rowptr.options());
auto node_perm = torch::randperm(num_nodes, rowptr.options());
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto node_perm_data = node_perm.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
if (!optional_weight.has_value()) {
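// Unweighted case: match each unmarked vertex with its first unmarked neighbor.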
for (int64_t n = 0; n < num_nodes; n++) {
auto u = node_perm_data[n];
if (out_data[u] >= 0)
continue;
out_data[u] = u;
int64_t row_start = rowptr_data[u], row_end = rowptr_data[u + 1];
for (auto e = 0; e < row_end - row_start; e++) {
auto v = col_data[row_start + e];
if (out_data[v] >= 0)
continue;
out_data[u] = std::min(u, v);
out_data[v] = std::min(u, v);
break;
}
}
} else {
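// Weighted case: match each unmarked vertex with the unmarked neighbor of
// maximal edge weight.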
auto weight = optional_weight.value();
auto scalar_type = weight.scalar_type();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] {
auto weight_data = weight.data_ptr<scalar_t>();
for (auto n = 0; n < num_nodes; n++) {
auto u = node_perm_data[n];
if (out_data[u] >= 0)
continue;
auto v_max = u;
scalar_t w_max = (scalar_t)0.;
for (auto e = rowptr_data[u]; e < rowptr_data[u + 1]; e++) {
auto v = col_data[e];
if (out_data[v] >= 0)
continue;
if (weight_data[e] >= w_max) {
v_max = v;
w_max = weight_data[e];
}
}
out_data[u] = std::min(u, v_max);
out_data[v_max] = std::min(u, v_max);
}
});
}
return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight);
#include "grid_cpu.h"
#include "utils.h"
torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end) {
CHECK_CPU(pos);
CHECK_CPU(size);
if (optional_start.has_value())
CHECK_CPU(optional_start.value());
if (optional_end.has_value())
CHECK_CPU(optional_end.value());
pos = pos.view({pos.size(0), -1});
CHECK_INPUT(size.numel() == pos.size(1));
if (!optional_start.has_value())
optional_start = std::get<0>(pos.min(0));
else
CHECK_INPUT(optional_start.value().numel() == pos.size(1));
if (!optional_end.has_value())
optional_end = std::get<0>(pos.max(0));
else
CHECK_INPUT(optional_end.value().numel() == pos.size(1));
auto start = optional_start.value();
auto end = optional_end.value();
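// Shift positions so that `start` maps to the origin; the cumulative products
// of the per-dimension voxel counts then act as strides that flatten voxel
// coordinates into a single cluster index.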
pos = pos - start.unsqueeze(0);
auto num_voxels = (end - start).true_divide(size).toType(torch::kLong) + 1;
num_voxels = num_voxels.cumprod(0);
num_voxels =
torch::cat({torch::ones(1, num_voxels.options()), num_voxels}, 0);
num_voxels = num_voxels.narrow(0, 0, size.size(0));
auto out = pos.true_divide(size.view({1, -1})).toType(torch::kLong);
out *= num_voxels.view({1, -1});
out = out.sum(1);
return out;
}
#pragma once
#include <torch/extension.h>
torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end);
#include "knn_cpu.h"
#include "utils.h"
#include "utils/KDTreeVectorOfVectorsAdaptor.h"
#include "utils/nanoflann.hpp"
torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, int64_t k,
int64_t num_workers) {
CHECK_CPU(x);
CHECK_INPUT(x.dim() == 2);
CHECK_CPU(y);
CHECK_INPUT(y.dim() == 2);
if (ptr_x.has_value()) {
CHECK_CPU(ptr_x.value());
CHECK_INPUT(ptr_x.value().dim() == 1);
}
if (ptr_y.has_value()) {
CHECK_CPU(ptr_y.value());
CHECK_INPUT(ptr_y.value().dim() == 1);
}
std::vector<size_t> out_vec = std::vector<size_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, x.scalar_type(), "_", [&] {
// See: nanoflann/examples/vector_of_vectors_example.cpp
auto x_data = x.data_ptr<scalar_t>();
auto y_data = y.data_ptr<scalar_t>();
typedef std::vector<std::vector<scalar_t>> vec_t;
if (!ptr_x.has_value()) { // Single example.
vec_t pts(x.size(0));
for (int64_t i = 0; i < x.size(0); i++) {
pts[i].resize(x.size(1));
for (int64_t j = 0; j < x.size(1); j++) {
pts[i][j] = x_data[i * x.size(1) + j];
}
}
typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
my_kd_tree_t mat_index(x.size(1), pts, 10);
mat_index.index->buildIndex();
std::vector<size_t> ret_index(k);
std::vector<scalar_t> out_dist_sqr(k);
for (int64_t i = 0; i < y.size(0); i++) {
size_t num_matches = mat_index.index->knnSearch(
y_data + i * y.size(1), k, &ret_index[0], &out_dist_sqr[0]);
for (size_t j = 0; j < num_matches; j++) {
out_vec.push_back(ret_index[j]);
out_vec.push_back(i);
}
}
} else { // Batch-wise.
auto ptr_x_data = ptr_x.value().data_ptr<int64_t>();
auto ptr_y_data = ptr_y.value().data_ptr<int64_t>();
for (int64_t b = 0; b < ptr_x.value().size(0) - 1; b++) {
auto x_start = ptr_x_data[b], x_end = ptr_x_data[b + 1];
auto y_start = ptr_y_data[b], y_end = ptr_y_data[b + 1];
if (x_start == x_end || y_start == y_end)
continue;
vec_t pts(x_end - x_start);
for (int64_t i = 0; i < x_end - x_start; i++) {
pts[i].resize(x.size(1));
for (int64_t j = 0; j < x.size(1); j++) {
pts[i][j] = x_data[(i + x_start) * x.size(1) + j];
}
}
typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
my_kd_tree_t mat_index(x.size(1), pts, 10);
mat_index.index->buildIndex();
std::vector<size_t> ret_index(k);
std::vector<scalar_t> out_dist_sqr(k);
for (int64_t i = y_start; i < y_end; i++) {
size_t num_matches = mat_index.index->knnSearch(
y_data + i * y.size(1), k, &ret_index[0], &out_dist_sqr[0]);
for (size_t j = 0; j < num_matches; j++) {
out_vec.push_back(x_start + ret_index[j]);
out_vec.push_back(i);
}
}
}
}
});
const int64_t size = out_vec.size() / 2;
auto out = torch::from_blob(out_vec.data(), {size, 2},
x.options().dtype(torch::kLong));
return out.t().index_select(0, torch::tensor({1, 0}));
}
#pragma once
#include <torch/extension.h>
torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, int64_t k,
int64_t num_workers);
#include "radius_cpu.h"
#include "utils.h"
#include "utils/KDTreeVectorOfVectorsAdaptor.h"
#include "utils/nanoflann.hpp"
torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, double r,
int64_t max_num_neighbors, int64_t num_workers) {
CHECK_CPU(x);
CHECK_INPUT(x.dim() == 2);
CHECK_CPU(y);
CHECK_INPUT(y.dim() == 2);
if (ptr_x.has_value()) {
CHECK_CPU(ptr_x.value());
CHECK_INPUT(ptr_x.value().dim() == 1);
}
if (ptr_y.has_value()) {
CHECK_CPU(ptr_y.value());
CHECK_INPUT(ptr_y.value().dim() == 1);
}
std::vector<size_t> out_vec = std::vector<size_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, x.scalar_type(), "_", [&] {
// See: nanoflann/examples/vector_of_vectors_example.cpp
auto x_data = x.data_ptr<scalar_t>();
auto y_data = y.data_ptr<scalar_t>();
typedef std::vector<std::vector<scalar_t>> vec_t;
nanoflann::SearchParams params;
params.sorted = false;
if (!ptr_x.has_value()) { // Single example.
vec_t pts(x.size(0));
for (int64_t i = 0; i < x.size(0); i++) {
pts[i].resize(x.size(1));
for (int64_t j = 0; j < x.size(1); j++) {
pts[i][j] = x_data[i * x.size(1) + j];
}
}
typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
my_kd_tree_t mat_index(x.size(1), pts, 10);
mat_index.index->buildIndex();
for (int64_t i = 0; i < y.size(0); i++) {
std::vector<std::pair<size_t, scalar_t>> ret_matches;
size_t num_matches = mat_index.index->radiusSearch(
y_data + i * y.size(1), r * r, ret_matches, params);
for (size_t j = 0; j < std::min(num_matches, (size_t)max_num_neighbors);
j++) {
out_vec.push_back(ret_matches[j].first);
out_vec.push_back(i);
}
}
} else { // Batch-wise.
auto ptr_x_data = ptr_x.value().data_ptr<int64_t>();
auto ptr_y_data = ptr_y.value().data_ptr<int64_t>();
for (int64_t b = 0; b < ptr_x.value().size(0) - 1; b++) {
auto x_start = ptr_x_data[b], x_end = ptr_x_data[b + 1];
auto y_start = ptr_y_data[b], y_end = ptr_y_data[b + 1];
if (x_start == x_end || y_start == y_end)
continue;
vec_t pts(x_end - x_start);
for (int64_t i = 0; i < x_end - x_start; i++) {
pts[i].resize(x.size(1));
for (int64_t j = 0; j < x.size(1); j++) {
pts[i][j] = x_data[(i + x_start) * x.size(1) + j];
}
}
typedef KDTreeVectorOfVectorsAdaptor<vec_t, scalar_t> my_kd_tree_t;
my_kd_tree_t mat_index(x.size(1), pts, 10);
mat_index.index->buildIndex();
for (int64_t i = y_start; i < y_end; i++) {
std::vector<std::pair<size_t, scalar_t>> ret_matches;
size_t num_matches = mat_index.index->radiusSearch(
y_data + i * y.size(1), r * r, ret_matches, params);
for (size_t j = 0;
j < std::min(num_matches, (size_t)max_num_neighbors); j++) {
out_vec.push_back(x_start + ret_matches[j].first);
out_vec.push_back(i);
}
}
}
}
});
const int64_t size = out_vec.size() / 2;
auto out = torch::from_blob(out_vec.data(), {size, 2},
x.options().dtype(torch::kLong));
return out.t().index_select(0, torch::tensor({1, 0}));
}
#pragma once
#include <torch/extension.h>
torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_x,
torch::optional<torch::Tensor> ptr_y, double r,
int64_t max_num_neighbors, int64_t num_workers);
#include "rw_cpu.h"
#include <ATen/Parallel.h>
#include "utils.h"
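// Uniform walks: at each step jump to a uniformly sampled neighbor; nodes
// without neighbors repeat themselves and record edge id -1.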
void uniform_sampling(const int64_t *rowptr, const int64_t *col,
const int64_t *start, int64_t *n_out, int64_t *e_out,
const int64_t numel, const int64_t walk_length) {
auto rand = torch::rand({numel, walk_length});
auto rand_data = rand.data_ptr<float>();
int64_t grain_size = at::internal::GRAIN_SIZE / walk_length;
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
for (auto n = begin; n < end; n++) {
int64_t n_cur = start[n], e_cur, row_start, row_end, idx;
n_out[n * (walk_length + 1)] = n_cur;
for (auto l = 0; l < walk_length; l++) {
row_start = rowptr[n_cur], row_end = rowptr[n_cur + 1];
if (row_end - row_start == 0) {
e_cur = -1;
} else {
idx = int64_t(rand_data[n * walk_length + l] * (row_end - row_start));
e_cur = row_start + idx;
n_cur = col[e_cur];
}
n_out[n * (walk_length + 1) + (l + 1)] = n_cur;
e_out[n * walk_length + l] = e_cur;
}
}
});
}
bool inline is_neighbor(const int64_t *rowptr, const int64_t *col, int64_t v,
int64_t w) {
int64_t row_start = rowptr[v], row_end = rowptr[v + 1];
for (auto i = row_start; i < row_end; i++) {
if (col[i] == w)
return true;
}
return false;
}
// See: https://louisabraham.github.io/articles/node2vec-sampling.html
void rejection_sampling(const int64_t *rowptr, const int64_t *col,
int64_t *start, int64_t *n_out, int64_t *e_out,
const int64_t numel, const int64_t walk_length,
const double p, const double q) {
double max_prob = fmax(fmax(1. / p, 1.), 1. / q);
double prob_0 = 1. / p / max_prob;
double prob_1 = 1. / max_prob;
double prob_2 = 1. / q / max_prob;
int64_t grain_size = at::internal::GRAIN_SIZE / walk_length;
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
for (auto n = begin; n < end; n++) {
int64_t t = start[n], v, x, e_cur, row_start, row_end;
n_out[n * (walk_length + 1)] = t;
row_start = rowptr[t], row_end = rowptr[t + 1];
if (row_end - row_start == 0) {
e_cur = -1;
v = t;
} else {
e_cur = row_start + (rand() % (row_end - row_start));
v = col[e_cur];
}
n_out[n * (walk_length + 1) + 1] = v;
e_out[n * walk_length] = e_cur;
for (auto l = 1; l < walk_length; l++) {
row_start = rowptr[v], row_end = rowptr[v + 1];
if (row_end - row_start == 0) {
e_cur = -1;
x = v;
} else if (row_end - row_start == 1) {
e_cur = row_start;
x = col[e_cur];
} else {
while (true) {
e_cur = row_start + (rand() % (row_end - row_start));
x = col[e_cur];
auto r = ((double)rand() / (RAND_MAX)); // uniform in [0, 1]
if (x == t && r < prob_0)
break;
else if (is_neighbor(rowptr, col, x, t) && r < prob_1)
break;
else if (r < prob_2)
break;
}
}
n_out[n * (walk_length + 1) + (l + 1)] = x;
e_out[n * walk_length + l] = e_cur;
t = v;
v = x;
}
}
});
}
std::tuple<torch::Tensor, torch::Tensor>
random_walk_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
int64_t walk_length, double p, double q) {
CHECK_CPU(rowptr);
CHECK_CPU(col);
CHECK_CPU(start);
CHECK_INPUT(rowptr.dim() == 1);
CHECK_INPUT(col.dim() == 1);
CHECK_INPUT(start.dim() == 1);
auto n_out = torch::empty({start.size(0), walk_length + 1}, start.options());
auto e_out = torch::empty({start.size(0), walk_length}, start.options());
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto start_data = start.data_ptr<int64_t>();
auto n_out_data = n_out.data_ptr<int64_t>();
auto e_out_data = e_out.data_ptr<int64_t>();
if (p == 1. && q == 1.) {
uniform_sampling(rowptr_data, col_data, start_data, n_out_data, e_out_data,
start.numel(), walk_length);
} else {
rejection_sampling(rowptr_data, col_data, start_data, n_out_data,
e_out_data, start.numel(), walk_length, p, q);
}
return std::make_tuple(n_out, e_out);
}
#pragma once
#include <torch/extension.h>
std::tuple<torch::Tensor, torch::Tensor>
random_walk_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
int64_t walk_length, double p, double q);
#include "sampler_cpu.h"
#include "utils.h"
torch::Tensor neighbor_sampler_cpu(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor) {
auto start_data = start.data_ptr<int64_t>();
auto rowptr_data = rowptr.data_ptr<int64_t>();
std::vector<int64_t> e_ids;
for (auto i = 0; i < start.size(0); i++) {
auto row_start = rowptr_data[start_data[i]];
auto row_end = rowptr_data[start_data[i] + 1];
auto num_neighbors = row_end - row_start;
int64_t size = count;
if (count < 1)
size = int64_t(ceil(factor * float(num_neighbors)));
if (size > num_neighbors)
size = num_neighbors;
// If the number of requested samples is small compared to the number of
// neighbors, we draw random indices into a set until enough unique samples
// are collected; otherwise we use `randperm` to sample without replacement.
std::unordered_set<int64_t> set;
if (size < 0.7 * float(num_neighbors)) {
while (int64_t(set.size()) < size) {
int64_t sample = rand() % num_neighbors;
set.insert(sample + row_start);
}
std::vector<int64_t> v(set.begin(), set.end());
e_ids.insert(e_ids.end(), v.begin(), v.end());
} else {
auto sample = torch::randperm(num_neighbors, start.options());
auto sample_data = sample.data_ptr<int64_t>();
for (auto j = 0; j < size; j++) {
e_ids.push_back(sample_data[j] + row_start);
}
}
}
int64_t length = e_ids.size();
return torch::from_blob(e_ids.data(), {length}, start.options()).clone();
}
#pragma once
#include <torch/extension.h>
torch::Tensor neighbor_sampler_cpu(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor);
#pragma once
#include <torch/extension.h>
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")