Commit 97f2f4e9 authored by quyuanhao123

Initial commit

Copyright (c) 2020 Matthias Fey <matthias.fey@tu-dortmund.de>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
include README.md
include LICENSE
recursive-exclude test *
recursive-include csrc *
Metadata-Version: 2.1
Name: torch_sparse
Version: 0.6.13
Summary: PyTorch Extension Library of Optimized Autograd Sparse Matrix Operations
Home-page: https://github.com/rusty1s/pytorch_sparse
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Download-URL: https://github.com/rusty1s/pytorch_sparse/archive/0.6.13.tar.gz
Keywords: pytorch,sparse,sparse-matrices,autograd
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: test
[pypi-image]: https://badge.fury.io/py/torch-sparse.svg
[pypi-url]: https://pypi.python.org/pypi/torch-sparse
[testing-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_sparse/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_sparse?branch=master
# PyTorch Sparse
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of optimized sparse matrix operations with autograd support.
This package currently consists of the following methods:
* **[Coalesce](#coalesce)**
* **[Transpose](#transpose)**
* **[Sparse Dense Matrix Multiplication](#sparse-dense-matrix-multiplication)**
* **[Sparse Sparse Matrix Multiplication](#sparse-sparse-matrix-multiplication)**
All included operations work on varying data types and are implemented both for CPU and GPU.
To avoid the hassle of creating [`torch.sparse_coo_tensor`](https://pytorch.org/docs/stable/torch.html?highlight=sparse_coo_tensor#torch.sparse_coo_tensor), this package defines operations on sparse tensors by simply passing `index` and `value` tensors as arguments ([with the same shapes as defined in PyTorch](https://pytorch.org/docs/stable/sparse.html)).
Note that only `value` comes with autograd support, as `index` is discrete and therefore not differentiable.
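For illustration, a minimal sketch of this representation (an added example with arbitrary values): a sparse 3×2 matrix is given by a `2 × nnz` `index` tensor holding row and column indices and an `nnz`-dimensional (or higher-dimensional) `value` tensor holding the entries, of which only `value` may require gradients.
```python
import torch

# Hypothetical 3x2 sparse matrix with three non-zero entries.
index = torch.tensor([[0, 1, 2],    # row indices
                      [1, 0, 1]])   # column indices
value = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)  # only `value` is differentiable
```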
## Installation
### Anaconda
**Update:** You can now install `pytorch-sparse` via [Anaconda](https://anaconda.org/pyg/pytorch-sparse) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-sparse -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.11
To install the binaries for PyTorch 1.11.0, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu113`, or `cu115` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` | `cu115` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | | ✅ | ✅ |
| **macOS** | ✅ | | | |
#### PyTorch 1.10
To install the binaries for PyTorch 1.10.0, PyTorch 1.10.1 and PyTorch 1.10.2, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu111`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` | `cu113` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries for older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1 and PyTorch 1.9.0 (following the same procedure).
For older versions, you might need to explicitly specify the latest supported version number to prevent pip from falling back to a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.7.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.7.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
If you want to additionally build `torch-sparse` with METIS support, *e.g.* for partitioning, please download and install the [METIS library](http://glaros.dtc.umn.edu/gkhome/metis/metis/download) by following the instructions in the `Install.txt` file.
Note that METIS needs to be installed with a 64-bit `IDXTYPEWIDTH` by changing `include/metis.h`.
Afterwards, set the environment variable `WITH_METIS=1`.
Then run:
```
pip install torch-scatter torch-sparse
```
When running in a Docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
### Coalesce
```
torch_sparse.coalesce(index, value, m, n, op="add") -> (torch.LongTensor, torch.Tensor)
```
Row-wise sorts `index` and removes duplicate entries.
Duplicate entries are removed by scattering them together.
For scattering, any operation of [`torch_scatter`](https://github.com/rusty1s/pytorch_scatter) can be used.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **op** *(string, optional)* - The scatter operation to use. (default: `"add"`)
#### Returns
* **index** *(LongTensor)* - The coalesced index tensor of sparse matrix.
* **value** *(Tensor)* - The coalesced value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import coalesce
index = torch.tensor([[1, 0, 1, 0, 2, 1],
                      [0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = coalesce(index, value, m=3, n=2)
```
```
print(index)
tensor([[0, 1, 1, 2],
        [1, 0, 1, 0]])
print(value)
tensor([[6.0, 8.0],
        [7.0, 9.0],
        [3.0, 4.0],
        [5.0, 6.0]])
```
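Since `op` accepts any `torch_scatter` reduction, duplicates can also be reduced element-wise by, *e.g.*, maximum instead of summation; a small added sketch (not part of the original example):
```python
index = torch.tensor([[1, 0, 1, 0, 2, 1],
                      [0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = coalesce(index, value, m=3, n=2, op="max")
# Expected: the two entries at (0, 1) reduce to [4, 5] and the two at (1, 0) to [6, 7].
```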
### Transpose
```
torch_sparse.transpose(index, value, m, n, coalesced=True) -> (torch.LongTensor, torch.Tensor)
```
Transposes dimensions 0 and 1 of a sparse matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **coalesced** *(bool, optional)* - If set to `False`, will not coalesce the output. (default: `True`)
#### Returns
* **index** *(LongTensor)* - The transposed index tensor of sparse matrix.
* **value** *(Tensor)* - The transposed value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import transpose
index = torch.tensor([[1, 0, 1, 0, 2, 1],
                      [0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = transpose(index, value, 3, 2)
```
```
print(index)
tensor([[0, 0, 1, 1],
        [1, 2, 0, 1]])
print(value)
tensor([[7.0, 9.0],
        [5.0, 6.0],
        [6.0, 8.0],
        [3.0, 4.0]])
```
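If the transposed output does not need to be coalesced (for example because it is coalesced later anyway), the documented `coalesced` flag can be disabled; a minimal sketch:
```python
# Skip the internal coalescing step; the returned entries may be unsorted.
index_t, value_t = transpose(index, value, 3, 2, coalesced=False)
```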
### Sparse Dense Matrix Multiplication
```
torch_sparse.spmm(index, value, m, n, matrix) -> torch.Tensor
```
Matrix product of a sparse matrix with a dense matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **matrix** *(Tensor)* - The dense matrix.
#### Returns
* **out** *(Tensor)* - The dense output matrix.
#### Example
```python
import torch
from torch_sparse import spmm
index = torch.tensor([[0, 0, 1, 2, 2],
                      [0, 2, 1, 0, 1]])
value = torch.Tensor([1, 2, 4, 1, 3])
matrix = torch.Tensor([[1, 4], [2, 5], [3, 6]])
out = spmm(index, value, 3, 3, matrix)
```
```
print(out)
tensor([[7.0, 16.0],
        [8.0, 20.0],
        [7.0, 19.0]])
```
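Since `value` carries autograd support, gradients can be propagated through `spmm`; a hedged sketch continuing the example above:
```python
value = torch.Tensor([1, 2, 4, 1, 3]).requires_grad_()
out = spmm(index, value, 3, 3, matrix)
out.sum().backward()  # populates value.grad
```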
### Sparse Sparse Matrix Multiplication
```
torch_sparse.spspmm(indexA, valueA, indexB, valueB, m, k, n, coalesced=False) -> (torch.LongTensor, torch.Tensor)
```
Matrix product of two sparse tensors.
Both input sparse matrices need to be **coalesced** (set the optional `coalesced` argument to `True` to force this).
#### Parameters
* **indexA** *(LongTensor)* - The index tensor of first sparse matrix.
* **valueA** *(Tensor)* - The value tensor of first sparse matrix.
* **indexB** *(LongTensor)* - The index tensor of second sparse matrix.
* **valueB** *(Tensor)* - The value tensor of second sparse matrix.
* **m** *(int)* - The first dimension of first sparse matrix.
* **k** *(int)* - The second dimension of first sparse matrix and first dimension of second sparse matrix.
* **n** *(int)* - The second dimension of second sparse matrix.
* **coalesced** *(bool, optional)* - If set to `True`, will coalesce both input sparse matrices. (default: `False`)
#### Returns
* **index** *(LongTensor)* - The output index tensor of sparse matrix.
* **value** *(Tensor)* - The output value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import spspmm
indexA = torch.tensor([[0, 0, 1, 2, 2], [1, 2, 0, 0, 1]])
valueA = torch.Tensor([1, 2, 3, 4, 5])
indexB = torch.tensor([[0, 2], [1, 0]])
valueB = torch.Tensor([2, 4])
indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2)
```
```
print(indexC)
tensor([[0, 1, 2],
        [0, 1, 1]])
print(valueC)
tensor([8.0, 6.0, 8.0])
```
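If the inputs might still contain duplicate entries, the documented `coalesced` argument lets `spspmm` coalesce them first; a minimal sketch:
```python
# Equivalent to coalescing (indexA, valueA) and (indexB, valueB) beforehand.
indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2, coalesced=True)
```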
## C++ API
`torch-sparse` also offers a C++ API that contains the C++ equivalents of the Python functions.
```
mkdir build
cd build
# Add -DWITH_CUDA=on to build with CUDA support if needed
cmake ..
make
make install
```
## Running tests
```
pytest
```
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/convert_cpu.h"
#ifdef WITH_HIP
#include "hip/convert_hip.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
PyMODINIT_FUNC PyInit__convert_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__convert_cpu(void) { return NULL; }
#endif
#endif
#endif
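// ind2ptr converts a sorted row (or column) index vector `ind` of length E with
// values in [0, M) into a CSR-style pointer vector of length M + 1, and ptr2ind
// is the inverse operation, expanding a pointer vector back into E per-element
// indices. Both dispatch to the CPU or CUDA/HIP kernel based on the input device.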
SPARSE_API torch::Tensor ind2ptr(torch::Tensor ind, int64_t M) {
if (ind.device().is_cuda()) {
#ifdef WITH_HIP
return ind2ptr_cuda(ind, M);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return ind2ptr_cpu(ind, M);
}
}
SPARSE_API torch::Tensor ptr2ind(torch::Tensor ptr, int64_t E) {
if (ptr.device().is_cuda()) {
#ifdef WITH_HIP
return ptr2ind_cuda(ptr, E);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return ptr2ind_cpu(ptr, E);
}
}
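// Register both conversion routines as custom TorchScript operators; once the
// extension library is loaded, they are callable from Python as
// torch.ops.torch_sparse.ind2ptr and torch.ops.torch_sparse.ptr2ind.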
static auto registry = torch::RegisterOperators()
.op("torch_sparse::ind2ptr", &ind2ptr)
.op("torch_sparse::ptr2ind", &ptr2ind);
#include "convert_cpu.h"
#include <ATen/Parallel.h>
#include "utils.h"
torch::Tensor ind2ptr_cpu(torch::Tensor ind, int64_t M) {
CHECK_CPU(ind);
auto out = torch::empty(M + 1, ind.options());
auto ind_data = ind.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
int64_t numel = ind.numel();
if (numel == 0)
return out.zero_();
for (int64_t i = 0; i <= ind_data[0]; i++)
out_data[i] = 0;
int64_t grain_size = at::internal::GRAIN_SIZE;
at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) {
int64_t idx = ind_data[begin], next_idx;
for (int64_t i = begin; i < std::min(end, numel - 1); i++) {
next_idx = ind_data[i + 1];
for (; idx < next_idx; idx++)
out_data[idx + 1] = i + 1;
}
});
for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
out_data[i] = numel;
return out;
}
torch::Tensor ptr2ind_cpu(torch::Tensor ptr, int64_t E) {
CHECK_CPU(ptr);
auto out = torch::empty(E, ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
int64_t numel = ptr.numel();
int64_t grain_size = at::internal::GRAIN_SIZE;
at::parallel_for(0, numel - 1, grain_size, [&](int64_t begin, int64_t end) {
int64_t idx = ptr_data[begin], next_idx;
for (int64_t i = begin; i < end; i++) {
next_idx = ptr_data[i + 1];
for (int64_t e = idx; e < next_idx; e++)
out_data[e] = i;
idx = next_idx;
}
});
return out;
}
#pragma once
#include "../extensions.h"
torch::Tensor ind2ptr_cpu(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cpu(torch::Tensor ptr, int64_t E);
#include "diag_cpu.h"
#include "utils.h"
torch::Tensor non_diag_mask_cpu(torch::Tensor row, torch::Tensor col, int64_t M,
int64_t N, int64_t k) {
CHECK_CPU(row);
CHECK_CPU(col);
auto E = row.size(0);
auto num_diag = k < 0 ? std::min(M + k, N) : std::min(M, N - k);
auto row_data = row.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto mask = torch::zeros(E + num_diag, row.options().dtype(torch::kBool));
auto mask_data = mask.data_ptr<bool>();
int64_t r, c;
if (k < 0) {
for (int64_t i = 0; i < E; i++) {
r = row_data[i], c = col_data[i];
if (r + k < 0) {
mask_data[i] = true;
} else if (r + k >= N) {
mask_data[i + num_diag] = true;
} else if (r + k > c) {
mask_data[i + r + k] = true;
} else if (r + k < c) {
mask_data[i + r + k + 1] = true;
}
}
} else {
for (int64_t i = 0; i < E; i++) {
r = row_data[i], c = col_data[i];
if (r + k >= N) {
mask_data[i + num_diag] = true;
} else if (r + k > c) {
mask_data[i + r] = true;
} else if (r + k < c) {
mask_data[i + r + 1] = true;
}
}
}
return mask;
}
#pragma once
#include "../extensions.h"
torch::Tensor non_diag_mask_cpu(torch::Tensor row, torch::Tensor col, int64_t M,
int64_t N, int64_t k);
#include "ego_sample_cpu.h"
#include <ATen/Parallel.h>
#include "utils.h"
#ifdef _WIN32
#include <process.h>
#endif
inline torch::Tensor vec2tensor(std::vector<int64_t> vec) {
return torch::from_blob(vec.data(), {(int64_t)vec.size()}, at::kLong).clone();
}
// Returns `rowptr`, `col`, `n_id`, `e_id`, `ptr`, `root_n_id`
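// For every seed node in `idx`, grows a `depth`-hop ego network by sampling at
// most `num_neighbors` neighbors per visited node (uniformly, with or without
// replacement), relabels the sampled nodes to consecutive local indices, and
// concatenates the per-seed CSR subgraphs; `ptr` holds the node offsets of each
// subgraph and `root_n_id` the position of each seed in the concatenated node list.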
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
torch::Tensor, torch::Tensor>
ego_k_hop_sample_adj_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor idx, int64_t depth,
int64_t num_neighbors, bool replace) {
srand(time(NULL) + 1000 * getpid()); // Initialize random seed.
std::vector<torch::Tensor> out_rowptrs(idx.numel() + 1);
std::vector<torch::Tensor> out_cols(idx.numel());
std::vector<torch::Tensor> out_n_ids(idx.numel());
std::vector<torch::Tensor> out_e_ids(idx.numel());
auto out_root_n_id = torch::empty({idx.numel()}, at::kLong);
out_rowptrs[0] = torch::zeros({1}, at::kLong);
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto idx_data = idx.data_ptr<int64_t>();
auto out_root_n_id_data = out_root_n_id.data_ptr<int64_t>();
at::parallel_for(0, idx.numel(), 1, [&](int64_t begin, int64_t end) {
int64_t row_start, row_end, row_count, vec_start, vec_end, v, w;
for (int64_t g = begin; g < end; g++) {
std::set<int64_t> n_id_set;
n_id_set.insert(idx_data[g]);
std::vector<int64_t> n_ids;
n_ids.push_back(idx_data[g]);
vec_start = 0, vec_end = n_ids.size();
for (int64_t d = 0; d < depth; d++) {
for (int64_t i = vec_start; i < vec_end; i++) {
v = n_ids[i];
row_start = rowptr_data[v], row_end = rowptr_data[v + 1];
row_count = row_end - row_start;
if (row_count <= num_neighbors) {
for (int64_t e = row_start; e < row_end; e++) {
w = col_data[e];
n_id_set.insert(w);
n_ids.push_back(w);
}
} else if (replace) {
for (int64_t j = 0; j < num_neighbors; j++) {
w = col_data[row_start + (rand() % row_count)];
n_id_set.insert(w);
n_ids.push_back(w);
}
} else {
std::unordered_set<int64_t> perm;
for (int64_t j = row_count - num_neighbors; j < row_count; j++) {
if (!perm.insert(rand() % j).second) {
perm.insert(j);
}
}
for (int64_t j : perm) {
w = col_data[row_start + j];
n_id_set.insert(w);
n_ids.push_back(w);
}
}
}
vec_start = vec_end;
vec_end = n_ids.size();
}
n_ids.clear();
std::map<int64_t, int64_t> n_id_map;
std::map<int64_t, int64_t>::iterator iter;
int64_t i = 0;
for (int64_t v : n_id_set) {
n_ids.push_back(v);
n_id_map[v] = i;
i++;
}
out_root_n_id_data[g] = n_id_map[idx_data[g]];
std::vector<int64_t> rowptrs, cols, e_ids;
for (int64_t v : n_ids) {
row_start = rowptr_data[v], row_end = rowptr_data[v + 1];
for (int64_t e = row_start; e < row_end; e++) {
w = col_data[e];
iter = n_id_map.find(w);
if (iter != n_id_map.end()) {
cols.push_back(iter->second);
e_ids.push_back(e);
}
}
rowptrs.push_back(cols.size());
}
out_rowptrs[g + 1] = vec2tensor(rowptrs);
out_cols[g] = vec2tensor(cols);
out_n_ids[g] = vec2tensor(n_ids);
out_e_ids[g] = vec2tensor(e_ids);
}
});
auto out_ptr = torch::empty({idx.numel() + 1}, at::kLong);
auto out_ptr_data = out_ptr.data_ptr<int64_t>();
out_ptr_data[0] = 0;
int64_t node_cumsum = 0, edge_cumsum = 0;
for (int64_t g = 1; g < idx.numel(); g++) {
node_cumsum += out_n_ids[g - 1].numel();
edge_cumsum += out_cols[g - 1].numel();
out_rowptrs[g + 1].add_(edge_cumsum);
out_cols[g].add_(node_cumsum);
out_ptr_data[g] = node_cumsum;
out_root_n_id_data[g] += node_cumsum;
}
node_cumsum += out_n_ids[idx.numel() - 1].numel();
out_ptr_data[idx.numel()] = node_cumsum;
return std::make_tuple(torch::cat(out_rowptrs, 0), torch::cat(out_cols, 0),
torch::cat(out_n_ids, 0), torch::cat(out_e_ids, 0),
out_ptr, out_root_n_id);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
torch::Tensor, torch::Tensor>
ego_k_hop_sample_adj_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor idx, int64_t depth,
int64_t num_neighbors, bool replace);
#include "hgt_sample_cpu.h"
#include "utils.h"
#ifdef _WIN32
#include <process.h>
#endif
#define MAX_NEIGHBORS 50
using namespace std;
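// Splits a relation string of the form "src__rel__dst" into its
// (source node type, relation type, destination node type) triplet.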
edge_t split(const rel_t &rel_type) {
vector<string> result(3);
int start = 0, end;
for (int i = 0; i < 3; i++) {
end = rel_type.find("__", start);
result[i] = rel_type.substr(start, end - start);
start = end + 2;
}
return make_tuple(result[0], result[1], result[2]);
}
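// Adds the neighbors of the freshly sampled nodes `samples` of type `node_type`
// to the budgets of their source node types, weighting each neighbor by the
// inverse (sampled) degree of the destination node and skipping neighbors that
// have already been sampled.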
void update_budget_(
unordered_map<node_t, unordered_map<int64_t, float>> *budget_dict,
const node_t &node_type, const vector<int64_t> &samples,
const unordered_map<node_t, unordered_map<int64_t, int64_t>>
&to_local_node_dict,
const unordered_map<rel_t, edge_t> &to_edge_type,
const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict) {
if (samples.empty())
return;
for (const auto &kv : colptr_dict) {
const auto &rel_type = kv.key();
const auto &edge_type = to_edge_type.at(rel_type);
const auto &src_node_type = get<0>(edge_type);
const auto &dst_node_type = get<2>(edge_type);
if (node_type != dst_node_type)
continue;
const auto &to_local_src_node = to_local_node_dict.at(src_node_type);
const auto *colptr_data = kv.value().data_ptr<int64_t>();
const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
auto &src_budget = budget_dict->at(src_node_type);
for (const auto &w : samples) {
const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
if (col_end - col_start > MAX_NEIGHBORS) {
// Destination nodes may have very large neighborhoods. To prevent the
// budget from being filled with many low-probability entries, we instead
// sample a fixed number of neighbors without replacement:
auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
auto *indices_data = indices.data_ptr<int64_t>();
for (int64_t i = 0; i < indices.numel(); i++) {
const auto &v = row_data[col_start + indices_data[i]];
// Only add the neighbor in case we have not yet seen it before:
if (to_local_src_node.find(v) == to_local_src_node.end())
src_budget[v] += 1.f / float(MAX_NEIGHBORS);
}
} else if (col_end != col_start) {
const auto inv_deg = 1.f / float(col_end - col_start);
for (int64_t i = col_start; i < col_end; i++) {
const auto &v = row_data[i];
// Only add the neighbor in case we have not yet seen it before:
if (to_local_src_node.find(v) == to_local_src_node.end())
src_budget[v] += inv_deg;
}
}
}
}
}
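// Draws `num_samples` distinct node ids from the budget without replacement,
// with probability proportional to the squared budget weights.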
vector<int64_t> sample_from(const unordered_map<int64_t, float> &budget,
const int64_t num_samples) {
vector<int64_t> indices;
vector<float> weights;
indices.reserve(budget.size());
weights.reserve(budget.size());
for (const auto &kv : budget) {
indices.push_back(kv.first);
weights.push_back(kv.second * kv.second);
}
const auto weight = from_vector(weights, true);
const auto sample = choice(budget.size(), num_samples, false, weight);
const auto *sample_data = sample.data_ptr<int64_t>();
vector<int64_t> out(sample.numel());
for (int64_t i = 0; i < sample.numel(); i++) {
out[i] = indices[sample_data[i]];
}
return out;
}
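// Budget-based heterogeneous graph sampling: starting from the typed input
// nodes, repeatedly samples `num_samples_dict[type][l]` nodes per type from the
// budget for `num_hops` iterations, and finally reconstructs the adjacency
// matrices among the sampled nodes.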
tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hgt_sample_cpu(const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict,
const c10::Dict<node_t, torch::Tensor> &input_node_dict,
const c10::Dict<node_t, vector<int64_t>> &num_samples_dict,
const int64_t num_hops) {
srand(time(NULL) + 1000 * getpid()); // Initialize random seed.
// Create a mapping to convert single string relations to edge type triplets:
unordered_map<rel_t, edge_t> to_edge_type;
for (const auto &kv : colptr_dict) {
const auto &rel_type = kv.key();
to_edge_type[rel_type] = split(rel_type);
}
// Initialize some necessary data structures for the sampling process:
unordered_map<node_t, vector<int64_t>> nodes_dict;
unordered_map<node_t, unordered_map<int64_t, int64_t>> to_local_node_dict;
unordered_map<node_t, unordered_map<int64_t, float>> budget_dict;
for (const auto &kv : num_samples_dict) {
const auto &node_type = kv.key();
nodes_dict[node_type];
to_local_node_dict[node_type];
budget_dict[node_type];
}
// Add the input nodes to the sampled output nodes (line 1):
for (const auto &kv : input_node_dict) {
const auto &node_type = kv.key();
const auto &input_node = kv.value();
const auto *input_node_data = input_node.data_ptr<int64_t>();
auto &nodes = nodes_dict.at(node_type);
auto &to_local_node = to_local_node_dict.at(node_type);
for (int64_t i = 0; i < input_node.numel(); i++) {
const auto &v = input_node_data[i];
nodes.push_back(v);
to_local_node[v] = i;
}
}
// Update the budget based on the initial input set (line 3-5):
for (const auto &kv : nodes_dict) {
const auto &node_type = kv.first;
const auto &last_samples = kv.second;
update_budget_(&budget_dict, node_type, last_samples, to_local_node_dict,
to_edge_type, colptr_dict, row_dict);
}
for (int64_t ell = 0; ell < num_hops; ell++) {
unordered_map<node_t, vector<int64_t>> samples_dict;
for (auto &kv : budget_dict) {
const auto &node_type = kv.first;
auto &budget = kv.second;
const auto num_samples = num_samples_dict.at(node_type)[ell];
// Sample `num_samples` nodes, according to the budget (line 9-11):
const auto samples = sample_from(budget, num_samples);
samples_dict[node_type] = samples;
// Add samples to the sampled output nodes, and erase them from the budget
// (line 13/15):
auto &nodes = nodes_dict.at(node_type);
auto &to_local_node = to_local_node_dict.at(node_type);
for (const auto &v : samples) {
to_local_node[v] = nodes.size();
nodes.push_back(v);
budget.erase(v);
}
}
if (ell < num_hops - 1) {
// Add neighbors of newly sampled nodes to the budget (line 14):
// Note that we do not need to update the budget in the last iteration.
for (const auto &kv : samples_dict) {
const auto &node_type = kv.first;
const auto &last_samples = kv.second;
update_budget_(&budget_dict, node_type, last_samples,
to_local_node_dict, to_edge_type, colptr_dict, row_dict);
}
}
}
c10::Dict<node_t, torch::Tensor> out_node_dict;
c10::Dict<rel_t, torch::Tensor> out_row_dict;
c10::Dict<rel_t, torch::Tensor> out_col_dict;
c10::Dict<rel_t, torch::Tensor> out_edge_dict;
// Reconstruct the sampled adjacency matrix among the sampled nodes (line 19):
for (const auto &kv : colptr_dict) {
const auto &rel_type = kv.key();
const auto &edge_type = to_edge_type.at(rel_type);
const auto &src_node_type = get<0>(edge_type);
const auto &dst_node_type = get<2>(edge_type);
const auto *colptr_data = kv.value().data_ptr<int64_t>();
const auto *row_data = row_dict.at(rel_type).data_ptr<int64_t>();
const auto &dst_nodes = nodes_dict.at(dst_node_type);
const auto &to_local_src_node = to_local_node_dict.at(src_node_type);
vector<int64_t> rows, cols, edges;
for (int64_t i = 0; i < (int64_t)dst_nodes.size(); i++) {
const auto &w = dst_nodes[i];
const auto &col_start = colptr_data[w], &col_end = colptr_data[w + 1];
if (col_end - col_start > MAX_NEIGHBORS) {
auto indices = choice(col_end - col_start, MAX_NEIGHBORS, false);
auto *indices_data = indices.data_ptr<int64_t>();
for (int64_t j = 0; j < indices.numel(); j++) {
const auto &v = row_data[col_start + indices_data[j]];
if (to_local_src_node.find(v) != to_local_src_node.end()) {
rows.push_back(to_local_src_node.at(v));
cols.push_back(i);
edges.push_back(col_start + j);
}
}
} else {
for (int64_t j = col_start; j < col_end; j++) {
const auto &v = row_data[j];
if (to_local_src_node.find(v) != to_local_src_node.end()) {
rows.push_back(to_local_src_node.at(v));
cols.push_back(i);
edges.push_back(j);
}
}
}
}
if (rows.size() > 0) {
out_row_dict.insert(rel_type, from_vector<int64_t>(rows));
out_col_dict.insert(rel_type, from_vector<int64_t>(cols));
out_edge_dict.insert(rel_type, from_vector<int64_t>(edges));
}
}
// Generate tensor-valued output node dictionary (line 20):
for (const auto &kv : nodes_dict) {
const auto &node_type = kv.first;
const auto &nodes = kv.second;
if (!nodes.empty())
out_node_dict.insert(node_type, from_vector<int64_t>(nodes));
}
return make_tuple(out_node_dict, out_row_dict, out_col_dict, out_edge_dict);
}
#pragma once
#include "../extensions.h"
typedef std::string node_t;
typedef std::string rel_t;
typedef std::tuple<std::string, std::string, std::string> edge_t;
std::tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hgt_sample_cpu(const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict,
const c10::Dict<node_t, torch::Tensor> &input_node_dict,
const c10::Dict<node_t, std::vector<int64_t>> &num_samples_dict,
const int64_t num_hops);
#include "metis_cpu.h"
#ifdef WITH_METIS
#include <metis.h>
#endif
#ifdef WITH_MTMETIS
#include <mtmetis.h>
#endif
#include "utils.h"
torch::Tensor partition_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive) {
#ifdef WITH_METIS
CHECK_CPU(rowptr);
CHECK_CPU(col);
if (optional_value.has_value()) {
CHECK_CPU(optional_value.value());
CHECK_INPUT(optional_value.value().dim() == 1);
CHECK_INPUT(optional_value.value().numel() == col.numel());
}
if (optional_node_weight.has_value()) {
CHECK_CPU(optional_node_weight.value());
CHECK_INPUT(optional_node_weight.value().dim() == 1);
CHECK_INPUT(optional_node_weight.value().numel() == rowptr.numel() - 1);
}
int64_t nvtxs = rowptr.numel() - 1;
int64_t ncon = 1;
auto *xadj = rowptr.data_ptr<int64_t>();
auto *adjncy = col.data_ptr<int64_t>();
int64_t *adjwgt = NULL;
if (optional_value.has_value())
adjwgt = optional_value.value().data_ptr<int64_t>();
int64_t *vwgt = NULL;
if (optional_node_weight.has_value())
vwgt = optional_node_weight.value().data_ptr<int64_t>();
int64_t objval = -1;
auto part = torch::empty(nvtxs, rowptr.options());
auto part_data = part.data_ptr<int64_t>();
if (recursive) {
METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt,
&num_parts, NULL, NULL, NULL, &objval, part_data);
} else {
METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt,
&num_parts, NULL, NULL, NULL, &objval, part_data);
}
return part;
#else
AT_ERROR("Not compiled with METIS support");
#endif
}
// needs mt-metis installed via:
// ./configure --shared --edges64bit --vertices64bit --weights64bit
// --partitions64bit
torch::Tensor
mt_partition_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive, int64_t num_workers) {
#ifdef WITH_MTMETIS
CHECK_CPU(rowptr);
CHECK_CPU(col);
if (optional_value.has_value()) {
CHECK_CPU(optional_value.value());
CHECK_INPUT(optional_value.value().dim() == 1);
CHECK_INPUT(optional_value.value().numel() == col.numel());
}
if (optional_node_weight.has_value()) {
CHECK_CPU(optional_node_weight.value());
CHECK_INPUT(optional_node_weight.value().dim() == 1);
CHECK_INPUT(optional_node_weight.value().numel() == rowptr.numel() - 1);
}
mtmetis_vtx_type nvtxs = rowptr.numel() - 1;
mtmetis_vtx_type ncon = 1;
mtmetis_adj_type *xadj = (mtmetis_adj_type *)rowptr.data_ptr<int64_t>();
mtmetis_vtx_type *adjncy = (mtmetis_vtx_type *)col.data_ptr<int64_t>();
mtmetis_wgt_type *adjwgt = NULL;
if (optional_value.has_value())
adjwgt = optional_value.value().data_ptr<int64_t>();
mtmetis_wgt_type *vwgt = NULL;
if (optional_node_weight.has_value())
vwgt = optional_node_weight.value().data_ptr<int64_t>();
mtmetis_pid_type nparts = num_parts;
mtmetis_wgt_type objval = -1;
auto part = torch::empty(nvtxs, rowptr.options());
mtmetis_pid_type *part_data = (mtmetis_pid_type *)part.data_ptr<int64_t>();
double *opts = mtmetis_init_options();
opts[MTMETIS_OPTION_NTHREADS] = num_workers;
if (recursive) {
MTMETIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt,
&nparts, NULL, NULL, opts, &objval, part_data);
} else {
MTMETIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt,
&nparts, NULL, NULL, opts, &objval, part_data);
}
return part;
#else
AT_ERROR("Not compiled with MTMETIS support");
#endif
}
#pragma once
#include "../extensions.h"
torch::Tensor partition_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive);
torch::Tensor
mt_partition_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive, int64_t num_workers);
#include "neighbor_sample_cpu.h"
#include "utils.h"
#ifdef _WIN32
#include <process.h>
#endif
using namespace std;
namespace {
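// Samples a multi-hop neighborhood around `input_node` on a graph stored in CSC
// format (colptr, row): in hop ell, up to num_neighbors[ell] neighbors are drawn
// per frontier node (a negative value keeps all neighbors). If `directed` is
// false, the full subgraph induced by all sampled nodes is returned instead of
// only the sampled edges.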
template <bool replace, bool directed>
tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
sample(const torch::Tensor &colptr, const torch::Tensor &row,
const torch::Tensor &input_node, const vector<int64_t> num_neighbors) {
srand(time(NULL) + 1000 * getpid()); // Initialize random seed.
// Initialize some data structures for the sampling process:
vector<int64_t> samples;
unordered_map<int64_t, int64_t> to_local_node;
auto *colptr_data = colptr.data_ptr<int64_t>();
auto *row_data = row.data_ptr<int64_t>();
auto *input_node_data = input_node.data_ptr<int64_t>();
for (int64_t i = 0; i < input_node.numel(); i++) {
const auto &v = input_node_data[i];
samples.push_back(v);
to_local_node.insert({v, i});
}
vector<int64_t> rows, cols, edges;
int64_t begin = 0, end = samples.size();
for (int64_t ell = 0; ell < (int64_t)num_neighbors.size(); ell++) {
const auto &num_samples = num_neighbors[ell];
for (int64_t i = begin; i < end; i++) {
const auto &w = samples[i];
const auto &col_start = colptr_data[w];
const auto &col_end = colptr_data[w + 1];
const auto col_count = col_end - col_start;
if (col_count == 0)
continue;
if ((num_samples < 0) || (!replace && (num_samples >= col_count))) {
for (int64_t offset = col_start; offset < col_end; offset++) {
const int64_t &v = row_data[offset];
const auto res = to_local_node.insert({v, samples.size()});
if (res.second)
samples.push_back(v);
if (directed) {
cols.push_back(i);
rows.push_back(res.first->second);
edges.push_back(offset);
}
}
} else if (replace) {
for (int64_t j = 0; j < num_samples; j++) {
const int64_t offset = col_start + rand() % col_count;
const int64_t &v = row_data[offset];
const auto res = to_local_node.insert({v, samples.size()});
if (res.second)
samples.push_back(v);
if (directed) {
cols.push_back(i);
rows.push_back(res.first->second);
edges.push_back(offset);
}
}
} else {
unordered_set<int64_t> rnd_indices;
for (int64_t j = col_count - num_samples; j < col_count; j++) {
int64_t rnd = rand() % j;
if (!rnd_indices.insert(rnd).second) {
rnd = j;
rnd_indices.insert(j);
}
const int64_t offset = col_start + rnd;
const int64_t &v = row_data[offset];
const auto res = to_local_node.insert({v, samples.size()});
if (res.second)
samples.push_back(v);
if (directed) {
cols.push_back(i);
rows.push_back(res.first->second);
edges.push_back(offset);
}
}
}
}
begin = end, end = samples.size();
}
if (!directed) {
unordered_map<int64_t, int64_t>::iterator iter;
for (int64_t i = 0; i < (int64_t)samples.size(); i++) {
const auto &w = samples[i];
const auto &col_start = colptr_data[w];
const auto &col_end = colptr_data[w + 1];
for (int64_t offset = col_start; offset < col_end; offset++) {
const auto &v = row_data[offset];
iter = to_local_node.find(v);
if (iter != to_local_node.end()) {
rows.push_back(iter->second);
cols.push_back(i);
edges.push_back(offset);
}
}
}
}
return make_tuple(from_vector<int64_t>(samples), from_vector<int64_t>(rows),
from_vector<int64_t>(cols), from_vector<int64_t>(edges));
}
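// Heterogeneous variant of the sampler above: keeps one sample list, local-id
// map and (row, col, edge) buffer per node/edge type and applies the same
// per-hop sampling logic relation by relation.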
template <bool replace, bool directed>
tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hetero_sample(const vector<node_t> &node_types,
const vector<edge_t> &edge_types,
const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict,
const c10::Dict<node_t, torch::Tensor> &input_node_dict,
const c10::Dict<rel_t, vector<int64_t>> &num_neighbors_dict,
const int64_t num_hops) {
srand(time(NULL) + 1000 * getpid()); // Initialize random seed.
// Create a mapping to convert single string relations to edge type triplets:
unordered_map<rel_t, edge_t> to_edge_type;
for (const auto &k : edge_types)
to_edge_type[get<0>(k) + "__" + get<1>(k) + "__" + get<2>(k)] = k;
// Initialize some data structures for the sampling process:
unordered_map<node_t, vector<int64_t>> samples_dict;
unordered_map<node_t, unordered_map<int64_t, int64_t>> to_local_node_dict;
for (const auto &node_type : node_types) {
samples_dict[node_type];
to_local_node_dict[node_type];
}
unordered_map<rel_t, vector<int64_t>> rows_dict, cols_dict, edges_dict;
for (const auto &kv : colptr_dict) {
const auto &rel_type = kv.key();
rows_dict[rel_type];
cols_dict[rel_type];
edges_dict[rel_type];
}
// Add the input nodes to the output nodes:
for (const auto &kv : input_node_dict) {
const auto &node_type = kv.key();
const torch::Tensor &input_node = kv.value();
const auto *input_node_data = input_node.data_ptr<int64_t>();
auto &samples = samples_dict.at(node_type);
auto &to_local_node = to_local_node_dict.at(node_type);
for (int64_t i = 0; i < input_node.numel(); i++) {
const auto &v = input_node_data[i];
samples.push_back(v);
to_local_node.insert({v, i});
}
}
unordered_map<node_t, pair<int64_t, int64_t>> slice_dict;
for (const auto &kv : samples_dict)
slice_dict[kv.first] = {0, kv.second.size()};
for (int64_t ell = 0; ell < num_hops; ell++) {
for (const auto &kv : num_neighbors_dict) {
const auto &rel_type = kv.key();
const auto &edge_type = to_edge_type[rel_type];
const auto &src_node_type = get<0>(edge_type);
const auto &dst_node_type = get<2>(edge_type);
const auto num_samples = kv.value()[ell];
const auto &dst_samples = samples_dict.at(dst_node_type);
auto &src_samples = samples_dict.at(src_node_type);
auto &to_local_src_node = to_local_node_dict.at(src_node_type);
const auto *colptr_data = ((torch::Tensor)colptr_dict.at(rel_type)).data_ptr<int64_t>();
const auto *row_data = ((torch::Tensor)row_dict.at(rel_type)).data_ptr<int64_t>();
auto &rows = rows_dict.at(rel_type);
auto &cols = cols_dict.at(rel_type);
auto &edges = edges_dict.at(rel_type);
const auto &begin = slice_dict.at(dst_node_type).first;
const auto &end = slice_dict.at(dst_node_type).second;
for (int64_t i = begin; i < end; i++) {
const auto &w = dst_samples[i];
const auto &col_start = colptr_data[w];
const auto &col_end = colptr_data[w + 1];
const auto col_count = col_end - col_start;
if (col_count == 0)
continue;
if ((num_samples < 0) || (!replace && (num_samples >= col_count))) {
for (int64_t offset = col_start; offset < col_end; offset++) {
const int64_t &v = row_data[offset];
const auto res = to_local_src_node.insert({v, src_samples.size()});
if (res.second)
src_samples.push_back(v);
if (directed) {
cols.push_back(i);
rows.push_back(res.first->second);
edges.push_back(offset);
}
}
} else if (replace) {
for (int64_t j = 0; j < num_samples; j++) {
const int64_t offset = col_start + rand() % col_count;
const int64_t &v = row_data[offset];
const auto res = to_local_src_node.insert({v, src_samples.size()});
if (res.second)
src_samples.push_back(v);
if (directed) {
cols.push_back(i);
rows.push_back(res.first->second);
edges.push_back(offset);
}
}
} else {
unordered_set<int64_t> rnd_indices;
for (int64_t j = col_count - num_samples; j < col_count; j++) {
int64_t rnd = rand() % j;
if (!rnd_indices.insert(rnd).second) {
rnd = j;
rnd_indices.insert(j);
}
const int64_t offset = col_start + rnd;
const int64_t &v = row_data[offset];
const auto res = to_local_src_node.insert({v, src_samples.size()});
if (res.second)
src_samples.push_back(v);
if (directed) {
cols.push_back(i);
rows.push_back(res.first->second);
edges.push_back(offset);
}
}
}
}
}
for (const auto &kv : samples_dict) {
slice_dict[kv.first] = {slice_dict.at(kv.first).second, kv.second.size()};
}
}
if (!directed) { // Construct the subgraph among the sampled nodes:
unordered_map<int64_t, int64_t>::iterator iter;
for (const auto &kv : colptr_dict) {
const auto &rel_type = kv.key();
const auto &edge_type = to_edge_type[rel_type];
const auto &src_node_type = get<0>(edge_type);
const auto &dst_node_type = get<2>(edge_type);
const auto &dst_samples = samples_dict.at(dst_node_type);
auto &to_local_src_node = to_local_node_dict.at(src_node_type);
const auto *colptr_data = ((torch::Tensor)kv.value()).data_ptr<int64_t>();
const auto *row_data = ((torch::Tensor)row_dict.at(rel_type)).data_ptr<int64_t>();
auto &rows = rows_dict.at(rel_type);
auto &cols = cols_dict.at(rel_type);
auto &edges = edges_dict.at(rel_type);
for (int64_t i = 0; i < (int64_t)dst_samples.size(); i++) {
const auto &w = dst_samples[i];
const auto &col_start = colptr_data[w];
const auto &col_end = colptr_data[w + 1];
for (int64_t offset = col_start; offset < col_end; offset++) {
const auto &v = row_data[offset];
iter = to_local_src_node.find(v);
if (iter != to_local_src_node.end()) {
rows.push_back(iter->second);
cols.push_back(i);
edges.push_back(offset);
}
}
}
}
}
return make_tuple(from_vector<node_t, int64_t>(samples_dict),
from_vector<rel_t, int64_t>(rows_dict),
from_vector<rel_t, int64_t>(cols_dict),
from_vector<rel_t, int64_t>(edges_dict));
}
} // namespace
tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
neighbor_sample_cpu(const torch::Tensor &colptr, const torch::Tensor &row,
const torch::Tensor &input_node,
const vector<int64_t> num_neighbors, const bool replace,
const bool directed) {
if (replace && directed) {
return sample<true, true>(colptr, row, input_node, num_neighbors);
} else if (replace && !directed) {
return sample<true, false>(colptr, row, input_node, num_neighbors);
} else if (!replace && directed) {
return sample<false, true>(colptr, row, input_node, num_neighbors);
} else {
return sample<false, false>(colptr, row, input_node, num_neighbors);
}
}
tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hetero_neighbor_sample_cpu(
const vector<node_t> &node_types, const vector<edge_t> &edge_types,
const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict,
const c10::Dict<node_t, torch::Tensor> &input_node_dict,
const c10::Dict<rel_t, vector<int64_t>> &num_neighbors_dict,
const int64_t num_hops, const bool replace, const bool directed) {
if (replace && directed) {
return hetero_sample<true, true>(node_types, edge_types, colptr_dict,
row_dict, input_node_dict,
num_neighbors_dict, num_hops);
} else if (replace && !directed) {
return hetero_sample<true, false>(node_types, edge_types, colptr_dict,
row_dict, input_node_dict,
num_neighbors_dict, num_hops);
} else if (!replace && directed) {
return hetero_sample<false, true>(node_types, edge_types, colptr_dict,
row_dict, input_node_dict,
num_neighbors_dict, num_hops);
} else {
return hetero_sample<false, false>(node_types, edge_types, colptr_dict,
row_dict, input_node_dict,
num_neighbors_dict, num_hops);
}
}
#pragma once
#include "../extensions.h"
typedef std::string node_t;
typedef std::tuple<std::string, std::string, std::string> edge_t;
typedef std::string rel_t;
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
neighbor_sample_cpu(const torch::Tensor &colptr, const torch::Tensor &row,
const torch::Tensor &input_node,
const std::vector<int64_t> num_neighbors,
const bool replace, const bool directed);
std::tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hetero_neighbor_sample_cpu(
const std::vector<node_t> &node_types,
const std::vector<edge_t> &edge_types,
const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict,
const c10::Dict<node_t, torch::Tensor> &input_node_dict,
const c10::Dict<rel_t, std::vector<int64_t>> &num_neighbors_dict,
const int64_t num_hops, const bool replace, const bool directed);
#pragma once
#include <limits>
#include <map>
enum ReductionType { SUM, MEAN, MUL, DIV, MIN, MAX };
const std::map<std::string, ReductionType> reduce2REDUCE = {
{"sum", SUM}, {"mean", MEAN}, {"mul", MUL},
{"div", DIV}, {"min", MIN}, {"max", MAX},
};
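// Maps a runtime reduction name ("sum", "mean", ...) onto a compile-time
// ReductionType constant REDUCE and invokes the given lambda, so kernels can be
// specialized per reduction without runtime branching in their inner loops.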
#define AT_DISPATCH_REDUCTION_TYPES(reduce, ...) \
[&] { \
switch (reduce2REDUCE.at(reduce)) { \
case SUM: { \
static constexpr ReductionType REDUCE = SUM; \
return __VA_ARGS__(); \
} \
case MEAN: { \
static constexpr ReductionType REDUCE = MEAN; \
return __VA_ARGS__(); \
} \
case MUL: { \
static constexpr ReductionType REDUCE = MUL; \
return __VA_ARGS__(); \
} \
case DIV: { \
static constexpr ReductionType REDUCE = DIV; \
return __VA_ARGS__(); \
} \
case MIN: { \
static constexpr ReductionType REDUCE = MIN; \
return __VA_ARGS__(); \
} \
case MAX: { \
static constexpr ReductionType REDUCE = MAX; \
return __VA_ARGS__(); \
} \
} \
}()
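// Reducer bundles, for each reduction type, the identity element (init), the
// accumulation step (update, which also tracks an arg index for MIN/MAX) and
// the final write-back (write, which divides by the count for MEAN and writes
// zero when nothing was reduced for MIN/MAX).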
template <typename scalar_t, ReductionType REDUCE> struct Reducer {
static inline scalar_t init() {
if (REDUCE == MUL || REDUCE == DIV)
return (scalar_t)1;
else if (REDUCE == MIN)
return std::numeric_limits<scalar_t>::max();
else if (REDUCE == MAX)
return std::numeric_limits<scalar_t>::lowest();
else
return (scalar_t)0;
}
static inline void update(scalar_t *val, scalar_t new_val, int64_t *arg,
int64_t new_arg) {
if (REDUCE == SUM || REDUCE == MEAN)
*val = *val + new_val;
else if (REDUCE == MUL)
*val = *val * new_val;
else if (REDUCE == DIV)
*val = *val / new_val;
else if ((REDUCE == MIN && new_val < *val) ||
(REDUCE == MAX && new_val > *val)) {
*val = new_val;
*arg = new_arg;
}
}
static inline void write(scalar_t *address, scalar_t val,
int64_t *arg_address, int64_t arg, int count) {
if (REDUCE == SUM || REDUCE == MUL || REDUCE == DIV)
*address = val;
else if (REDUCE == MEAN)
*address = val / (scalar_t)(count > 0 ? count : 1);
else if (REDUCE == MIN || REDUCE == MAX) {
if (count > 0) {
*address = val;
*arg_address = arg;
} else
*address = (scalar_t)0;
}
}
};
#include "relabel_cpu.h"
#include "utils.h"
std::tuple<torch::Tensor, torch::Tensor> relabel_cpu(torch::Tensor col,
torch::Tensor idx) {
CHECK_CPU(col);
CHECK_CPU(idx);
CHECK_INPUT(idx.dim() == 1);
auto col_data = col.data_ptr<int64_t>();
auto idx_data = idx.data_ptr<int64_t>();
std::vector<int64_t> cols;
std::vector<int64_t> n_ids;
std::unordered_map<int64_t, int64_t> n_id_map;
int64_t i;
for (int64_t n = 0; n < idx.size(0); n++) {
i = idx_data[n];
n_id_map[i] = n;
n_ids.push_back(i);
}
int64_t c;
for (int64_t e = 0; e < col.size(0); e++) {
c = col_data[e];
if (n_id_map.count(c) == 0) {
n_id_map[c] = n_ids.size();
n_ids.push_back(c);
}
cols.push_back(n_id_map[c]);
}
int64_t n_len = n_ids.size(), e_len = cols.size();
auto out_col = torch::from_blob(cols.data(), {e_len}, col.options()).clone();
auto out_idx = torch::from_blob(n_ids.data(), {n_len}, col.options()).clone();
return std::make_tuple(out_col, out_idx);
}
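// One-hop variant operating on a CSR graph: extracts the rows given by `idx`,
// relabels their column indices (optionally carrying edge values along) and,
// unless `bipartite` is set, extends the row pointer so that newly added
// neighbor nodes appear as additional rows without outgoing edges.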
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>,
torch::Tensor>
relabel_one_hop_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::Tensor idx, bool bipartite) {
CHECK_CPU(rowptr);
CHECK_CPU(col);
if (optional_value.has_value()) {
CHECK_CPU(optional_value.value());
CHECK_INPUT(optional_value.value().dim() == 1);
}
CHECK_CPU(idx);
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
auto idx_data = idx.data_ptr<int64_t>();
std::vector<int64_t> n_ids;
std::unordered_map<int64_t, int64_t> n_id_map;
std::unordered_map<int64_t, int64_t>::iterator it;
auto out_rowptr = torch::empty(idx.numel() + 1, rowptr.options());
auto out_rowptr_data = out_rowptr.data_ptr<int64_t>();
out_rowptr_data[0] = 0;
int64_t v, w, c, row_start, row_end, offset = 0;
for (int64_t i = 0; i < idx.numel(); i++) {
v = idx_data[i];
n_id_map[v] = i;
offset += rowptr_data[v + 1] - rowptr_data[v];
out_rowptr_data[i + 1] = offset;
}
auto out_col = torch::empty(offset, col.options());
auto out_col_data = out_col.data_ptr<int64_t>();
torch::optional<torch::Tensor> out_value = torch::nullopt;
if (optional_value.has_value()) {
out_value = torch::empty(offset, optional_value.value().options());
AT_DISPATCH_ALL_TYPES(optional_value.value().scalar_type(), "relabel", [&] {
auto value_data = optional_value.value().data_ptr<scalar_t>();
auto out_value_data = out_value.value().data_ptr<scalar_t>();
offset = 0;
for (int64_t i = 0; i < idx.numel(); i++) {
v = idx_data[i];
row_start = rowptr_data[v], row_end = rowptr_data[v + 1];
for (int64_t j = row_start; j < row_end; j++) {
w = col_data[j];
it = n_id_map.find(w);
if (it == n_id_map.end()) {
c = idx.numel() + n_ids.size();
n_id_map[w] = c;
n_ids.push_back(w);
out_col_data[offset] = c;
} else {
out_col_data[offset] = it->second;
}
out_value_data[offset] = value_data[j];
offset++;
}
}
});
} else {
offset = 0;
for (int64_t i = 0; i < idx.numel(); i++) {
v = idx_data[i];
row_start = rowptr_data[v], row_end = rowptr_data[v + 1];
for (int64_t j = row_start; j < row_end; j++) {
w = col_data[j];
it = n_id_map.find(w);
if (it == n_id_map.end()) {
c = idx.numel() + n_ids.size();
n_id_map[w] = c;
n_ids.push_back(w);
out_col_data[offset] = c;
} else {
out_col_data[offset] = it->second;
}
offset++;
}
}
}
if (!bipartite)
out_rowptr = torch::cat(
{out_rowptr, torch::full({(int64_t)n_ids.size()}, out_col.numel(),
rowptr.options())});
idx = torch::cat({idx, torch::from_blob(n_ids.data(), {(int64_t)n_ids.size()},
idx.options())});
return std::make_tuple(out_rowptr, out_col, out_value, idx);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor> relabel_cpu(torch::Tensor col,
torch::Tensor idx);
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>,
torch::Tensor>
relabel_one_hop_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::Tensor idx, bool bipartite);