Commit 19fd8251 authored by limm

support v0.6.16

parent 9ccee9c0
cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.10)
project(torchsparse)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSPARSE_VERSION 0.6.15)
set(TORCHSPARSE_VERSION 0.6.16)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
option(WITH_CUDA "Enable CUDA support" OFF)
option(WITH_PYTHON "Link to Python when building" ON)
option(WITH_METIS "Enable METIS support" OFF)
if(WITH_CUDA)
enable_language(CUDA)
......@@ -19,6 +21,11 @@ if (WITH_PYTHON)
endif()
find_package(Torch REQUIRED)
if (WITH_METIS)
add_definitions(-DWITH_METIS)
find_package(METIS)
endif()
file(GLOB HEADERS csrc/*.h)
file(GLOB OPERATOR_SOURCES csrc/*.* csrc/cpu/*.*)
if(WITH_CUDA)
......@@ -30,6 +37,17 @@ target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
if (WITH_PYTHON)
target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)
endif()
if (WITH_METIS)
target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE ${METIS_LIBRARIES})
endif()
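# Optionally compile and link with OpenMP when it is available.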
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
# set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchSparse)
target_include_directories(${PROJECT_NAME} INTERFACE
......@@ -74,7 +92,6 @@ install(FILES
csrc/cpu/saint_cpu.h
csrc/cpu/sample_cpu.h
csrc/cpu/spmm_cpu.h
csrc/cpu/spspmm_cpu.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu)
if(WITH_CUDA)
install(FILES
......@@ -82,7 +99,6 @@ if(WITH_CUDA)
csrc/cuda/diag_cuda.h
csrc/cuda/rw_cuda.h
csrc/cuda/spmm_cuda.h
csrc/cuda/spspmm_cuda.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
endif()
......
include README.md
include LICENSE
recursive-exclude test *
recursive-include csrc *
recursive-include third_party *
recursive-exclude third_party/parallel-hashmap/css *
recursive-exclude third_party/parallel-hashmap/html *
recursive-exclude third_party/parallel-hashmap/tests *
recursive-exclude third_party/parallel-hashmap/examples *
recursive-exclude third_party/parallel-hashmap/benchmark *
recursive-exclude test *
recursive-exclude benchmark *
Metadata-Version: 2.1
Name: torch_sparse
Version: 0.6.13
Summary: PyTorch Extension Library of Optimized Autograd Sparse Matrix Operations
Home-page: https://github.com/rusty1s/pytorch_sparse
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Download-URL: https://github.com/rusty1s/pytorch_sparse/archive/0.6.13.tar.gz
Description: [pypi-image]: https://badge.fury.io/py/torch-sparse.svg
[pypi-url]: https://pypi.python.org/pypi/torch-sparse
[testing-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_sparse/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_sparse?branch=master
# PyTorch Sparse
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of optimized sparse matrix operations with autograd support.
It currently provides the following methods:
* **[Coalesce](#coalesce)**
* **[Transpose](#transpose)**
* **[Sparse Dense Matrix Multiplication](#sparse-dense-matrix-multiplication)**
* **[Sparse Sparse Matrix Multiplication](#sparse-sparse-matrix-multiplication)**
All included operations work on varying data types and are implemented both for CPU and GPU.
To avoid the hassle of creating [`torch.sparse_coo_tensor`](https://pytorch.org/docs/stable/torch.html?highlight=sparse_coo_tensor#torch.sparse_coo_tensor), this package defines operations on sparse tensors by simply passing `index` and `value` tensors as arguments ([with the same shapes as defined in PyTorch](https://pytorch.org/docs/stable/sparse.html)).
Note that only `value` comes with autograd support, as `index` is discrete and therefore not differentiable.
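For illustration, a 3×2 sparse matrix in this `index`/`value` representation looks as follows (a minimal sketch with arbitrary entries):
```python
import torch

# COO-style representation used throughout this package:
# `index` holds the [row; col] coordinates, `value` the matching entries.
index = torch.tensor([[0, 1, 2],   # row indices
                      [1, 0, 1]])  # column indices
value = torch.tensor([1.0, 2.0, 3.0])
# Equivalent dense matrix (m=3, n=2):
# [[0., 1.],
#  [2., 0.],
#  [0., 3.]]
```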
## Installation
### Anaconda
**Update:** You can now install `pytorch-sparse` via [Anaconda](https://anaconda.org/pyg/pytorch-sparse) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-sparse -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.11
To install the binaries for PyTorch 1.11.0, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu113`, or `cu115` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` | `cu115` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | | ✅ | ✅ |
| **macOS** | ✅ | | | |
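For example, for a PyTorch 1.11.0 installation built against CUDA 11.3 the command becomes:
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+cu113.html
```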
#### PyTorch 1.10
To install the binaries for PyTorch 1.10.0, PyTorch 1.10.1 and PyTorch 1.10.2, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu111`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` | `cu113` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1 and PyTorch 1.9.0 (following the same procedure).
For older versions, you might need to explicitly specify the latest supported version number in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.7.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.7.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
If you want to additionally build `torch-sparse` with METIS support, *e.g.* for partitioning, please download and install the [METIS library](http://glaros.dtc.umn.edu/gkhome/metis/metis/download) by following the instructions in the `Install.txt` file.
Note that METIS needs to be installed with 64 bit `IDXTYPEWIDTH` by changing `include/metis.h`.
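Concretely, the width macro in `include/metis.h` should read as follows before compiling METIS (a sketch; the header ships with a 32-bit default):
```
#define IDXTYPEWIDTH 64
```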
Afterwards, set the environment variable `WITH_METIS=1`.
Then run:
```
pip install torch-scatter torch-sparse
```
When running in a Docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
### Coalesce
```
torch_sparse.coalesce(index, value, m, n, op="add") -> (torch.LongTensor, torch.Tensor)
```
Row-wise sorts `index` and removes duplicate entries.
Duplicate entries are removed by scattering them together.
For scattering, any operation of [`torch_scatter`](https://github.com/rusty1s/pytorch_scatter) can be used.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **op** *(string, optional)* - The scatter operation to use. (default: `"add"`)
#### Returns
* **index** *(LongTensor)* - The coalesced index tensor of sparse matrix.
* **value** *(Tensor)* - The coalesced value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import coalesce
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = coalesce(index, value, m=3, n=2)
```
```
print(index)
tensor([[0, 1, 1, 2],
[1, 0, 1, 0]])
print(value)
tensor([[6.0, 8.0],
[7.0, 9.0],
[3.0, 4.0],
[5.0, 6.0]])
```
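Since any `torch_scatter` reduction can be used, duplicates can also be combined differently, *e.g.* by taking the element-wise maximum. A sketch, starting again from the original `index` and `value` tensors defined above (the commented result is what scatter-max should produce):
```python
index, value = coalesce(index, value, m=3, n=2, op="max")
# Expected value tensor (element-wise maximum over duplicates):
# tensor([[4., 5.],
#         [6., 7.],
#         [3., 4.],
#         [5., 6.]])
```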
### Transpose
```
torch_sparse.transpose(index, value, m, n) -> (torch.LongTensor, torch.Tensor)
```
Transposes dimensions 0 and 1 of a sparse matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **coalesced** *(bool, optional)* - If set to `False`, will not coalesce the output. (default: `True`)
#### Returns
* **index** *(LongTensor)* - The transposed index tensor of sparse matrix.
* **value** *(Tensor)* - The transposed value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import transpose
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = transpose(index, value, 3, 2)
```
```
print(index)
tensor([[0, 0, 1, 1],
[1, 2, 0, 1]])
print(value)
tensor([[7.0, 9.0],
[5.0, 6.0],
[6.0, 8.0],
[3.0, 4.0]])
```
### Sparse Dense Matrix Multiplication
```
torch_sparse.spmm(index, value, m, n, matrix) -> torch.Tensor
```
Matrix product of a sparse matrix with a dense matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **matrix** *(Tensor)* - The dense matrix.
#### Returns
* **out** *(Tensor)* - The dense output matrix.
#### Example
```python
import torch
from torch_sparse import spmm
index = torch.tensor([[0, 0, 1, 2, 2],
[0, 2, 1, 0, 1]])
value = torch.Tensor([1, 2, 4, 1, 3])
matrix = torch.Tensor([[1, 4], [2, 5], [3, 6]])
out = spmm(index, value, 3, 3, matrix)
```
```
print(out)
tensor([[7.0, 16.0],
[8.0, 20.0],
[7.0, 19.0]])
```
### Sparse Sparse Matrix Multiplication
```
torch_sparse.spspmm(indexA, valueA, indexB, valueB, m, k, n) -> (torch.LongTensor, torch.Tensor)
```
Matrix product of two sparse tensors.
Both input sparse matrices need to be **coalesced** (use the optional `coalesced` argument to force coalescing).
#### Parameters
* **indexA** *(LongTensor)* - The index tensor of first sparse matrix.
* **valueA** *(Tensor)* - The value tensor of first sparse matrix.
* **indexB** *(LongTensor)* - The index tensor of second sparse matrix.
* **valueB** *(Tensor)* - The value tensor of second sparse matrix.
* **m** *(int)* - The first dimension of first sparse matrix.
* **k** *(int)* - The second dimension of first sparse matrix and first dimension of second sparse matrix.
* **n** *(int)* - The second dimension of second sparse matrix.
* **coalesced** *(bool, optional)* - If set to `True`, will coalesce both input sparse matrices. (default: `False`)
#### Returns
* **index** *(LongTensor)* - The output index tensor of sparse matrix.
* **value** *(Tensor)* - The output value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import spspmm
indexA = torch.tensor([[0, 0, 1, 2, 2], [1, 2, 0, 0, 1]])
valueA = torch.Tensor([1, 2, 3, 4, 5])
indexB = torch.tensor([[0, 2], [1, 0]])
valueB = torch.Tensor([2, 4])
indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2)
```
```
print(indexC)
tensor([[0, 1, 2],
[0, 1, 1]])
print(valueC)
tensor([8.0, 6.0, 8.0])
```
## C++ API
`torch-sparse` also offers a C++ API that contains a C++ equivalent of the Python models.
```
mkdir build
cd build
# Add -DWITH_CUDA=on to enable CUDA support if needed
cmake ..
make
make install
```
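Once installed, the library can be consumed from a downstream CMake project. A hypothetical consumer `CMakeLists.txt` is sketched below; the package and target names are assumptions and should be checked against the files produced by `make install`:
```
# Hypothetical consumer CMakeLists.txt (package/target names assumed)
cmake_minimum_required(VERSION 3.10)
project(my_app)

find_package(Torch REQUIRED)
find_package(TorchSparse REQUIRED)  # assumes a TorchSparseConfig.cmake was installed

add_executable(my_app main.cpp)
target_link_libraries(my_app PRIVATE ${TORCH_LIBRARIES} TorchSparse::TorchSparse)
```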
## Running tests
```
pytest
```
Keywords: pytorch,sparse,sparse-matrices,autograd
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: test
###
#
# @copyright (c) 2009-2014 The University of Tennessee and The University
# of Tennessee Research Foundation.
# All rights reserved.
# @copyright (c) 2012-2014 Inria. All rights reserved.
# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
#
###
#
# - Find METIS include dirs and libraries
# Use this module by invoking find_package with the form:
# find_package(METIS
# [REQUIRED] # Fail with error if metis is not found
# )
#
# This module finds headers and metis library.
# Results are reported in variables:
# METIS_FOUND - True if headers and requested libraries were found
# METIS_INCLUDE_DIRS - metis include directories
# METIS_LIBRARY_DIRS - Link directories for metis libraries
# METIS_LIBRARIES - metis component libraries to be linked
#
# The user can give specific paths where to find the libraries adding cmake
# options at configure (ex: cmake path/to/project -DMETIS_DIR=path/to/metis):
# METIS_DIR - Where to find the base directory of metis
# METIS_INCDIR - Where to find the header files
# METIS_LIBDIR - Where to find the library files
# The module can also look for the following environment variables if paths
# are not given as cmake variable: METIS_DIR, METIS_INCDIR, METIS_LIBDIR
#=============================================================================
# Copyright 2012-2013 Inria
# Copyright 2012-2013 Emmanuel Agullo
# Copyright 2012-2013 Mathieu Faverge
# Copyright 2012 Cedric Castagnede
# Copyright 2013 Florent Pruvost
#
# Distributed under the OSI-approved BSD License (the "License");
# see accompanying file MORSE-Copyright.txt for details.
#
# This software is distributed WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the License for more information.
#=============================================================================
# (To distribute this file outside of Morse, substitute the full
# License text for the above reference.)
if (NOT METIS_FOUND)
set(METIS_DIR "" CACHE PATH "Installation directory of METIS library")
if (NOT METIS_FIND_QUIETLY)
message(STATUS "A cache variable, namely METIS_DIR, has been set to specify the install directory of METIS")
endif()
endif()
# Looking for include
# -------------------
# Add system include paths to search include
# ------------------------------------------
unset(_inc_env)
set(ENV_METIS_DIR "$ENV{METIS_DIR}")
set(ENV_METIS_INCDIR "$ENV{METIS_INCDIR}")
if(ENV_METIS_INCDIR)
list(APPEND _inc_env "${ENV_METIS_INCDIR}")
elseif(ENV_METIS_DIR)
list(APPEND _inc_env "${ENV_METIS_DIR}")
list(APPEND _inc_env "${ENV_METIS_DIR}/include")
list(APPEND _inc_env "${ENV_METIS_DIR}/include/metis")
else()
if(WIN32)
string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
else()
string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
list(APPEND _inc_env "${_path_env}")
string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
list(APPEND _inc_env "${_path_env}")
string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
list(APPEND _inc_env "${_path_env}")
string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
list(APPEND _inc_env "${_path_env}")
endif()
endif()
list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
list(REMOVE_DUPLICATES _inc_env)
# Try to find the metis header in the given paths
# -------------------------------------------------
# call cmake macro to find the header path
if(METIS_INCDIR)
set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
find_path(METIS_metis.h_DIRS
NAMES metis.h
HINTS ${METIS_INCDIR})
else()
if(METIS_DIR)
set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
find_path(METIS_metis.h_DIRS
NAMES metis.h
HINTS ${METIS_DIR}
PATH_SUFFIXES "include" "include/metis")
else()
set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
find_path(METIS_metis.h_DIRS
NAMES metis.h
HINTS ${_inc_env})
endif()
endif()
mark_as_advanced(METIS_metis.h_DIRS)
# If found, add path to cmake variable
# ------------------------------------
if (METIS_metis.h_DIRS)
set(METIS_INCLUDE_DIRS "${METIS_metis.h_DIRS}")
else ()
set(METIS_INCLUDE_DIRS "METIS_INCLUDE_DIRS-NOTFOUND")
if(NOT METIS_FIND_QUIETLY)
message(STATUS "Looking for metis -- metis.h not found")
endif()
endif()
# Looking for lib
# ---------------
# Add system library paths to search lib
# --------------------------------------
unset(_lib_env)
set(ENV_METIS_LIBDIR "$ENV{METIS_LIBDIR}")
if(ENV_METIS_LIBDIR)
list(APPEND _lib_env "${ENV_METIS_LIBDIR}")
elseif(ENV_METIS_DIR)
list(APPEND _lib_env "${ENV_METIS_DIR}")
list(APPEND _lib_env "${ENV_METIS_DIR}/lib")
else()
if(WIN32)
string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
else()
if(APPLE)
string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
else()
string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
endif()
list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
endif()
endif()
list(REMOVE_DUPLICATES _lib_env)
# Try to find the metis lib in the given paths
# ----------------------------------------------
# call cmake macro to find the lib path
if(METIS_LIBDIR)
set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
find_library(METIS_metis_LIBRARY
NAMES metis
HINTS ${METIS_LIBDIR})
else()
if(METIS_DIR)
set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
find_library(METIS_metis_LIBRARY
NAMES metis
HINTS ${METIS_DIR}
PATH_SUFFIXES lib lib32 lib64)
else()
set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
find_library(METIS_metis_LIBRARY
NAMES metis
HINTS ${_lib_env})
endif()
endif()
mark_as_advanced(METIS_metis_LIBRARY)
# If found, add path to cmake variable
# ------------------------------------
if (METIS_metis_LIBRARY)
get_filename_component(metis_lib_path "${METIS_metis_LIBRARY}" PATH)
# set cmake variables
set(METIS_LIBRARIES "${METIS_metis_LIBRARY}")
set(METIS_LIBRARY_DIRS "${metis_lib_path}")
else ()
set(METIS_LIBRARIES "METIS_LIBRARIES-NOTFOUND")
set(METIS_LIBRARY_DIRS "METIS_LIBRARY_DIRS-NOTFOUND")
if(NOT METIS_FIND_QUIETLY)
message(STATUS "Looking for metis -- lib metis not found")
endif()
endif ()
# check a function to validate the find
if(METIS_LIBRARIES)
set(REQUIRED_INCDIRS)
set(REQUIRED_LIBDIRS)
set(REQUIRED_LIBS)
# METIS
if (METIS_INCLUDE_DIRS)
set(REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}")
endif()
if (METIS_LIBRARY_DIRS)
set(REQUIRED_LIBDIRS "${METIS_LIBRARY_DIRS}")
endif()
set(REQUIRED_LIBS "${METIS_LIBRARIES}")
# m
find_library(M_LIBRARY NAMES m)
mark_as_advanced(M_LIBRARY)
if(M_LIBRARY)
list(APPEND REQUIRED_LIBS "-lm")
endif()
# set required libraries for link
set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
set(CMAKE_REQUIRED_LIBRARIES)
foreach(lib_dir ${REQUIRED_LIBDIRS})
list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
endforeach()
list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
# test link
unset(METIS_WORKS CACHE)
include(CheckFunctionExists)
check_function_exists(METIS_NodeND METIS_WORKS)
mark_as_advanced(METIS_WORKS)
if(NOT METIS_WORKS)
if(NOT METIS_FIND_QUIETLY)
message(STATUS "Looking for METIS : test of METIS_NodeND with METIS library fails")
message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
endif()
endif()
set(CMAKE_REQUIRED_INCLUDES)
set(CMAKE_REQUIRED_FLAGS)
set(CMAKE_REQUIRED_LIBRARIES)
endif()
if (METIS_LIBRARIES)
list(GET METIS_LIBRARIES 0 first_lib)
get_filename_component(first_lib_path "${first_lib}" PATH)
if (${first_lib_path} MATCHES "/lib(32|64)?$")
string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
set(METIS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of METIS library" FORCE)
else()
set(METIS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of METIS library" FORCE)
endif()
endif()
mark_as_advanced(METIS_DIR)
mark_as_advanced(METIS_DIR_FOUND)
# check that METIS has been found
# ---------------------------------
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(METIS DEFAULT_MSG
METIS_LIBRARIES
METIS_WORKS
METIS_INCLUDE_DIRS)
#
# TODO: Add possibility to check for specific functions in the library
#
```
./build_conda.sh 3.9 1.12.0 cu113 # python, pytorch and cuda version
./build_conda.sh 3.9 1.13.0 cu116 # python, pytorch and cuda version
```
......@@ -10,8 +10,15 @@ if [ "${CUDA_VERSION}" = "cpu" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else
case $CUDA_VERSION in
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.7.*"
;;
cu116)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
if [ "${TORCH_VERSION}" = "1.12.0" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
else
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.6.*"
fi
;;
cu115)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*"
......@@ -39,8 +46,8 @@ echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
if [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
if [ "${TORCH_VERSION}" = "1.12.0" ] && [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c pyg -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
else
conda build . -c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld"
conda build . -c pytorch -c pyg -c default -c nvidia --output-folder "$HOME/conda-bld"
fi
package:
name: pytorch-sparse
version: 0.6.15
version: 0.6.16
source:
path: ../..
......
......@@ -365,11 +365,11 @@ hetero_sample(const vector<node_t> &node_types,
if (temporal) {
for (const auto &kv : temp_samples_dict) {
slice_dict[kv.first] = {0, kv.second.size()};
slice_dict[kv.first] = {slice_dict.at(kv.first).second, kv.second.size()};
}
} else {
for (const auto &kv : samples_dict)
slice_dict[kv.first] = {0, kv.second.size()};
slice_dict[kv.first] = {slice_dict.at(kv.first).second, kv.second.size()};
}
}
......@@ -504,4 +504,4 @@ hetero_temporal_neighbor_sample_cpu(
node_types, edge_types, colptr_dict, row_dict, input_node_dict,
num_neighbors_dict, node_time_dict, num_hops);
}
}
\ No newline at end of file
}
......@@ -44,7 +44,7 @@ spmm_cpu(torch::Tensor rowptr, torch::Tensor col,
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, mat.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "spmm_cpu", [&] {
scalar_t *value_data = nullptr;
auto mat_data = mat.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......@@ -123,7 +123,7 @@ torch::Tensor spmm_value_bw_cpu(torch::Tensor row, torch::Tensor rowptr,
auto row_data = row.data_ptr<int64_t>();
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, mat.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "spmm_value_bw_cpu", [&] {
auto mat_data = mat.data_ptr<scalar_t>();
auto grad_data = grad.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
#include "spspmm_cpu.h"
#include "utils.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cpu(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce) {
CHECK_CPU(rowptrA);
CHECK_CPU(colA);
if (optional_valueA.has_value())
CHECK_CPU(optional_valueA.value());
CHECK_CPU(rowptrB);
CHECK_CPU(colB);
if (optional_valueB.has_value())
CHECK_CPU(optional_valueB.value());
CHECK_INPUT(rowptrA.dim() == 1);
CHECK_INPUT(colA.dim() == 1);
if (optional_valueA.has_value()) {
CHECK_INPUT(optional_valueA.value().dim() == 1);
CHECK_INPUT(optional_valueA.value().size(0) == colA.size(0));
}
CHECK_INPUT(rowptrB.dim() == 1);
CHECK_INPUT(colB.dim() == 1);
if (optional_valueB.has_value()) {
CHECK_INPUT(optional_valueB.value().dim() == 1);
CHECK_INPUT(optional_valueB.value().size(0) == colB.size(0));
}
if (!optional_valueA.has_value() && optional_valueB.has_value())
optional_valueA =
torch::ones({colA.numel()}, optional_valueB.value().options());
if (!optional_valueB.has_value() && optional_valueA.has_value())
optional_valueB =
torch::ones({colB.numel()}, optional_valueA.value().options());
auto scalar_type = torch::ScalarType::Float;
if (optional_valueA.has_value())
scalar_type = optional_valueA.value().scalar_type();
auto rowptrA_data = rowptrA.data_ptr<int64_t>();
auto colA_data = colA.data_ptr<int64_t>();
auto rowptrB_data = rowptrB.data_ptr<int64_t>();
auto colB_data = colB.data_ptr<int64_t>();
auto rowptrC = torch::empty_like(rowptrA);
auto rowptrC_data = rowptrC.data_ptr<int64_t>();
rowptrC_data[0] = 0;
torch::Tensor colC;
torch::optional<torch::Tensor> optional_valueC = torch::nullopt;
AT_DISPATCH_ALL_TYPES(scalar_type, "spspmm", [&] {
AT_DISPATCH_HAS_VALUE(optional_valueA, [&] {
scalar_t *valA_data = nullptr, *valB_data = nullptr;
if (HAS_VALUE) {
valA_data = optional_valueA.value().data_ptr<scalar_t>();
valB_data = optional_valueB.value().data_ptr<scalar_t>();
}
int64_t nnz = 0, cA, cB;
std::vector<scalar_t> tmp_vals(K, 0);
std::vector<int64_t> cols;
std::vector<scalar_t> vals;
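// Row-wise CSR x CSR product (Gustavson's algorithm): for each row of A,
// accumulate the scaled rows of B into a dense buffer of length K, then
// compress its non-zeros into the corresponding row of C.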
for (auto rA = 0; rA < rowptrA.numel() - 1; rA++) {
for (auto eA = rowptrA_data[rA]; eA < rowptrA_data[rA + 1]; eA++) {
cA = colA_data[eA];
for (auto eB = rowptrB_data[cA]; eB < rowptrB_data[cA + 1]; eB++) {
cB = colB_data[eB];
if (HAS_VALUE)
tmp_vals[cB] += valA_data[eA] * valB_data[eB];
else
tmp_vals[cB]++;
}
}
for (auto k = 0; k < K; k++) {
if (tmp_vals[k] != 0) {
cols.push_back(k);
if (HAS_VALUE)
vals.push_back(tmp_vals[k]);
nnz++;
}
tmp_vals[k] = (scalar_t)0;
}
rowptrC_data[rA + 1] = nnz;
}
colC = torch::from_blob(cols.data(), {nnz}, colA.options()).clone();
if (HAS_VALUE) {
optional_valueC = torch::from_blob(vals.data(), {nnz},
optional_valueA.value().options());
optional_valueC = optional_valueC.value().clone();
}
});
});
return std::make_tuple(rowptrC, colC, optional_valueC);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cpu(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce);
......@@ -5,6 +5,7 @@
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#define CHECK_LT(low, high) AT_ASSERTM(low < high, "low must be smaller than high")
#define AT_DISPATCH_HAS_VALUE(optional_value, ...) \
[&] { \
......
......@@ -5,7 +5,7 @@ static inline __device__ void atomAdd(float *address, float val) {
}
static inline __device__ void atomAdd(double *address, double val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000))
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
......
......@@ -9,7 +9,7 @@
#define FULL_MASK 0xffffffff
// Paper: Design Principles for Sparse Matrix Multiplication on the GPU
// Code: https://github.com/owensgroup/merge-spmm
template <typename scalar_t, ReductionType REDUCE, bool HAS_VALUE>
__global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
const scalar_t *value_data,
......@@ -20,22 +20,22 @@ __global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
// across `blockIdx.y` are treated equally.
int thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
int row = thread_idx >> 5; // thread_idx / 32
int lane_idx = thread_idx & (32 - 1); // thread_idx % 32
int row = thread_idx >> 6; // thread_idx / 64
int lane_idx = thread_idx & (64 - 1); // thread_idx % 64
int batch_idx = row / M;
// Compute the column index of `mat` in which the thread is operating.
int mat_col_idx = lane_idx + (blockIdx.y << 5);
int mat_col_idx = lane_idx + (blockIdx.y << 6);
// Compute the output index (row-major order).
int out_idx = row * K + mat_col_idx;
// Helper arrays for warp communication.
int mat_row, mat_rows[32];
scalar_t val, vals[HAS_VALUE ? 32 : 1];
int mat_row, mat_rows[64];
scalar_t val, vals[HAS_VALUE ? 64 : 1];
// Do not aggregate/write across the Y-axis (lane_idx < leftover).
int leftover = K - (blockIdx.y << 5);
int leftover = K - (blockIdx.y << 6);
if (batch_idx < B) {
int row_start = __ldg(rowptr_data + (row % M));
......@@ -46,7 +46,7 @@ __global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
int64_t arg;
// Iterate over all `col` indices in parallel within a warp.
for (int c = row_start; c < row_end; c += 32) {
for (int c = row_start; c < row_end; c += 64) {
if (col_idx < row_end) {
// Coalesced memory access into `col` and `val`.
......@@ -58,18 +58,18 @@ __global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
if (HAS_VALUE)
val = (scalar_t)0;
}
col_idx += 32;
col_idx += 64;
#pragma unroll
for (int i = 0; i < 32; i++) {
for (int i = 0; i < 64; i++) {
// Communication between all threads in a warp.
mat_rows[i] = __shfl_sync(FULL_MASK, mat_row, i);
mat_rows[i] = SHFL_SYNC(FULL_MASK, mat_row, i);
if (HAS_VALUE)
vals[i] = __shfl_sync(FULL_MASK, val, i);
vals[i] = SHFL_SYNC(FULL_MASK, val, i);
}
#pragma unroll
for (int i = 0; i < 32; i++) {
for (int i = 0; i < 64; i++) {
if (lane_idx < leftover && mat_rows[i] != -1) {
// Coalesced memory access into `mat`.
val = __ldg(mat_data + batch_idx * N * K + mat_rows[i] + mat_col_idx);
......@@ -129,7 +129,7 @@ spmm_cuda(torch::Tensor rowptr, torch::Tensor col,
auto N = mat.size(-2);
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
auto BLOCKS = dim3((32 * B * M + THREADS - 1) / THREADS, (K + 31) / 32);
auto BLOCKS = dim3((64 * B * M + THREADS - 1) / THREADS, (K + 63) / 64);
auto stream = at::cuda::getCurrentCUDAStream();
......@@ -154,6 +154,7 @@ spmm_cuda(torch::Tensor rowptr, torch::Tensor col,
return std::make_tuple(out, arg_out);
}
template <typename scalar_t, ReductionType REDUCE>
__global__ void
spmm_value_bw_kernel(const int64_t *row_data, const int64_t *rowptr_data,
......@@ -162,8 +163,8 @@ spmm_value_bw_kernel(const int64_t *row_data, const int64_t *rowptr_data,
int M, int N, int E, int K) {
int thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
int index_idx = (thread_idx >> 5); // thread_idx / 32
int lane_idx = thread_idx & (32 - 1); // thread_idx % 32
int index_idx = (thread_idx >> 6); // thread_idx / 64
int lane_idx = thread_idx & (64 - 1); // thread_idx % 64
if (index_idx < E) {
int row = __ldg(row_data + index_idx);
......@@ -171,15 +172,15 @@ spmm_value_bw_kernel(const int64_t *row_data, const int64_t *rowptr_data,
scalar_t val = (scalar_t)0;
for (int b = 0; b < B; b++) {
for (int k = lane_idx; k < K; k += 32) {
for (int k = lane_idx; k < K; k += 64) {
val += mat_data[b * N * K + col * K + k] *
grad_data[b * M * K + row * K + k];
}
}
#pragma unroll
for (int i = 32 / 2; i > 0; i /= 2) { // Parallel reduction inside a warp.
val += __shfl_down_sync(FULL_MASK, val, i);
for (int i = 64 / 2; i > 0; i /= 2) { // Parallel reduction inside a warp/wavefront.
val += SHFL_DOWN_SYNC(FULL_MASK, val, i);
}
if (lane_idx == 0) {
......@@ -211,7 +212,7 @@ torch::Tensor spmm_value_bw_cuda(torch::Tensor row, torch::Tensor rowptr,
auto E = row.numel();
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
auto BLOCKS = dim3((E * 32 + THREADS - 1) / THREADS);
auto BLOCKS = dim3((E * 64 + THREADS - 1) / THREADS);
auto out = torch::zeros({row.numel()}, grad.options());
......
#include "spspmm_cuda.h"
#include <ATen/cuda/CUDAContext.h>
#include <cusparse.h>
#include "utils.cuh"
#define AT_DISPATCH_CUSPARSE_TYPES(TYPE, ...) \
[&] { \
switch (TYPE) { \
case torch::ScalarType::Float: { \
using scalar_t = float; \
const auto &cusparsecsrgemm2_bufferSizeExt = \
cusparseScsrgemm2_bufferSizeExt; \
const auto &cusparsecsrgemm2 = cusparseScsrgemm2; \
return __VA_ARGS__(); \
} \
case torch::ScalarType::Double: { \
using scalar_t = double; \
const auto &cusparsecsrgemm2_bufferSizeExt = \
cusparseDcsrgemm2_bufferSizeExt; \
const auto &cusparsecsrgemm2 = cusparseDcsrgemm2; \
return __VA_ARGS__(); \
} \
default: \
AT_ERROR("Not implemented for '", toString(TYPE), "'"); \
} \
}()
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cuda(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce) {
CHECK_CUDA(rowptrA);
CHECK_CUDA(colA);
if (optional_valueA.has_value())
CHECK_CUDA(optional_valueA.value());
CHECK_CUDA(rowptrB);
CHECK_CUDA(colB);
if (optional_valueB.has_value())
CHECK_CUDA(optional_valueB.value());
cudaSetDevice(rowptrA.get_device());
CHECK_INPUT(rowptrA.dim() == 1);
CHECK_INPUT(colA.dim() == 1);
if (optional_valueA.has_value()) {
CHECK_INPUT(optional_valueA.value().dim() == 1);
CHECK_INPUT(optional_valueA.value().size(0) == colA.size(0));
}
CHECK_INPUT(rowptrB.dim() == 1);
CHECK_INPUT(colB.dim() == 1);
if (optional_valueB.has_value()) {
CHECK_INPUT(optional_valueB.value().dim() == 1);
CHECK_INPUT(optional_valueB.value().size(0) == colB.size(0));
}
if (!optional_valueA.has_value() && optional_valueB.has_value())
optional_valueA =
torch::ones({colA.numel()}, optional_valueB.value().options());
if (!optional_valueB.has_value() && optional_valueA.has_value())
optional_valueB =
torch::ones({colB.numel()}, optional_valueA.value().options());
auto scalar_type = torch::ScalarType::Float;
if (optional_valueA.has_value())
scalar_type = optional_valueA.value().scalar_type();
auto handle = at::cuda::getCurrentCUDASparseHandle();
cusparseMatDescr_t descr;
cusparseCreateMatDescr(&descr);
rowptrA = rowptrA.toType(torch::kInt);
colA = colA.toType(torch::kInt);
rowptrB = rowptrB.toType(torch::kInt);
colB = colB.toType(torch::kInt);
int64_t M = rowptrA.numel() - 1, N = rowptrB.numel() - 1;
auto rowptrA_data = rowptrA.data_ptr<int>();
auto colA_data = colA.data_ptr<int>();
auto rowptrB_data = rowptrB.data_ptr<int>();
auto colB_data = colB.data_ptr<int>();
torch::Tensor rowptrC, colC;
torch::optional<torch::Tensor> optional_valueC = torch::nullopt;
int nnzC;
int *nnzTotalDevHostPtr = &nnzC;
// Step 1: Create an opaque structure.
csrgemm2Info_t info = NULL;
cusparseCreateCsrgemm2Info(&info);
// Step 2: Allocate buffer for `csrgemm2Nnz` and `csrgemm2`.
size_t bufferSize;
AT_DISPATCH_CUSPARSE_TYPES(scalar_type, [&] {
scalar_t alpha = (scalar_t)1.0;
cusparsecsrgemm2_bufferSizeExt(handle, M, N, K, &alpha, descr, colA.numel(),
rowptrA_data, colA_data, descr, colB.numel(),
rowptrB_data, colB_data, NULL, descr, 0,
NULL, NULL, info, &bufferSize);
void *buffer = NULL;
cudaMalloc(&buffer, bufferSize);
// Step 3: Compute CSR row pointer.
rowptrC = torch::empty({M + 1}, rowptrA.options());
auto rowptrC_data = rowptrC.data_ptr<int>();
cusparseXcsrgemm2Nnz(handle, M, N, K, descr, colA.numel(), rowptrA_data,
colA_data, descr, colB.numel(), rowptrB_data,
colB_data, descr, 0, NULL, NULL, descr, rowptrC_data,
nnzTotalDevHostPtr, info, buffer);
// Step 4: Compute CSR entries.
colC = torch::empty({nnzC}, rowptrC.options());
auto colC_data = colC.data_ptr<int>();
if (optional_valueA.has_value())
optional_valueC = torch::empty({nnzC}, optional_valueA.value().options());
scalar_t *valA_data = NULL, *valB_data = NULL, *valC_data = NULL;
if (optional_valueA.has_value()) {
valA_data = optional_valueA.value().data_ptr<scalar_t>();
valB_data = optional_valueB.value().data_ptr<scalar_t>();
valC_data = optional_valueC.value().data_ptr<scalar_t>();
}
cusparsecsrgemm2(handle, M, N, K, &alpha, descr, colA.numel(), valA_data,
rowptrA_data, colA_data, descr, colB.numel(), valB_data,
rowptrB_data, colB_data, NULL, descr, 0, NULL, NULL, NULL,
descr, valC_data, rowptrC_data, colC_data, info, buffer);
cudaFree(buffer);
});
// Step 5: Destroy the opaque structure.
cusparseDestroyCsrgemm2Info(info);
rowptrC = rowptrC.toType(torch::kLong);
colC = colC.toType(torch::kLong);
return std::make_tuple(rowptrC, colC, optional_valueC);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cuda(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce);
......@@ -17,6 +17,20 @@ __device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
return __shfl_down_sync(mask, var.operator __half(), delta);
}
#ifdef USE_ROCM
__device__ __inline__ at::Half __shfl_up(const at::Half var, const unsigned int delta) {
return __shfl_up(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down(const at::Half var, const unsigned int delta) {
return __shfl_down(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl(const at::Half var, int delta) {
return __shfl(var.operator __half(), delta);
}
#endif
#ifdef USE_ROCM
__device__ __inline__ at::Half __ldg(const at::Half* ptr) {
return __ldg(reinterpret_cast<const __half*>(ptr));
......
#pragma once
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
static inline __device__ void atomAdd(double *address, double val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
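// Emulate double-precision atomicAdd with a compare-and-swap loop on
// platforms without native hardware support.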
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
#pragma once
#include "../extensions.h"
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E);
#include "hip/hip_runtime.h"
#include "convert_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
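// Converts a sorted index vector of length `numel` into a CSR-style row
// pointer of length M + 1; each thread fills the pointer entries owned by
// its index.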
__global__ void ind2ptr_kernel(const int64_t *ind_data, int64_t *out_data,
int64_t M, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx == 0) {
for (int64_t i = 0; i <= ind_data[0]; i++)
out_data[i] = 0;
} else if (thread_idx < numel) {
for (int64_t i = ind_data[thread_idx - 1]; i < ind_data[thread_idx]; i++)
out_data[i + 1] = thread_idx;
} else if (thread_idx == numel) {
for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
out_data[i] = numel;
}
}
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M) {
CHECK_CUDA(ind);
hipSetDevice(ind.get_device());
auto out = torch::empty(M + 1, ind.options());
if (ind.numel() == 0)
return out.zero_();
auto ind_data = ind.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
ind2ptr_kernel<<<(ind.numel() + 2 + THREADS - 1) / THREADS, THREADS, 0,
stream>>>(ind_data, out_data, M, ind.numel());
return out;
}
__global__ void ptr2ind_kernel(const int64_t *ptr_data, int64_t *out_data,
int64_t E, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx < numel) {
int64_t idx = ptr_data[thread_idx], next_idx = ptr_data[thread_idx + 1];
for (int64_t i = idx; i < next_idx; i++) {
out_data[i] = thread_idx;
}
}
}
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E) {
CHECK_CUDA(ptr);
hipSetDevice(ptr.get_device());
auto out = torch::empty(E, ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
ptr2ind_kernel<<<(ptr.numel() - 1 + THREADS - 1) / THREADS, THREADS, 0,
stream>>>(ptr_data, out_data, E, ptr.numel() - 1);
return out;
}