Commit 2951b12d authored by aiss

Push v0.6.18

parent e8309f27
......@@ -7,7 +7,7 @@
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__hgt_sample_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__hgt_sample_cpu(void) { return NULL; }
......
#pragma once
#ifdef _WIN32
#if defined(torchsparse_EXPORTS)
#define SPARSE_API __declspec(dllexport)
#else
#define SPARSE_API __declspec(dllimport)
#endif
#else
#define SPARSE_API
#endif
#if (defined __cpp_inline_variables) || __cplusplus >= 201703L
#define SPARSE_INLINE_VARIABLE inline
#else
#ifdef _MSC_VER
#define SPARSE_INLINE_VARIABLE __declspec(selectany)
#else
#define SPARSE_INLINE_VARIABLE __attribute__((weak))
#endif
#endif
......@@ -7,7 +7,7 @@
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__metis_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__metis_cpu(void) { return NULL; }
......@@ -19,7 +19,7 @@ SPARSE_API torch::Tensor partition(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
int64_t num_parts, bool recursive) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -35,7 +35,7 @@ SPARSE_API torch::Tensor partition2(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -52,7 +52,7 @@ SPARSE_API torch::Tensor mt_partition(torch::Tensor rowptr, torch::Tensor col,
int64_t num_parts, bool recursive,
int64_t num_workers) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......
......@@ -7,7 +7,7 @@
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__neighbor_sample_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__neighbor_sample_cpu(void) { return NULL; }
......@@ -16,7 +16,8 @@ PyMODINIT_FUNC PyInit__neighbor_sample_cpu(void) { return NULL; }
#endif
// Returns 'output_node', 'row', 'col', 'output_edge'
SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
SPARSE_API
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
neighbor_sample(const torch::Tensor &colptr, const torch::Tensor &row,
const torch::Tensor &input_node,
const std::vector<int64_t> num_neighbors, const bool replace,
......@@ -25,7 +26,8 @@ neighbor_sample(const torch::Tensor &colptr, const torch::Tensor &row,
directed);
}
SPARSE_API std::tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
SPARSE_API
std::tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hetero_neighbor_sample(
const std::vector<node_t> &node_types,
......@@ -40,7 +42,25 @@ hetero_neighbor_sample(
num_neighbors_dict, num_hops, replace, directed);
}
std::tuple<c10::Dict<node_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>,
c10::Dict<rel_t, torch::Tensor>, c10::Dict<rel_t, torch::Tensor>>
hetero_temporal_neighbor_sample(
const std::vector<node_t> &node_types,
const std::vector<edge_t> &edge_types,
const c10::Dict<rel_t, torch::Tensor> &colptr_dict,
const c10::Dict<rel_t, torch::Tensor> &row_dict,
const c10::Dict<node_t, torch::Tensor> &input_node_dict,
const c10::Dict<rel_t, std::vector<int64_t>> &num_neighbors_dict,
const c10::Dict<node_t, torch::Tensor> &node_time_dict,
const int64_t num_hops, const bool replace, const bool directed) {
return hetero_temporal_neighbor_sample_cpu(
node_types, edge_types, colptr_dict, row_dict, input_node_dict,
num_neighbors_dict, node_time_dict, num_hops, replace, directed);
}
static auto registry =
torch::RegisterOperators()
.op("torch_sparse::neighbor_sample", &neighbor_sample)
.op("torch_sparse::hetero_neighbor_sample", &hetero_neighbor_sample);
.op("torch_sparse::hetero_neighbor_sample", &hetero_neighbor_sample)
.op("torch_sparse::hetero_temporal_neighbor_sample",
&hetero_temporal_neighbor_sample);
......@@ -7,7 +7,7 @@
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__relabel_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__relabel_cpu(void) { return NULL; }
......@@ -18,7 +18,7 @@ PyMODINIT_FUNC PyInit__relabel_cpu(void) { return NULL; }
SPARSE_API std::tuple<torch::Tensor, torch::Tensor> relabel(torch::Tensor col,
torch::Tensor idx) {
if (col.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -34,7 +34,7 @@ relabel_one_hop(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::Tensor idx, bool bipartite) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......
......@@ -5,13 +5,13 @@
#include "cpu/rw_cpu.h"
#ifdef WITH_HIP
#include "hip/rw_hip.h"
#ifdef WITH_CUDA
#include "cuda/rw_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__rw_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__rw_cpu(void) { return NULL; }
......@@ -22,7 +22,7 @@ PyMODINIT_FUNC PyInit__rw_cpu(void) { return NULL; }
SPARSE_API torch::Tensor random_walk(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor start, int64_t walk_length) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return random_walk_cuda(rowptr, col, start, walk_length);
#else
AT_ERROR("Not compiled with CUDA support");
......
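The random-walk kernel dispatched above has the CSR signature shown in its declaration; a minimal sketch, assuming the operator is registered under torch_sparse::random_walk like the samplers in this commit:

    import torch
    import torch_sparse  # noqa: F401

    # CSR of a directed 3-cycle: 0 -> 1 -> 2 -> 0
    rowptr = torch.tensor([0, 1, 2, 3])
    col = torch.tensor([1, 2, 0])
    start = torch.tensor([0, 1, 2])

    # One walk per start node, advanced for `walk_length` steps.
    walks = torch.ops.torch_sparse.random_walk(rowptr, col, start, 4)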
......@@ -7,7 +7,7 @@
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__saint_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__saint_cpu(void) { return NULL; }
......@@ -19,7 +19,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
subgraph(torch::Tensor idx, torch::Tensor rowptr, torch::Tensor row,
torch::Tensor col) {
if (idx.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......
......@@ -7,7 +7,7 @@
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__sample_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__sample_cpu(void) { return NULL; }
......@@ -19,7 +19,7 @@ SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor
sample_adj(torch::Tensor rowptr, torch::Tensor col, torch::Tensor idx,
int64_t num_neighbors, bool replace) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
......
#pragma once
#include <torch/library.h>
#include "extensions.h"
#include "macros.h"
#ifdef _WIN32
#if defined(torchsparse_EXPORTS)
#define SPARSE_API __declspec(dllexport)
#else
#define SPARSE_API __declspec(dllimport)
#endif
#else
#define SPARSE_API
#endif
namespace sparse {
SPARSE_API int64_t cuda_version() noexcept;
SPARSE_API int64_t cuda_version();
namespace detail {
SPARSE_INLINE_VARIABLE int64_t _cuda_version = cuda_version();
} // namespace detail
} // namespace sparse
SPARSE_API torch::Tensor ind2ptr(torch::Tensor ind, int64_t M);
SPARSE_API torch::Tensor ptr2ind(torch::Tensor ptr, int64_t E);
SPARSE_API torch::Tensor partition(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
int64_t num_parts, bool recursive);
SPARSE_API torch::Tensor partition2(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive);
SPARSE_API torch::Tensor mt_partition(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive,
int64_t num_workers);
SPARSE_API torch::Tensor
partition(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value, int64_t num_parts,
bool recursive);
SPARSE_API torch::Tensor
partition2(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive);
SPARSE_API torch::Tensor
mt_partition(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::optional<torch::Tensor> optional_node_weight,
int64_t num_parts, bool recursive, int64_t num_workers);
SPARSE_API std::tuple<torch::Tensor, torch::Tensor> relabel(torch::Tensor col,
torch::Tensor idx);
torch::Tensor idx);
SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>,
torch::Tensor>
SPARSE_API std::tuple<torch::Tensor, torch::Tensor,
torch::optional<torch::Tensor>, torch::Tensor>
relabel_one_hop(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value,
torch::Tensor idx, bool bipartite);
SPARSE_API torch::Tensor random_walk(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor start, int64_t walk_length);
torch::Tensor start, int64_t walk_length);
SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::Tensor>
subgraph(torch::Tensor idx, torch::Tensor rowptr, torch::Tensor row,
torch::Tensor col);
SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
SPARSE_API
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
sample_adj(torch::Tensor rowptr, torch::Tensor col, torch::Tensor idx,
int64_t num_neighbors, bool replace);
SPARSE_API torch::Tensor spmm_sum(torch::optional<torch::Tensor> opt_row,
torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> opt_value,
torch::optional<torch::Tensor> opt_colptr,
torch::optional<torch::Tensor> opt_csr2csc,
torch::Tensor mat);
torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> opt_value,
torch::optional<torch::Tensor> opt_colptr,
torch::optional<torch::Tensor> opt_csr2csc,
torch::Tensor mat);
SPARSE_API torch::Tensor spmm_mean(torch::optional<torch::Tensor> opt_row,
torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> opt_value,
torch::optional<torch::Tensor> opt_rowcount,
torch::optional<torch::Tensor> opt_colptr,
torch::optional<torch::Tensor> opt_csr2csc,
torch::Tensor mat);
torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> opt_value,
torch::optional<torch::Tensor> opt_rowcount,
torch::optional<torch::Tensor> opt_colptr,
torch::optional<torch::Tensor> opt_csr2csc,
torch::Tensor mat);
SPARSE_API std::tuple<torch::Tensor, torch::Tensor>
spmm_min(torch::Tensor rowptr, torch::Tensor col,
......@@ -75,9 +74,3 @@ spmm_min(torch::Tensor rowptr, torch::Tensor col,
SPARSE_API std::tuple<torch::Tensor, torch::Tensor>
spmm_max(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> opt_value, torch::Tensor mat);
SPARSE_API std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_sum(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K);
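The ind2ptr/ptr2ind helpers declared in this header convert between sorted COO indices and CSR pointers; a small round-trip sketch, assuming they are exposed under torch.ops.torch_sparse like the other kernels:

    import torch
    import torch_sparse  # noqa: F401

    row = torch.tensor([0, 0, 1, 3])                 # sorted row indices, M = 4 rows
    rowptr = torch.ops.torch_sparse.ind2ptr(row, 4)  # -> tensor([0, 2, 3, 3, 4])
    back = torch.ops.torch_sparse.ptr2ind(rowptr, row.numel())
    assert back.tolist() == row.tolist()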
......@@ -5,13 +5,13 @@
#include "cpu/spmm_cpu.h"
#ifdef WITH_HIP
#include "hip/spmm_hip.h"
#ifdef WITH_CUDA
#include "cuda/spmm_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__spmm_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__spmm_cpu(void) { return NULL; }
......@@ -24,7 +24,7 @@ spmm_fw(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_value, torch::Tensor mat,
std::string reduce) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spmm_cuda(rowptr, col, optional_value, mat, reduce);
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -38,7 +38,7 @@ torch::Tensor spmm_value_bw(torch::Tensor row, torch::Tensor rowptr,
torch::Tensor col, torch::Tensor mat,
torch::Tensor grad, std::string reduce) {
if (row.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spmm_value_bw_cuda(row, rowptr, col, mat, grad, reduce);
#else
AT_ERROR("Not compiled with CUDA support");
......
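In user code these spmm kernels are usually reached through the SparseTensor matrix-multiply wrapper rather than spmm_fw directly; a sketch of that high-level path, assuming the usual torch_sparse.matmul entry point:

    import torch
    from torch_sparse import SparseTensor, matmul

    row = torch.tensor([0, 0, 1, 2])
    col = torch.tensor([0, 2, 1, 0])
    value = torch.tensor([1.0, 2.0, 3.0, 4.0])
    A = SparseTensor(row=row, col=col, value=value, sparse_sizes=(3, 3))

    x = torch.randn(3, 8)
    out = matmul(A, x, reduce='sum')      # dispatches to spmm_sum / spmm_cuda
    out_max = matmul(A, x, reduce='max')  # 'mean', 'min' and 'max' map to the other kernels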
......@@ -2,15 +2,20 @@
#include <Python.h>
#endif
#include <torch/script.h>
#include "sparse.h"
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#ifdef WITH_CUDA
#ifdef USE_ROCM
#include <hip/hip_version.h>
#else
#include <cuda.h>
#endif
#endif
#include "macros.h"
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
......@@ -18,13 +23,19 @@ PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
#endif
#endif
SPARSE_API int64_t cuda_version() {
#ifdef WITH_HIP
return TORCH_HIP_VERSION;
namespace sparse {
SPARSE_API int64_t cuda_version() noexcept {
#ifdef WITH_CUDA
#ifdef USE_ROCM
return HIP_VERSION;
#else
return CUDA_VERSION;
#endif
#else
return -1;
#endif
}
} // namespace sparse
static auto registry =
torch::RegisterOperators().op("torch_sparse::cuda_version", &cuda_version);
static auto registry = torch::RegisterOperators().op(
"torch_sparse::cuda_version", [] { return sparse::cuda_version(); });
[metadata]
long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
Development Status :: 5 - Production/Stable
License :: OSI Approved :: MIT License
Programming Language :: Python
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3 :: Only
long_description=file: README.md
long_description_content_type=text/markdown
classifiers =
Development Status :: 5 - Production/Stable
License :: OSI Approved :: MIT License
Programming Language :: Python
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3 :: Only
[aliases]
test = pytest
......@@ -17,7 +18,7 @@ test = pytest
[tool:pytest]
addopts = --capture=no
[egg_info]
tag_build =
tag_date = 0
[isort]
multi_line_output=3
include_trailing_comma = True
skip=.gitignore,__init__.py
......@@ -8,23 +8,26 @@ from itertools import product
import torch
from setuptools import find_packages, setup
from torch.__config__ import parallel_info
from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension, CppExtension,
CUDAExtension)
from torch.utils.cpp_extension import (
CUDA_HOME,
BuildExtension,
CppExtension,
CUDAExtension,
)
__version__ = '0.6.13'
__version__ = '0.6.18'
URL = 'https://github.com/rusty1s/pytorch_sparse'
WITH_HIP = torch.cuda.is_available() and CUDA_HOME is not None
suffices = ['cpu', 'cuda'] if WITH_HIP else ['cpu']
WITH_CUDA = False
if torch.cuda.is_available():
WITH_CUDA = CUDA_HOME is not None or torch.version.hip
suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
if os.getenv('FORCE_CUDA', '0') == '1':
suffices = ['cuda', 'cpu']
if os.getenv('FORCE_ONLY_HIP', '0') == '1':
suffices = ['hip']
if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
suffices = ['cuda']
if os.getenv('FORCE_ONLY_CPU', '0') == '1':
suffices = ['cpu']
ROCM_PATH = os.getenv('ROCM_PATH')
HIPLIB2 = osp.join(ROCM_PATH, 'hiprand', 'include')
HIPLIB1 = osp.join(ROCM_PATH, 'hipsparse', 'include')
BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
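The FORCE_* switches above are read from the environment at build time; a hypothetical driver script that forces a CUDA-only build from a checkout of the repository (the flag names are the ones handled in this setup.py):

    import os
    import subprocess
    import sys

    # Build only the 'cuda' flavour of the extensions, regardless of autodetection.
    env = dict(os.environ, FORCE_ONLY_CUDA='1')
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', '--no-build-isolation', '.'],
        env=env)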
......@@ -39,9 +42,12 @@ def get_extensions():
extensions_dir = osp.join('csrc')
main_files = glob.glob(osp.join(extensions_dir, '*.cpp'))
# remove generated 'hip' files, in case of rebuilds
main_files = [path for path in main_files if 'hip' not in path]
for main, suffix in product(main_files, suffices):
define_macros = [('WITH_PYTHON', None)]
undef_macros = []
if sys.platform == 'win32':
define_macros += [('torchsparse_EXPORTS', None)]
......@@ -58,9 +64,11 @@ def get_extensions():
define_macros += [('MTMETIS_64BIT_PARTITIONS', None)]
libraries += ['mtmetis', 'wildriver']
extra_compile_args = {'cxx': ['-O2']}
extra_compile_args = {'cxx': ['-O3']}
if not os.name == 'nt': # Not on Windows:
extra_compile_args['cxx'] += ['-Wno-sign-compare']
if sys.platform == 'darwin': # On macOS:
extra_compile_args['cxx'] += ['-D_LIBCPP_DISABLE_AVAILABILITY']
extra_link_args = [] if WITH_SYMBOLS else ['-s']
info = parallel_info()
......@@ -79,18 +87,19 @@ def get_extensions():
extra_compile_args['cxx'] += ['-arch', 'arm64']
extra_link_args += ['-arch', 'arm64']
if suffix == 'hip':
define_macros += [('WITH_HIP', None)]
hipcc_flags = os.getenv('HIPCC_FLAGS', '')
hipcc_flags = [] if hipcc_flags == '' else hipcc_flags.split(' ')
hipcc_flags += ['--expt-relaxed-constexpr', '-O2']
extra_compile_args['hipcc'] = hipcc_flags
if sys.platform == 'win32':
extra_link_args += ['hipsparse.lib']
if suffix == 'cuda':
define_macros += [('WITH_CUDA', None)]
nvcc_flags = os.getenv('NVCC_FLAGS', '')
nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
nvcc_flags += ['-O3']
if torch.version.hip:
# USE_ROCM was added to later versions of PyTorch
# Define here to support older PyTorch versions as well:
define_macros += [('USE_ROCM', None)]
undef_macros += ['__HIP_NO_HALF_CONVERSIONS__']
else:
extra_link_args += ['-lhipsparse', '-l', 'hipsparse']
extra_link_args += ['-fopenmp','-lomp']
nvcc_flags += ['--expt-relaxed-constexpr']
extra_compile_args['nvcc'] = nvcc_flags
name = main.split(os.sep)[-1][:-4]
sources = [main]
......@@ -99,17 +108,19 @@ def get_extensions():
if osp.exists(path):
sources += [path]
path = osp.join(extensions_dir, 'hip', f'{name}_hip.hip')
if suffix == 'hip' and osp.exists(path):
path = osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')
if suffix == 'cuda' and osp.exists(path):
sources += [path]
phmap_dir = osp.abspath("third_party/parallel-hashmap")
Extension = CppExtension if suffix == 'cpu' else CUDAExtension
define_macros += [('TORCH_HIP_VERSION', 10000), ('__HIP__', None), ('__HCC__', None)]
extension = Extension(
f'torch_sparse._{name}_{suffix}',
sources,
include_dirs=[extensions_dir, HIPLIB1, HIPLIB2],
include_dirs=[extensions_dir, phmap_dir],
define_macros=define_macros,
undef_macros=undef_macros,
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
libraries=libraries,
......@@ -128,6 +139,11 @@ test_requires = [
'pytest-cov',
]
# work-around hipify abs paths
include_package_data = True
if torch.cuda.is_available() and torch.version.hip:
include_package_data = False
setup(
name='torch_sparse',
version=__version__,
......@@ -143,16 +159,15 @@ setup(
'sparse-matrices',
'autograd',
],
python_requires='>=3.7',
python_requires='>=3.8',
install_requires=install_requires,
extras_require={
'test': test_requires,
},
ext_modules=get_extensions() if not BUILD_DOCS else [],
cmdclass={
'build_ext':
BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
'build_ext': BuildExtension.with_options(no_python_abi_suffix=True)
},
packages=find_packages(),
include_package_data=False,
include_package_data=include_package_data,
)
from itertools import product
import pytest
import torch
from torch_sparse import SparseTensor, add
from torch_sparse.testing import devices, dtypes, tensor
@pytest.mark.parametrize('dtype,device', product(dtypes, devices))
def test_add(dtype, device):
rowA = torch.tensor([0, 0, 1, 2, 2], device=device)
colA = torch.tensor([0, 2, 1, 0, 1], device=device)
valueA = tensor([1, 2, 4, 1, 3], dtype, device)
A = SparseTensor(row=rowA, col=colA, value=valueA)
rowB = torch.tensor([0, 0, 1, 2, 2], device=device)
colB = torch.tensor([1, 2, 2, 1, 2], device=device)
valueB = tensor([2, 3, 1, 2, 4], dtype, device)
B = SparseTensor(row=rowB, col=colB, value=valueB)
C = A + B
rowC, colC, valueC = C.coo()
assert rowC.tolist() == [0, 0, 0, 1, 1, 2, 2, 2]
assert colC.tolist() == [0, 1, 2, 1, 2, 0, 1, 2]
assert valueC.tolist() == [1, 2, 5, 4, 1, 1, 5, 4]
@torch.jit.script
def jit_add(A: SparseTensor, B: SparseTensor) -> SparseTensor:
return add(A, B)
jit_add(A, B)
import pytest
import torch
from torch_sparse.cat import cat
from torch_sparse.tensor import SparseTensor
from torch_sparse.testing import devices, tensor
@pytest.mark.parametrize('device', devices)
def test_cat(device):
row, col = tensor([[0, 0, 1], [0, 1, 2]], torch.long, device)
mat1 = SparseTensor(row=row, col=col)
mat1.fill_cache_()
row, col = tensor([[0, 0, 1, 2], [0, 1, 1, 0]], torch.long, device)
mat2 = SparseTensor(row=row, col=col)
mat2.fill_cache_()
out = cat([mat1, mat2], dim=0)
assert out.to_dense().tolist() == [[1, 1, 0], [0, 0, 1], [1, 1, 0],
[0, 1, 0], [1, 0, 0]]
assert out.storage.has_row()
assert out.storage.has_rowptr()
assert out.storage.has_rowcount()
assert out.storage.num_cached_keys() == 1
out = cat([mat1, mat2], dim=1)
assert out.to_dense().tolist() == [[1, 1, 0, 1, 1], [0, 0, 1, 0, 1],
[0, 0, 0, 1, 0]]
assert out.storage.has_row()
assert not out.storage.has_rowptr()
assert out.storage.num_cached_keys() == 2
out = cat([mat1, mat2], dim=(0, 1))
assert out.to_dense().tolist() == [[1, 1, 0, 0, 0], [0, 0, 1, 0, 0],
[0, 0, 0, 1, 1], [0, 0, 0, 0, 1],
[0, 0, 0, 1, 0]]
assert out.storage.has_row()
assert out.storage.has_rowptr()
assert out.storage.num_cached_keys() == 5
value = torch.randn((mat1.nnz(), 4), device=device)
mat1 = mat1.set_value_(value, layout='coo')
out = cat([mat1, mat1], dim=-1)
assert out.storage.value().size() == (mat1.nnz(), 8)
assert out.storage.has_row()
assert out.storage.has_rowptr()
assert out.storage.num_cached_keys() == 5