support v2.1.0

0799bc08 · limm · 50e05e1e · 0799bc08 · 0799bc08 · 0799bc08
Commit 0799bc08 authored Jul 25, 2024 by limm
18 changed files
--- a/readthedocs.yml
+++ b/readthedocs.yml
+version: 2
+
+build:
+   image: latest
+
+python:
+   version: 3.8
+   system_packages: true
+   install:
+      - requirements: docs/requirements.txt
+      - method: setuptools
+        path: .
+
+formats: []
--- a/setup.cfg
+++ b/setup.cfg
 [metadata]
-description-file = README.md
+long_description=file: README.md
+long_description_content_type=text/markdown
+
+classifiers =
+    Development Status :: 5 - Production/Stable
+    License :: OSI Approved :: MIT License
+    Programming Language :: Python
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Programming Language :: Python :: 3 :: Only

 [aliases]
 test = pytest

 [tool:pytest]
-addopts = --capture=no --cov
-
-[egg_info]
-tag_build = 
-tag_date = 0
-
+addopts = --capture=no
--- a/setup.py
+++ b/setup.py
-import os
-import sys
 import glob
+import os
 import os.path as osp
+import platform
+import sys
 from itertools import product
-from setuptools import setup, find_packages

 import torch
+from setuptools import find_packages, setup
 from torch.__config__ import parallel_info
-from torch.utils.cpp_extension import BuildExtension
-from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
+from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension, CppExtension,
+                                       CUDAExtension)

-WITH_HIP = torch.cuda.is_available() and CUDA_HOME is not None
-suffices = ['cpu', 'cuda'] if WITH_HIP else ['cpu']
+__version__ = '2.1.0'
+URL = 'https://github.com/rusty1s/pytorch_scatter'
+
+WITH_CUDA = False
+if torch.cuda.is_available():
+    WITH_CUDA = CUDA_HOME is not None or torch.version.hip
+suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
 if os.getenv('FORCE_CUDA', '0') == '1':
    suffices = ['cuda', 'cpu']
-if os.getenv('FORCE_ONLY_HIP', '0') == '1':
-    suffices = ['hip']
+if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
+    suffices = ['cuda']
 if os.getenv('FORCE_ONLY_CPU', '0') == '1':
    suffices = ['cpu']
-ROCM_PATH = os.getenv('ROCM_PATH')
-HIPLIB = osp.join(ROCM_PATH, 'hipsparse', 'include')

 BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
+WITH_SYMBOLS = os.getenv('WITH_SYMBOLS', '0') == '1'


 def get_extensions():
@@ -29,11 +34,20 @@ def get_extensions():

    extensions_dir = osp.join('csrc')
    main_files = glob.glob(osp.join(extensions_dir, '*.cpp'))
+    # remove generated 'hip' files, in case of rebuilds
+    main_files = [path for path in main_files if 'hip' not in path]

    for main, suffix in product(main_files, suffices):
-        define_macros = []
-        extra_compile_args = {'cxx': ['-O2']}
-        extra_link_args = ['-s']
+        define_macros = [('WITH_PYTHON', None)]
+        undef_macros = []
+
+        if sys.platform == 'win32':
+            define_macros += [('torchscatter_EXPORTS', None)]
+
+        extra_compile_args = {'cxx': ['-O3']}
+        if not os.name == 'nt':  # Not on Windows:
+            extra_compile_args['cxx'] += ['-Wno-sign-compare']
+        extra_link_args = [] if WITH_SYMBOLS else ['-s']

        info = parallel_info()
        if ('backend: OpenMP' in info and 'OpenMP not found' not in info
@@ -46,12 +60,24 @@ def get_extensions():
        else:
            print('Compiling without OpenMP...')

-        if suffix == 'hip':
-            define_macros += [('WITH_HIP', None)]
-            hipcc_flags = os.getenv('HIPCC_FLAGS', '')
-            hipcc_flags = [] if hipcc_flags == '' else hipcc_flags.split(' ')
-            hipcc_flags += ['--expt-relaxed-constexpr', '-O2']
-            extra_compile_args['hipcc'] = hipcc_flags
+        # Compile for mac arm64
+        if (sys.platform == 'darwin' and platform.machine() == 'arm64'):
+            extra_compile_args['cxx'] += ['-arch', 'arm64']
+            extra_link_args += ['-arch', 'arm64']
+
+        if suffix == 'cuda':
+            define_macros += [('WITH_CUDA', None)]
+            nvcc_flags = os.getenv('NVCC_FLAGS', '')
+            nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
+            nvcc_flags += ['-O3']
+            if torch.version.hip:
+                # USE_ROCM was added to later versions of PyTorch.
+                # Define here to support older PyTorch versions as well:
+                define_macros += [('USE_ROCM', None)]
+                undef_macros += ['__HIP_NO_HALF_CONVERSIONS__']
+            else:
+                nvcc_flags += ['--expt-relaxed-constexpr']
+            extra_compile_args['nvcc'] = nvcc_flags

        name = main.split(os.sep)[-1][:-4]
        sources = [main]
@@ -60,17 +86,17 @@ def get_extensions():
        if osp.exists(path):
            sources += [path]

-        path = osp.join(extensions_dir, 'hip', f'{name}_hip.hip')
-        if suffix == 'hip' and osp.exists(path):
+        path = osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')
+        if suffix == 'cuda' and osp.exists(path):
            sources += [path]

        Extension = CppExtension if suffix == 'cpu' else CUDAExtension
-        define_macros += [('TORCH_HIP_VERSION', 10000), ('__HIP__', None), ('__HCC__', None)]
        extension = Extension(
            f'torch_scatter._{name}_{suffix}',
            sources,
-            include_dirs=[extensions_dir, HIPLIB],
+            include_dirs=[extensions_dir],
            define_macros=define_macros,
+            undef_macros=undef_macros,
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args,
        )
@@ -80,27 +106,36 @@ def get_extensions():


 install_requires = []
-setup_requires = []
-tests_require = ['pytest', 'pytest-runner', 'pytest-cov']
+
+test_requires = [
+    'pytest',
+    'pytest-cov',
+]
+
+# work-around hipify abs paths
+include_package_data = True
+if torch.cuda.is_available() and torch.version.hip:
+    include_package_data = False

 setup(
    name='torch_scatter',
-    version='2.0.9',
+    version=__version__,
+    description='PyTorch Extension Library of Optimized Scatter Operations',
    author='Matthias Fey',
    author_email='matthias.fey@tu-dortmund.de',
-    url='https://github.com/rusty1s/pytorch_scatter',
-    description='PyTorch Extension Library of Optimized Scatter Operations',
+    url=URL,
+    download_url=f'{URL}/archive/{__version__}.tar.gz',
    keywords=['pytorch', 'scatter', 'segment', 'gather'],
-    license='MIT',
-    python_requires='>=3.6',
+    python_requires='>=3.7',
    install_requires=install_requires,
-    setup_requires=setup_requires,
-    tests_require=tests_require,
-    extras_require={'test': tests_require},
+    extras_require={
+        'test': test_requires,
+    },
    ext_modules=get_extensions() if not BUILD_DOCS else [],
    cmdclass={
        'build_ext':
        BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
    },
    packages=find_packages(),
+    include_package_data=include_package_data,
 )
--- a/test/composite/test_logsumexp.py
+++ b/test/composite/test_logsumexp.py
+import torch
+from torch_scatter import scatter_logsumexp
+
+
+def test_logsumexp():
+    inputs = torch.tensor([
+        0.5, 0.5, 0.0, -2.1, 3.2, 7.0, -1.0, -100.0,
+        float('-inf'),
+        float('-inf'), 0.0
+    ])
+    inputs.requires_grad_()
+    index = torch.tensor([0, 0, 1, 1, 1, 2, 4, 4, 5, 6, 6])
+    splits = [2, 3, 1, 0, 2, 1, 2]
+
+    outputs = scatter_logsumexp(inputs, index)
+
+    for src, out in zip(inputs.split(splits), outputs.unbind()):
+        assert out.tolist() == torch.logsumexp(src, dim=0).tolist()
+
+    outputs.backward(torch.randn_like(outputs))
+
+    jit = torch.jit.script(scatter_logsumexp)
+    assert jit(inputs, index).tolist() == outputs.tolist()
--- a/test/composite/test_softmax.py
+++ b/test/composite/test_softmax.py
+import torch
+from torch_scatter import scatter_log_softmax, scatter_softmax
+
+
+def test_softmax():
+    src = torch.tensor([0.2, 0, 0.2, -2.1, 3.2, 7, -1, float('-inf')])
+    src.requires_grad_()
+    index = torch.tensor([0, 1, 0, 1, 1, 2, 4, 4])
+
+    out = scatter_softmax(src, index)
+
+    out0 = torch.softmax(torch.tensor([0.2, 0.2]), dim=-1)
+    out1 = torch.softmax(torch.tensor([0, -2.1, 3.2]), dim=-1)
+    out2 = torch.softmax(torch.tensor([7], dtype=torch.float), dim=-1)
+    out4 = torch.softmax(torch.tensor([-1, float('-inf')]), dim=-1)
+
+    expected = torch.stack([
+        out0[0], out1[0], out0[1], out1[1], out1[2], out2[0], out4[0], out4[1]
+    ], dim=0)
+
+    assert torch.allclose(out, expected)
+
+    out.backward(torch.randn_like(out))
+
+    jit = torch.jit.script(scatter_softmax)
+    assert jit(src, index).tolist() == out.tolist()
+
+
+def test_log_softmax():
+    src = torch.tensor([0.2, 0, 0.2, -2.1, 3.2, 7, -1, float('-inf')])
+    src.requires_grad_()
+    index = torch.tensor([0, 1, 0, 1, 1, 2, 4, 4])
+
+    out = scatter_log_softmax(src, index)
+
+    out0 = torch.log_softmax(torch.tensor([0.2, 0.2]), dim=-1)
+    out1 = torch.log_softmax(torch.tensor([0, -2.1, 3.2]), dim=-1)
+    out2 = torch.log_softmax(torch.tensor([7], dtype=torch.float), dim=-1)
+    out4 = torch.log_softmax(torch.tensor([-1, float('-inf')]), dim=-1)
+
+    expected = torch.stack([
+        out0[0], out1[0], out0[1], out1[1], out1[2], out2[0], out4[0], out4[1]
+    ], dim=0)
+
+    assert torch.allclose(out, expected)
+
+    out.backward(torch.randn_like(out))
+
+    jit = torch.jit.script(scatter_log_softmax)
+    assert jit(src, index).tolist() == out.tolist()
--- a/test/composite/test_std.py
+++ b/test/composite/test_std.py
+import torch
+from torch_scatter import scatter_std
+
+
+def test_std():
+    src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]], dtype=torch.float)
+    src.requires_grad_()
+    index = torch.tensor([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=torch.long)
+
+    out = scatter_std(src, index, dim=-1, unbiased=True)
+    std = src.std(dim=-1, unbiased=True)[0]
+    expected = torch.tensor([[std, 0], [0, std]])
+    assert torch.allclose(out, expected)
+
+    out.backward(torch.randn_like(out))
+
+    jit = torch.jit.script(scatter_std)
+    assert jit(src, index, dim=-1, unbiased=True).tolist() == out.tolist()
--- a/test/test_broadcasting.py
+++ b/test/test_broadcasting.py
+from itertools import product
+
+import pytest
+import torch
+from torch_scatter import scatter
+from torch_scatter.testing import devices, reductions
+
+
+@pytest.mark.parametrize('reduce,device', product(reductions, devices))
+def test_broadcasting(reduce, device):
+    B, C, H, W = (4, 3, 8, 8)
+
+    src = torch.randn((B, C, H, W), device=device)
+    index = torch.randint(0, H, (H, )).to(device, torch.long)
+    out = scatter(src, index, dim=2, dim_size=H, reduce=reduce)
+    assert out.size() == (B, C, H, W)
+
+    src = torch.randn((B, C, H, W), device=device)
+    index = torch.randint(0, H, (B, 1, H, W)).to(device, torch.long)
+    out = scatter(src, index, dim=2, dim_size=H, reduce=reduce)
+    assert out.size() == (B, C, H, W)
+
+    src = torch.randn((B, C, H, W), device=device)
+    index = torch.randint(0, H, (H, )).to(device, torch.long)
+    out = scatter(src, index, dim=2, dim_size=H, reduce=reduce)
+    assert out.size() == (B, C, H, W)
--- a/test/test_gather.py
+++ b/test/test_gather.py
+from itertools import product
+
+import pytest
+import torch
+from torch.autograd import gradcheck
+from torch_scatter import gather_coo, gather_csr
+from torch_scatter.testing import devices, dtypes, tensor
+
+tests = [
+    {
+        'src': [1, 2, 3, 4],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'expected': [1, 1, 2, 2, 2, 4],
+    },
+    {
+        'src': [[1, 2], [3, 4], [5, 6], [7, 8]],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'expected': [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4], [7, 8]]
+    },
+    {
+        'src': [[1, 3, 5, 7], [2, 4, 6, 8]],
+        'index': [[0, 0, 1, 1, 1, 3], [0, 0, 0, 1, 1, 2]],
+        'indptr': [[0, 2, 5, 5, 6], [0, 3, 5, 6, 6]],
+        'expected': [[1, 1, 3, 3, 3, 7], [2, 2, 2, 4, 4, 6]],
+    },
+    {
+        'src': [[[1, 2], [3, 4], [5, 6]], [[7, 9], [10, 11], [12, 13]]],
+        'index': [[0, 0, 1], [0, 2, 2]],
+        'indptr': [[0, 2, 3, 3], [0, 1, 1, 3]],
+        'expected': [[[1, 2], [1, 2], [3, 4]], [[7, 9], [12, 13], [12, 13]]],
+    },
+    {
+        'src': [[1], [2]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'expected': [[1, 1], [2, 2]],
+    },
+    {
+        'src': [[[1, 1]], [[2, 2]]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'expected': [[[1, 1], [1, 1]], [[2, 2], [2, 2]]],
+    },
+]
+
+
+@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
+def test_forward(test, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test['expected'], dtype, device)
+
+    out = gather_csr(src, indptr)
+    assert torch.all(out == expected)
+
+    out = gather_coo(src, index)
+    assert torch.all(out == expected)
+
+
+@pytest.mark.parametrize('test,device', product(tests, devices))
+def test_backward(test, device):
+    src = tensor(test['src'], torch.double, device)
+    src.requires_grad_()
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+
+    assert gradcheck(gather_csr, (src, indptr, None)) is True
+    assert gradcheck(gather_coo, (src, index, None)) is True
+
+
+@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
+def test_out(test, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test['expected'], dtype, device)
+
+    size = list(src.size())
+    size[index.dim() - 1] = index.size(-1)
+    out = src.new_full(size, -2)
+
+    gather_csr(src, indptr, out)
+    assert torch.all(out == expected)
+
+    out.fill_(-2)
+
+    gather_coo(src, index, out)
+    assert torch.all(out == expected)
+
+
+@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
+def test_non_contiguous(test, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test['expected'], dtype, device)
+
+    if src.dim() > 1:
+        src = src.transpose(0, 1).contiguous().transpose(0, 1)
+    if index.dim() > 1:
+        index = index.transpose(0, 1).contiguous().transpose(0, 1)
+    if indptr.dim() > 1:
+        indptr = indptr.transpose(0, 1).contiguous().transpose(0, 1)
+
+    out = gather_csr(src, indptr)
+    assert torch.all(out == expected)
+
+    out = gather_coo(src, index)
+    assert torch.all(out == expected)
--- a/test/test_multi_gpu.py
+++ b/test/test_multi_gpu.py
+from itertools import product
+
+import pytest
+import torch
+import torch_scatter
+from torch_scatter.testing import dtypes, reductions, tensor
+
+tests = [
+    {
+        'src': [1, 2, 3, 4, 5, 6],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'dim': 0,
+        'sum': [3, 12, 0, 6],
+        'add': [3, 12, 0, 6],
+        'mean': [1.5, 4, 0, 6],
+        'min': [1, 3, 0, 6],
+        'max': [2, 5, 0, 6],
+    },
+]
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason='CUDA not available')
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason='No multiple GPUS')
+@pytest.mark.parametrize('test,reduce,dtype', product(tests, reductions,
+                                                      dtypes))
+def test_forward(test, reduce, dtype):
+    device = torch.device('cuda:1')
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    out = torch_scatter.scatter(src, index, dim, reduce=reduce)
+    assert torch.all(out == expected)
+
+    out = torch_scatter.segment_coo(src, index, reduce=reduce)
+    assert torch.all(out == expected)
+
+    out = torch_scatter.segment_csr(src, indptr, reduce=reduce)
+    assert torch.all(out == expected)
--- a/test/test_scatter.py
+++ b/test/test_scatter.py
+from itertools import product
+
+import pytest
+import torch
+import torch_scatter
+from torch.autograd import gradcheck
+from torch_scatter.testing import devices, dtypes, reductions, tensor
+
+reductions = reductions + ['mul']
+
+tests = [
+    {
+        'src': [1, 3, 2, 4, 5, 6],
+        'index': [0, 1, 0, 1, 1, 3],
+        'dim': -1,
+        'sum': [3, 12, 0, 6],
+        'add': [3, 12, 0, 6],
+        'mul': [2, 60, 1, 6],
+        'mean': [1.5, 4, 0, 6],
+        'min': [1, 3, 0, 6],
+        'arg_min': [0, 1, 6, 5],
+        'max': [2, 5, 0, 6],
+        'arg_max': [2, 4, 6, 5],
+    },
+    {
+        'src': [[1, 2], [5, 6], [3, 4], [7, 8], [9, 10], [11, 12]],
+        'index': [0, 1, 0, 1, 1, 3],
+        'dim': 0,
+        'sum': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'add': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'mul': [[1 * 3, 2 * 4], [5 * 7 * 9, 6 * 8 * 10], [1, 1], [11, 12]],
+        'mean': [[2, 3], [7, 8], [0, 0], [11, 12]],
+        'min': [[1, 2], [5, 6], [0, 0], [11, 12]],
+        'arg_min': [[0, 0], [1, 1], [6, 6], [5, 5]],
+        'max': [[3, 4], [9, 10], [0, 0], [11, 12]],
+        'arg_max': [[2, 2], [4, 4], [6, 6], [5, 5]],
+    },
+    {
+        'src': [[1, 5, 3, 7, 9, 11], [2, 4, 8, 6, 10, 12]],
+        'index': [[0, 1, 0, 1, 1, 3], [0, 0, 1, 0, 1, 2]],
+        'dim': 1,
+        'sum': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'add': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'mul': [[1 * 3, 5 * 7 * 9, 1, 11], [2 * 4 * 6, 8 * 10, 12, 1]],
+        'mean': [[2, 7, 0, 11], [4, 9, 12, 0]],
+        'min': [[1, 5, 0, 11], [2, 8, 12, 0]],
+        'arg_min': [[0, 1, 6, 5], [0, 2, 5, 6]],
+        'max': [[3, 9, 0, 11], [6, 10, 12, 0]],
+        'arg_max': [[2, 4, 6, 5], [3, 4, 5, 6]],
+    },
+    {
+        'src': [[[1, 2], [5, 6], [3, 4]], [[10, 11], [7, 9], [12, 13]]],
+        'index': [[0, 1, 0], [2, 0, 2]],
+        'dim': 1,
+        'sum': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'add': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'mul': [[[3, 8], [5, 6], [1, 1]], [[7, 9], [1, 1], [120, 11 * 13]]],
+        'mean': [[[2, 3], [5, 6], [0, 0]], [[7, 9], [0, 0], [11, 12]]],
+        'min': [[[1, 2], [5, 6], [0, 0]], [[7, 9], [0, 0], [10, 11]]],
+        'arg_min': [[[0, 0], [1, 1], [3, 3]], [[1, 1], [3, 3], [0, 0]]],
+        'max': [[[3, 4], [5, 6], [0, 0]], [[7, 9], [0, 0], [12, 13]]],
+        'arg_max': [[[2, 2], [1, 1], [3, 3]], [[1, 1], [3, 3], [2, 2]]],
+    },
+    {
+        'src': [[1, 3], [2, 4]],
+        'index': [[0, 0], [0, 0]],
+        'dim': 1,
+        'sum': [[4], [6]],
+        'add': [[4], [6]],
+        'mul': [[3], [8]],
+        'mean': [[2], [3]],
+        'min': [[1], [2]],
+        'arg_min': [[0], [0]],
+        'max': [[3], [4]],
+        'arg_max': [[1], [1]],
+    },
+    {
+        'src': [[[1, 1], [3, 3]], [[2, 2], [4, 4]]],
+        'index': [[0, 0], [0, 0]],
+        'dim': 1,
+        'sum': [[[4, 4]], [[6, 6]]],
+        'add': [[[4, 4]], [[6, 6]]],
+        'mul': [[[3, 3]], [[8, 8]]],
+        'mean': [[[2, 2]], [[3, 3]]],
+        'min': [[[1, 1]], [[2, 2]]],
+        'arg_min': [[[0, 0]], [[0, 0]]],
+        'max': [[[3, 3]], [[4, 4]]],
+        'arg_max': [[[1, 1]], [[1, 1]]],
+    },
+]
+
+
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_forward(test, reduce, dtype, device):
+    """Check eager and TorchScript `scatter_<reduce>` against `tests`.
+
+    The function `torch_scatter.scatter_<reduce>` is called both directly
+    and through `torch.jit.script`. Both results must agree with each
+    other, and the eager result must equal `test[reduce]`. When the
+    function returns a tuple, its second element is additionally compared
+    against `test['arg_' + reduce]`.
+    """
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    fn = getattr(torch_scatter, 'scatter_' + reduce)
+    jit = torch.jit.script(fn)
+    out1 = fn(src, index, dim)
+    out2 = jit(src, index, dim)
+    if isinstance(out1, tuple):
+        out1, arg_out1 = out1
+        out2, arg_out2 = out2
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out1 == arg_expected)
+        # Compare eager vs. scripted arg output (previously compared
+        # `arg_out1` with itself, which could never fail).
+        assert arg_out1.tolist() == arg_out2.tolist()
+    assert torch.all(out1 == expected)
+    assert out1.tolist() == out2.tolist()
+
+
+@pytest.mark.parametrize('test,reduce,device',
+                         product(tests, reductions, devices))
+def test_backward(test, reduce, device):
+    src = tensor(test['src'], torch.double, device)
+    src.requires_grad_()
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+
+    assert gradcheck(torch_scatter.scatter,
+                     (src, index, dim, None, None, reduce))
+
+
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_out(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    out = torch.full_like(expected, -2)
+
+    getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim, out)
+
+    if reduce == 'sum' or reduce == 'add':
+        expected = expected - 2
+    elif reduce == 'mul':
+        expected = out  # We can not really test this here.
+    elif reduce == 'mean':
+        expected = out  # We can not really test this here.
+    elif reduce == 'min':
+        expected = expected.fill_(-2)
+    elif reduce == 'max':
+        expected[expected == 0] = -2
+    else:
+        raise ValueError
+
+    assert torch.all(out == expected)
+
+
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_non_contiguous(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    if src.dim() > 1:
+        src = src.transpose(0, 1).contiguous().transpose(0, 1)
+    if index.dim() > 1:
+        index = index.transpose(0, 1).contiguous().transpose(0, 1)
+
+    out = getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim)
+    if isinstance(out, tuple):
+        out, arg_out = out
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out == arg_expected)
+    assert torch.all(out == expected)
--- a/test/test_segment.py
+++ b/test/test_segment.py
+from itertools import product
+
+import pytest
+import torch
+import torch_scatter
+from torch.autograd import gradcheck
+from torch_scatter.testing import devices, dtypes, reductions, tensor
+
+tests = [
+    {
+        'src': [1, 2, 3, 4, 5, 6],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'sum': [3, 12, 0, 6],
+        'add': [3, 12, 0, 6],
+        'mean': [1.5, 4, 0, 6],
+        'min': [1, 3, 0, 6],
+        'arg_min': [0, 2, 6, 5],
+        'max': [2, 5, 0, 6],
+        'arg_max': [1, 4, 6, 5],
+    },
+    {
+        'src': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'sum': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'add': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'mean': [[2, 3], [7, 8], [0, 0], [11, 12]],
+        'min': [[1, 2], [5, 6], [0, 0], [11, 12]],
+        'arg_min': [[0, 0], [2, 2], [6, 6], [5, 5]],
+        'max': [[3, 4], [9, 10], [0, 0], [11, 12]],
+        'arg_max': [[1, 1], [4, 4], [6, 6], [5, 5]],
+    },
+    {
+        'src': [[1, 3, 5, 7, 9, 11], [2, 4, 6, 8, 10, 12]],
+        'index': [[0, 0, 1, 1, 1, 3], [0, 0, 0, 1, 1, 2]],
+        'indptr': [[0, 2, 5, 5, 6], [0, 3, 5, 6, 6]],
+        'sum': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'add': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'mean': [[2, 7, 0, 11], [4, 9, 12, 0]],
+        'min': [[1, 5, 0, 11], [2, 8, 12, 0]],
+        'arg_min': [[0, 2, 6, 5], [0, 3, 5, 6]],
+        'max': [[3, 9, 0, 11], [6, 10, 12, 0]],
+        'arg_max': [[1, 4, 6, 5], [2, 4, 5, 6]],
+    },
+    {
+        'src': [[[1, 2], [3, 4], [5, 6]], [[7, 9], [10, 11], [12, 13]]],
+        'index': [[0, 0, 1], [0, 2, 2]],
+        'indptr': [[0, 2, 3, 3], [0, 1, 1, 3]],
+        'sum': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'add': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'mean': [[[2, 3], [5, 6], [0, 0]], [[7, 9], [0, 0], [11, 12]]],
+        'min': [[[1, 2], [5, 6], [0, 0]], [[7, 9], [0, 0], [10, 11]]],
+        'arg_min': [[[0, 0], [2, 2], [3, 3]], [[0, 0], [3, 3], [1, 1]]],
+        'max': [[[3, 4], [5, 6], [0, 0]], [[7, 9], [0, 0], [12, 13]]],
+        'arg_max': [[[1, 1], [2, 2], [3, 3]], [[0, 0], [3, 3], [2, 2]]],
+    },
+    {
+        'src': [[1, 3], [2, 4]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'sum': [[4], [6]],
+        'add': [[4], [6]],
+        'mean': [[2], [3]],
+        'min': [[1], [2]],
+        'arg_min': [[0], [0]],
+        'max': [[3], [4]],
+        'arg_max': [[1], [1]],
+    },
+    {
+        'src': [[[1, 1], [3, 3]], [[2, 2], [4, 4]]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'sum': [[[4, 4]], [[6, 6]]],
+        'add': [[[4, 4]], [[6, 6]]],
+        'mean': [[[2, 2]], [[3, 3]]],
+        'min': [[[1, 1]], [[2, 2]]],
+        'arg_min': [[[0, 0]], [[0, 0]]],
+        'max': [[[3, 3]], [[4, 4]]],
+        'arg_max': [[[1, 1]], [[1, 1]]],
+    },
+]
+
+
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_forward(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test[reduce], dtype, device)
+
+    fn = getattr(torch_scatter, 'segment_' + reduce + '_csr')
+    jit = torch.jit.script(fn)
+    out1 = fn(src, indptr)
+    out2 = jit(src, indptr)
+    if isinstance(out1, tuple):
+        out1, arg_out1 = out1
+        out2, arg_out2 = out2
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out1 == arg_expected)
+        assert arg_out1.tolist() == arg_out2.tolist()
+    assert torch.all(out1 == expected)
+    assert out1.tolist() == out2.tolist()
+
+    fn = getattr(torch_scatter, 'segment_' + reduce + '_coo')
+    jit = torch.jit.script(fn)
+    out1 = fn(src, index)
+    out2 = jit(src, index)
+    if isinstance(out1, tuple):
+        out1, arg_out1 = out1
+        out2, arg_out2 = out2
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out1 == arg_expected)
+        assert arg_out1.tolist() == arg_out2.tolist()
+    assert torch.all(out1 == expected)
+    assert out1.tolist() == out2.tolist()
+
+
+@pytest.mark.parametrize('test,reduce,device',
+                         product(tests, reductions, devices))
+def test_backward(test, reduce, device):
+    src = tensor(test['src'], torch.double, device)
+    src.requires_grad_()
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+
+    assert gradcheck(torch_scatter.segment_csr, (src, indptr, None, reduce))
+    assert gradcheck(torch_scatter.segment_coo,
+                     (src, index, None, None, reduce))
+
+
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_out(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test[reduce], dtype, device)
+
+    out = torch.full_like(expected, -2)
+
+    getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr, out)
+    assert torch.all(out == expected)
+
+    out.fill_(-2)
+
+    getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index, out)
+
+    if reduce == 'sum' or reduce == 'add':
+        expected = expected - 2
+    elif reduce == 'mean':
+        expected = out  # We can not really test this here.
+    elif reduce == 'min':
+        expected = expected.fill_(-2)
+    elif reduce == 'max':
+        expected[expected == 0] = -2
+    else:
+        raise ValueError
+
+    assert torch.all(out == expected)
+
+
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_non_contiguous(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test[reduce], dtype, device)
+
+    if src.dim() > 1:
+        src = src.transpose(0, 1).contiguous().transpose(0, 1)
+    if index.dim() > 1:
+        index = index.transpose(0, 1).contiguous().transpose(0, 1)
+    if indptr.dim() > 1:
+        indptr = indptr.transpose(0, 1).contiguous().transpose(0, 1)
+
+    out = getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr)
+    if isinstance(out, tuple):
+        out, arg_out = out
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out == arg_expected)
+    assert torch.all(out == expected)
+
+    out = getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index)
+    if isinstance(out, tuple):
+        out, arg_out = out
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out == arg_expected)
+    assert torch.all(out == expected)
--- a/test/test_zero_tensors.py
+++ b/test/test_zero_tensors.py
+from itertools import product
+
+import pytest
+import torch
+from torch_scatter import (gather_coo, gather_csr, scatter, segment_coo,
+                           segment_csr)
+from torch_scatter.testing import devices, grad_dtypes, reductions, tensor
+
+
+@pytest.mark.parametrize('reduce,dtype,device',
+                         product(reductions, grad_dtypes, devices))
+def test_zero_elements(reduce, dtype, device):
+    x = torch.randn(0, 0, 0, 16, dtype=dtype, device=device,
+                    requires_grad=True)
+    index = tensor([], torch.long, device)
+    indptr = tensor([], torch.long, device)
+
+    out = scatter(x, index, dim=0, dim_size=0, reduce=reduce)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = segment_coo(x, index, dim_size=0, reduce=reduce)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = gather_coo(x, index)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = segment_csr(x, indptr, reduce=reduce)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = gather_csr(x, indptr)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
--- a/torch_scatter.egg-info/PKG-INFO
+++ b/torch_scatter.egg-info/PKG-INFO
 Metadata-Version: 2.1
 Name: torch-scatter
-Version: 2.0.9
+Version: 2.1.2
 Summary: PyTorch Extension Library of Optimized Scatter Operations
 Home-page: https://github.com/rusty1s/pytorch_scatter
 Author: Matthias Fey
 Author-email: matthias.fey@tu-dortmund.de
-License: MIT
+License: UNKNOWN
+Download-URL: https://github.com/rusty1s/pytorch_scatter/archive/2.1.2.tar.gz
+Description: [pypi-image]: https://badge.fury.io/py/torch-scatter.svg
+        [pypi-url]: https://pypi.python.org/pypi/torch-scatter
+        [testing-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml/badge.svg
+        [testing-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml
+        [linting-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml/badge.svg
+        [linting-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml
+        [docs-image]: https://readthedocs.org/projects/pytorch-scatter/badge/?version=latest
+        [docs-url]: https://pytorch-scatter.readthedocs.io/en/latest/?badge=latest
+        [coverage-image]: https://codecov.io/gh/rusty1s/pytorch_scatter/branch/master/graph/badge.svg
+        [coverage-url]: https://codecov.io/github/rusty1s/pytorch_scatter?branch=master
+        
+        # PyTorch Scatter
+        
+        [![PyPI Version][pypi-image]][pypi-url]
+        [![Testing Status][testing-image]][testing-url]
+        [![Linting Status][linting-image]][linting-url]
+        [![Docs Status][docs-image]][docs-url]
+        [![Code Coverage][coverage-image]][coverage-url]
+        
+        <p align="center">
+          <img width="50%" src="https://raw.githubusercontent.com/rusty1s/pytorch_scatter/master/docs/source/_figures/add.svg?sanitize=true" />
+        </p>
+        
+        --------------------------------------------------------------------------------
+        
+        **[Documentation](https://pytorch-scatter.readthedocs.io)**
+        
+        This package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for the use in [PyTorch](http://pytorch.org/), which are missing in the main package.
+        Scatter and segment operations can be roughly described as reduce operations based on a given "group-index" tensor.
+        Segment operations require the "group-index" tensor to be sorted, whereas scatter operations are not subject to these requirements.
+        
+        The package consists of the following operations with reduction types `"sum"|"mean"|"min"|"max"`:
+        
+        * [**scatter**](https://pytorch-scatter.readthedocs.io/en/latest/functions/scatter.html) based on arbitrary indices
+        * [**segment_coo**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_coo.html) based on sorted indices
+        * [**segment_csr**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_csr.html) based on compressed indices via pointers
+        
+        In addition, we provide the following **composite functions** which make use of `scatter_*` operations under the hood: `scatter_std`, `scatter_logsumexp`, `scatter_softmax` and `scatter_log_softmax`.
+        
+        All included operations are broadcastable, work on varying data types, are implemented both for CPU and GPU with corresponding backward implementations, and are fully traceable.
+        
+        ## Installation
+        
+        ### Anaconda
+        
+        **Update:** You can now install `pytorch-scatter` via [Anaconda](https://anaconda.org/pyg/pytorch-scatter) for all major OS/PyTorch/CUDA combinations 🤗
+        Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
+        
+        ```
+        conda install pytorch-scatter -c pyg
+        ```
+        
+        ### Binaries
+        
+        We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
+        
+        #### PyTorch 2.2
+        
+        To install the binaries for PyTorch 2.2.0, simply run
+        
+        ```
+        pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+${CUDA}.html
+        ```
+        
+        where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
+        
+        |             | `cpu` | `cu118` | `cu121` |
+        |-------------|-------|---------|---------|
+        | **Linux**   | ✅    | ✅      | ✅      |
+        | **Windows** | ✅    | ✅      | ✅      |
+        | **macOS**   | ✅    |         |         |
+        
+        #### PyTorch 2.1
+        
+        To install the binaries for PyTorch 2.1.0, simply run
+        
+        ```
+        pip install torch-scatter -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
+        ```
+        
+        where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
+        
+        |             | `cpu` | `cu118` | `cu121` |
+        |-------------|-------|---------|---------|
+        | **Linux**   | ✅    | ✅      | ✅      |
+        | **Windows** | ✅    | ✅      | ✅      |
+        | **macOS**   | ✅    |         |         |
+        
+        **Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0, PyTorch 1.12.0/1.12.1, PyTorch 1.13.0/1.13.1, and PyTorch 2.0.0 (following the same procedure).
+        For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
+        You can look up the latest supported version number [here](https://data.pyg.org/whl).
+        
+        ### From source
+        
+        Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
+        
+        ```
+        $ python -c "import torch; print(torch.__version__)"
+        >>> 1.4.0
+        
+        $ echo $PATH
+        >>> /usr/local/cuda/bin:...
+        
+        $ echo $CPATH
+        >>> /usr/local/cuda/include:...
+        ```
+        
+        Then run:
+        
+        ```
+        pip install torch-scatter
+        ```
+        
+        When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
+        In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
+        
+        ```
+        export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
+        ```
+        
+        ## Example
+        
+        ```py
+        import torch
+        from torch_scatter import scatter_max
+        
+        src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
+        index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
+        
+        out, argmax = scatter_max(src, index, dim=-1)
+        ```
+        
+        ```
+        print(out)
+        tensor([[0, 0, 4, 3, 2, 0],
+                [2, 4, 3, 0, 0, 0]])
+        
+        print(argmax)
+        tensor([[5, 5, 3, 4, 0, 1],
+                [1, 4, 3, 5, 5, 5]])
+        ```
+        
+        ## Running tests
+        
+        ```
+        pytest
+        ```
+        
+        ## C++ API
+        
+        `torch-scatter` also offers a C++ API that contains the C++ equivalents of the Python models.
+        For this, we need to add `TorchLib` to the `-DCMAKE_PREFIX_PATH` (*e.g.*, it may exist in `{CONDA}/lib/python{X.X}/site-packages/torch` if installed via `conda`):
+        
+        ```
+        mkdir build
+        cd build
+        # Add -DWITH_CUDA=on for CUDA support
+        cmake -DCMAKE_PREFIX_PATH="..." ..
+        make
+        make install
+        ```
+        
 Keywords: pytorch,scatter,segment,gather
-Requires-Python: >=3.6
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
 Provides-Extra: test
-License-File: LICENSE
--- a/torch_scatter.egg-info/SOURCES.txt
+++ b/torch_scatter.egg-info/SOURCES.txt
@@ -3,22 +3,18 @@ MANIFEST.in
 README.md
 setup.cfg
 setup.py
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/scatter.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/segment_coo.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/segment_csr.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/version.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/cpu/scatter_cpu.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/cpu/segment_coo_cpu.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/cpu/segment_csr_cpu.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/hip/scatter_hip_hip.hip
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/hip/segment_coo_hip_hip.hip
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/hip/segment_csr_hip_hip.hip
+csrc/extensions.h
+csrc/macros.h
 csrc/scatter.cpp
 csrc/scatter.h
+csrc/scatter_hip.cpp
 csrc/segment_coo.cpp
+csrc/segment_coo_hip.cpp
 csrc/segment_csr.cpp
+csrc/segment_csr_hip.cpp
 csrc/utils.h
 csrc/version.cpp
+csrc/version_hip.cpp
 csrc/cpu/index_info.h
 csrc/cpu/reducer.h
 csrc/cpu/scatter_cpu.cpp
@@ -28,24 +24,32 @@ csrc/cpu/segment_coo_cpu.h
 csrc/cpu/segment_csr_cpu.cpp
 csrc/cpu/segment_csr_cpu.h
 csrc/cpu/utils.h
+csrc/cuda/atomics.cuh
+csrc/cuda/index_info.cuh
+csrc/cuda/reducer.cuh
+csrc/cuda/scatter_cuda.cu
+csrc/cuda/scatter_cuda.h
+csrc/cuda/segment_coo_cuda.cu
+csrc/cuda/segment_coo_cuda.h
+csrc/cuda/segment_csr_cuda.cu
+csrc/cuda/segment_csr_cuda.h
+csrc/cuda/utils.cuh
 csrc/hip/atomics.cuh
 csrc/hip/index_info.cuh
 csrc/hip/reducer.cuh
-csrc/hip/scatter_hip.h
-csrc/hip/scatter_hip.hip
-csrc/hip/scatter_hip_hip.hip
-csrc/hip/segment_coo_hip.h
-csrc/hip/segment_coo_hip.hip
-csrc/hip/segment_coo_hip_hip.hip
-csrc/hip/segment_csr_hip.h
-csrc/hip/segment_csr_hip.hip
-csrc/hip/segment_csr_hip_hip.hip
+csrc/hip/scatter_cuda.h
+csrc/hip/scatter_cuda.hip
+csrc/hip/segment_coo_cuda.h
+csrc/hip/segment_coo_cuda.hip
+csrc/hip/segment_csr_cuda.h
+csrc/hip/segment_csr_cuda.hip
 csrc/hip/utils.cuh
 torch_scatter/__init__.py
 torch_scatter/placeholder.py
 torch_scatter/scatter.py
 torch_scatter/segment_coo.py
 torch_scatter/segment_csr.py
+torch_scatter/testing.py
 torch_scatter/utils.py
 torch_scatter.egg-info/PKG-INFO
 torch_scatter.egg-info/SOURCES.txt

--- a/torch_scatter.egg-info/requires.txt
+++ b/torch_scatter.egg-info/requires.txt

 [test]
 pytest
-pytest-runner
 pytest-cov
--- a/torch_scatter/__init__.py
+++ b/torch_scatter/__init__.py
@@ -4,14 +4,14 @@ import os.path as osp

 import torch

-__version__ = '2.0.9'
+__version__ = '2.1.0'

 for library in ['_version', '_scatter', '_segment_csr', '_segment_coo']:
-    hip_spec = importlib.machinery.PathFinder().find_spec(
-        f'{library}_hip', [osp.dirname(__file__)])
+    cuda_spec = importlib.machinery.PathFinder().find_spec(
+        f'{library}_cuda', [osp.dirname(__file__)])
    cpu_spec = importlib.machinery.PathFinder().find_spec(
        f'{library}_cpu', [osp.dirname(__file__)])
-    spec = hip_spec or cpu_spec
+    spec = cuda_spec or cpu_spec
    if spec is not None:
        torch.ops.load_library(spec.origin)
    elif os.getenv('BUILD_DOCS', '0') != '1':  # pragma: no cover
@@ -47,11 +47,22 @@ for library in ['_version', '_scatter', '_segment_csr', '_segment_coo']:
        torch.ops.torch_scatter.gather_coo = gather_coo_placeholder

 cuda_version = torch.ops.torch_scatter.cuda_version()
-if torch.cuda.is_available() and cuda_version != -1:  # pragma: no cover
+is_not_hip = torch.version.hip is None
+is_cuda = torch.version.cuda is not None
+if is_not_hip and is_cuda and cuda_version != -1:  # pragma: no cover
    if cuda_version < 10000:
        major, minor = int(str(cuda_version)[0]), int(str(cuda_version)[2])
    else:
        major, minor = int(str(cuda_version)[0:2]), int(str(cuda_version)[3])
+    t_major, t_minor = [int(x) for x in torch.version.cuda.split('.')]
+
+    if t_major != major:
+        raise RuntimeError(
+            f'Detected that PyTorch and torch_scatter were compiled with '
+            f'different CUDA versions. PyTorch has CUDA version '
+            f'{t_major}.{t_minor} and torch_scatter has CUDA version '
+            f'{major}.{minor}. Please reinstall the torch_scatter that '
+            f'matches your PyTorch install.')

 from .scatter import scatter_sum, scatter_add, scatter_mul  # noqa
 from .scatter import scatter_mean, scatter_min, scatter_max, scatter  # noqa

--- a/torch_scatter/testing.py
+++ b/torch_scatter/testing.py
+from typing import Any
+
+import torch
+
+# Reduction names that parametrized tests iterate over.
+reductions = ['sum', 'add', 'mean', 'min', 'max']
+
+# Floating-point (half, bfloat16, float, double) and integer (int, long)
+# dtypes.
+dtypes = [
+    torch.half, torch.bfloat16, torch.float, torch.double, torch.int,
+    torch.long
+]
+# Dtypes used by test_zero_elements for inputs created with
+# requires_grad=True.
+grad_dtypes = [torch.float, torch.double]
+
+# The CPU device is always included; 'cuda:0' is appended only when
+# torch.cuda.is_available() returns True.
+devices = [torch.device('cpu')]
+if torch.cuda.is_available():
+    devices += [torch.device('cuda:0')]
+
+
+def tensor(x: Any, dtype: torch.dtype, device: torch.device):
+    """Create a tensor from ``x`` on ``device`` and convert it to ``dtype``.
+
+    Args:
+        x: Data passed to ``torch.tensor``, or ``None``.
+        dtype: Dtype the created tensor is converted to via ``.to(dtype)``.
+        device: Device the tensor is created on.
+
+    Returns:
+        ``None`` when ``x`` is ``None``; otherwise the converted tensor.
+    """
+    return None if x is None else torch.tensor(x, device=device).to(dtype)
--- a/torch_scatter/utils.py
+++ b/torch_scatter/utils.py
@@ -9,5 +9,5 @@ def broadcast(src: torch.Tensor, other: torch.Tensor, dim: int):
            src = src.unsqueeze(0)
    for _ in range(src.dim(), other.dim()):
        src = src.unsqueeze(-1)
-    src = src.expand_as(other)
+    src = src.expand(other.size())
    return src