push 2.0.9 version

c7c514c2 · yangzhong · cf967b1f · c7c514c2 · c7c514c2 · c7c514c2
Commit c7c514c2 authored Jan 22, 2024 by yangzhong
17 changed files
--- a/setup.py
+++ b/setup.py
@@ -10,16 +10,14 @@ from torch.__config__ import parallel_info
 from torch.utils.cpp_extension import BuildExtension
 from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME

-WITH_HIP = torch.cuda.is_available() and CUDA_HOME is not None
-suffices = ['cpu', 'cuda'] if WITH_HIP else ['cpu']
+WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
+suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
 if os.getenv('FORCE_CUDA', '0') == '1':
    suffices = ['cuda', 'cpu']
-if os.getenv('FORCE_ONLY_HIP', '0') == '1':
-    suffices = ['hip']
+if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
+    suffices = ['cuda']
 if os.getenv('FORCE_ONLY_CPU', '0') == '1':
    suffices = ['cpu']
-ROCM_PATH = os.getenv('ROCM_PATH')
-HIPLIB = osp.join(ROCM_PATH, 'hipsparse', 'include')

 BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'

@@ -46,12 +44,12 @@ def get_extensions():
        else:
            print('Compiling without OpenMP...')

-        if suffix == 'hip':
-            define_macros += [('WITH_HIP', None)]
-            hipcc_flags = os.getenv('HIPCC_FLAGS', '')
-            hipcc_flags = [] if hipcc_flags == '' else hipcc_flags.split(' ')
-            hipcc_flags += ['--expt-relaxed-constexpr', '-O2']
-            extra_compile_args['hipcc'] = hipcc_flags
+        if suffix == 'cuda':
+            define_macros += [('WITH_CUDA', None)]
+            nvcc_flags = os.getenv('NVCC_FLAGS', '')
+            nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
+            nvcc_flags += ['--expt-relaxed-constexpr', '-O2']
+            extra_compile_args['nvcc'] = nvcc_flags

        name = main.split(os.sep)[-1][:-4]
        sources = [main]
@@ -60,16 +58,15 @@ def get_extensions():
        if osp.exists(path):
            sources += [path]

-        path = osp.join(extensions_dir, 'hip', f'{name}_hip.hip')
-        if suffix == 'hip' and osp.exists(path):
+        path = osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')
+        if suffix == 'cuda' and osp.exists(path):
            sources += [path]

        Extension = CppExtension if suffix == 'cpu' else CUDAExtension
-        define_macros += [('TORCH_HIP_VERSION', 10000), ('__HIP__', None), ('__HCC__', None)]
        extension = Extension(
            f'torch_scatter._{name}_{suffix}',
            sources,
-            include_dirs=[extensions_dir, HIPLIB],
+            include_dirs=[extensions_dir],
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
            extra_link_args=extra_link_args,

--- a/test/__init__.py
+++ b/test/__init__.py
--- a/test/composite/test_logsumexp.py
+++ b/test/composite/test_logsumexp.py
+import torch
+from torch_scatter import scatter_logsumexp
+
+
+# Compares scatter_logsumexp with torch.logsumexp computed per index group,
+# then runs a backward pass and checks the TorchScript-compiled function.
+def test_logsumexp():
+    inputs = torch.tensor([
+        0.5, 0.5, 0.0, -2.1, 3.2, 7.0, -1.0, -100.0,
+        float('-inf'),
+        float('-inf'), 0.0
+    ])
+    inputs.requires_grad_()
+    index = torch.tensor([0, 0, 1, 1, 1, 2, 4, 4, 5, 6, 6])
+    # Number of inputs per index value 0..6; index 3 never occurs, so its
+    # group is empty.
+    splits = [2, 3, 1, 0, 2, 1, 2]
+
+    outputs = scatter_logsumexp(inputs, index)
+
+    # Each output slot must equal logsumexp over the matching input slice.
+    for src, out in zip(inputs.split(splits), outputs.unbind()):
+        assert out.tolist() == torch.logsumexp(src, dim=0).tolist()
+
+    # Backward only has to run without raising; gradients are not inspected.
+    outputs.backward(torch.randn_like(outputs))
+
+    jit = torch.jit.script(scatter_logsumexp)
+    assert jit(inputs, index).tolist() == outputs.tolist()
--- a/test/composite/test_softmax.py
+++ b/test/composite/test_softmax.py
+import torch
+from torch_scatter import scatter_log_softmax, scatter_softmax
+
+
+# Compares scatter_softmax with torch.softmax applied to each index group
+# separately, then runs backward and checks the TorchScript-compiled function.
+def test_softmax():
+    src = torch.tensor([0.2, 0, 0.2, -2.1, 3.2, 7, -1, float('-inf')])
+    src.requires_grad_()
+    index = torch.tensor([0, 1, 0, 1, 1, 2, 4, 4])
+
+    out = scatter_softmax(src, index)
+
+    # Reference values: one softmax per index group (groups 0, 1, 2 and 4).
+    out0 = torch.softmax(torch.tensor([0.2, 0.2]), dim=-1)
+    out1 = torch.softmax(torch.tensor([0, -2.1, 3.2]), dim=-1)
+    out2 = torch.softmax(torch.tensor([7], dtype=torch.float), dim=-1)
+    out4 = torch.softmax(torch.tensor([-1, float('-inf')]), dim=-1)
+
+    # Put the per-group references back into the element order of `src`.
+    expected = torch.stack([
+        out0[0], out1[0], out0[1], out1[1], out1[2], out2[0], out4[0], out4[1]
+    ], dim=0)
+
+    assert torch.allclose(out, expected)
+
+    # Backward only has to run without raising; gradients are not inspected.
+    out.backward(torch.randn_like(out))
+
+    jit = torch.jit.script(scatter_softmax)
+    assert jit(src, index).tolist() == out.tolist()
+
+
+# Compares scatter_log_softmax with torch.log_softmax applied to each index
+# group separately, then runs backward and checks the TorchScript-compiled
+# function.
+def test_log_softmax():
+    src = torch.tensor([0.2, 0, 0.2, -2.1, 3.2, 7, -1, float('-inf')])
+    src.requires_grad_()
+    index = torch.tensor([0, 1, 0, 1, 1, 2, 4, 4])
+
+    out = scatter_log_softmax(src, index)
+
+    # Reference values: one log_softmax per index group (groups 0, 1, 2, 4).
+    out0 = torch.log_softmax(torch.tensor([0.2, 0.2]), dim=-1)
+    out1 = torch.log_softmax(torch.tensor([0, -2.1, 3.2]), dim=-1)
+    out2 = torch.log_softmax(torch.tensor([7], dtype=torch.float), dim=-1)
+    out4 = torch.log_softmax(torch.tensor([-1, float('-inf')]), dim=-1)
+
+    # Put the per-group references back into the element order of `src`.
+    expected = torch.stack([
+        out0[0], out1[0], out0[1], out1[1], out1[2], out2[0], out4[0], out4[1]
+    ], dim=0)
+
+    assert torch.allclose(out, expected)
+
+    # Backward only has to run without raising; gradients are not inspected.
+    out.backward(torch.randn_like(out))
+
+    jit = torch.jit.script(scatter_log_softmax)
+    assert jit(src, index).tolist() == out.tolist()
--- a/test/composite/test_std.py
+++ b/test/composite/test_std.py
+import torch
+from torch_scatter import scatter_std
+
+
+# Checks scatter_std against Tensor.std on a case where each row is scattered
+# into a single output slot, then runs backward and the TorchScript variant.
+def test_std():
+    src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]], dtype=torch.float)
+    src.requires_grad_()
+    # Row 0 goes entirely to slot 0, row 1 entirely to slot 1.
+    index = torch.tensor([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=torch.long)
+
+    out = scatter_std(src, index, dim=-1, unbiased=True)
+    # Both rows hold the same values in different order, so the first row's
+    # std serves as the reference for both.
+    std = src.std(dim=-1, unbiased=True)[0]
+    # The slot a row does not write to is expected to be 0.
+    expected = torch.tensor([[std, 0], [0, std]])
+    assert torch.allclose(out, expected)
+
+    # Backward only has to run without raising; gradients are not inspected.
+    out.backward(torch.randn_like(out))
+
+    jit = torch.jit.script(scatter_std)
+    assert jit(src, index, dim=-1, unbiased=True).tolist() == out.tolist()
--- a/test/test_broadcasting.py
+++ b/test/test_broadcasting.py
+from itertools import product
+
+import pytest
+import torch
+from torch_scatter import scatter
+
+from .utils import reductions, devices
+
+
+# Checks that scatter accepts index tensors with fewer or smaller dimensions
+# than src and still returns an output of shape (B, C, H, W).
+@pytest.mark.parametrize('reduce,device', product(reductions, devices))
+def test_broadcasting(reduce, device):
+    B, C, H, W = (4, 3, 8, 8)
+
+    # 1-D index of length H.
+    src = torch.randn((B, C, H, W), device=device)
+    index = torch.randint(0, H, (H, )).to(device, torch.long)
+    out = scatter(src, index, dim=2, dim_size=H, reduce=reduce)
+    assert out.size() == (B, C, H, W)
+
+    # 4-D index with a singleton channel dimension.
+    src = torch.randn((B, C, H, W), device=device)
+    index = torch.randint(0, H, (B, 1, H, W)).to(device, torch.long)
+    out = scatter(src, index, dim=2, dim_size=H, reduce=reduce)
+    assert out.size() == (B, C, H, W)
+
+    # NOTE(review): this case repeats the first one (1-D index of length H);
+    # confirm whether a different index shape was intended.
+    src = torch.randn((B, C, H, W), device=device)
+    index = torch.randint(0, H, (H, )).to(device, torch.long)
+    out = scatter(src, index, dim=2, dim_size=H, reduce=reduce)
+    assert out.size() == (B, C, H, W)
--- a/test/test_gather.py
+++ b/test/test_gather.py
+from itertools import product
+
+import pytest
+import torch
+from torch.autograd import gradcheck
+from torch_scatter import gather_csr, gather_coo
+
+from .utils import tensor, dtypes, devices
+
+# Gather test cases. Each case provides the source values, the same grouping
+# written once as 'index' (consumed by gather_coo) and once as 'indptr'
+# (consumed by gather_csr), and the output both calls are expected to produce.
+tests = [
+    {
+        'src': [1, 2, 3, 4],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'expected': [1, 1, 2, 2, 2, 4],
+    },
+    {
+        'src': [[1, 2], [3, 4], [5, 6], [7, 8]],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'expected': [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4], [7, 8]]
+    },
+    {
+        'src': [[1, 3, 5, 7], [2, 4, 6, 8]],
+        'index': [[0, 0, 1, 1, 1, 3], [0, 0, 0, 1, 1, 2]],
+        'indptr': [[0, 2, 5, 5, 6], [0, 3, 5, 6, 6]],
+        'expected': [[1, 1, 3, 3, 3, 7], [2, 2, 2, 4, 4, 6]],
+    },
+    {
+        'src': [[[1, 2], [3, 4], [5, 6]], [[7, 9], [10, 11], [12, 13]]],
+        'index': [[0, 0, 1], [0, 2, 2]],
+        'indptr': [[0, 2, 3, 3], [0, 1, 1, 3]],
+        'expected': [[[1, 2], [1, 2], [3, 4]], [[7, 9], [12, 13], [12, 13]]],
+    },
+    {
+        'src': [[1], [2]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'expected': [[1, 1], [2, 2]],
+    },
+    {
+        'src': [[[1, 1]], [[2, 2]]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'expected': [[[1, 1], [1, 1]], [[2, 2], [2, 2]]],
+    },
+]
+
+
+# Forward check: gather_csr and gather_coo must both reproduce the expected
+# output of each case, for every dtype and device.
+@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
+def test_forward(test, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test['expected'], dtype, device)
+
+    out = gather_csr(src, indptr)
+    assert torch.all(out == expected)
+
+    out = gather_coo(src, index)
+    assert torch.all(out == expected)
+
+
+# Gradient check for gather_csr and gather_coo with double-precision inputs.
+@pytest.mark.parametrize('test,device', product(tests, devices))
+def test_backward(test, device):
+    src = tensor(test['src'], torch.double, device)
+    src.requires_grad_()
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+
+    # The trailing None presumably fills the optional `out` argument that
+    # test_out passes explicitly - confirm against the gather signatures.
+    assert gradcheck(gather_csr, (src, indptr, None)) is True
+    assert gradcheck(gather_coo, (src, index, None)) is True
+
+
+# Checks that gather_csr and gather_coo write their result into a
+# caller-provided output tensor.
+@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
+def test_out(test, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test['expected'], dtype, device)
+
+    # Output has src's shape, except that the last index dimension takes the
+    # length of `index`.
+    size = list(src.size())
+    size[index.dim() - 1] = index.size(-1)
+    # Pre-fill with -2, a value that does not occur in any expected output,
+    # so entries left untouched would fail the comparison below.
+    out = src.new_full(size, -2)
+
+    gather_csr(src, indptr, out)
+    assert torch.all(out == expected)
+
+    # Reset the buffer before reusing it for the COO variant.
+    out.fill_(-2)
+
+    gather_coo(src, index, out)
+    assert torch.all(out == expected)
+
+
+# Repeats the forward check with inputs whose memory layout has been
+# transposed, so the operators are exercised on non-default strides.
+@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
+def test_non_contiguous(test, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test['expected'], dtype, device)
+
+    # transpose -> contiguous -> transpose keeps shape and values but leaves
+    # the data stored in transposed order. 1-D tensors are left as they are.
+    if src.dim() > 1:
+        src = src.transpose(0, 1).contiguous().transpose(0, 1)
+    if index.dim() > 1:
+        index = index.transpose(0, 1).contiguous().transpose(0, 1)
+    if indptr.dim() > 1:
+        indptr = indptr.transpose(0, 1).contiguous().transpose(0, 1)
+
+    out = gather_csr(src, indptr)
+    assert torch.all(out == expected)
+
+    out = gather_coo(src, index)
+    assert torch.all(out == expected)
--- a/test/test_multi_gpu.py
+++ b/test/test_multi_gpu.py
+from itertools import product
+
+import pytest
+import torch
+import torch_scatter
+
+from .utils import reductions, tensor, dtypes
+
+# Single 1-D case for the multi-GPU test: the same grouping is given as
+# 'index' and as 'indptr', followed by the expected output per reduction.
+tests = [
+    {
+        'src': [1, 2, 3, 4, 5, 6],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'dim': 0,
+        'sum': [3, 12, 0, 6],
+        'add': [3, 12, 0, 6],
+        'mean': [1.5, 4, 0, 6],
+        'min': [1, 3, 0, 6],
+        'max': [2, 5, 0, 6],
+    },
+]
+
+
+# Runs scatter, segment_coo and segment_csr with all tensors placed on the
+# second GPU ('cuda:1'). Skipped unless CUDA is available with at least two
+# devices.
+@pytest.mark.skipif(not torch.cuda.is_available(), reason='CUDA not available')
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason='No multiple GPUS')
+@pytest.mark.parametrize('test,reduce,dtype', product(tests, reductions,
+                                                      dtypes))
+def test_forward(test, reduce, dtype):
+    device = torch.device('cuda:1')
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    out = torch_scatter.scatter(src, index, dim, reduce=reduce)
+    assert torch.all(out == expected)
+
+    out = torch_scatter.segment_coo(src, index, reduce=reduce)
+    assert torch.all(out == expected)
+
+    out = torch_scatter.segment_csr(src, indptr, reduce=reduce)
+    assert torch.all(out == expected)
--- a/test/test_scatter.py
+++ b/test/test_scatter.py
+from itertools import product
+
+import pytest
+import torch
+from torch.autograd import gradcheck
+import torch_scatter
+
+from .utils import reductions, tensor, dtypes, devices
+
+# Extend the shared reduction list with 'mul' for this module's tests.
+reductions = reductions + ['mul']
+
+# Scatter test cases. Each case gives `src`, `index` and `dim`, followed by
+# the expected output for every reduction name; 'arg_min' / 'arg_max' hold the
+# expected index outputs of the min / max reductions.
+# In these tables, output slots that receive no element expect 0 (1 for
+# 'mul') and an arg value equal to the size of `src` along `dim`.
+tests = [
+    {
+        'src': [1, 3, 2, 4, 5, 6],
+        'index': [0, 1, 0, 1, 1, 3],
+        'dim': 0,
+        'sum': [3, 12, 0, 6],
+        'add': [3, 12, 0, 6],
+        'mul': [2, 60, 1, 6],
+        'mean': [1.5, 4, 0, 6],
+        'min': [1, 3, 0, 6],
+        'arg_min': [0, 1, 6, 5],
+        'max': [2, 5, 0, 6],
+        'arg_max': [2, 4, 6, 5],
+    },
+    {
+        'src': [[1, 2], [5, 6], [3, 4], [7, 8], [9, 10], [11, 12]],
+        'index': [0, 1, 0, 1, 1, 3],
+        'dim': 0,
+        'sum': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'add': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'mul': [[1 * 3, 2 * 4], [5 * 7 * 9, 6 * 8 * 10], [1, 1], [11, 12]],
+        'mean': [[2, 3], [7, 8], [0, 0], [11, 12]],
+        'min': [[1, 2], [5, 6], [0, 0], [11, 12]],
+        'arg_min': [[0, 0], [1, 1], [6, 6], [5, 5]],
+        'max': [[3, 4], [9, 10], [0, 0], [11, 12]],
+        'arg_max': [[2, 2], [4, 4], [6, 6], [5, 5]],
+    },
+    {
+        'src': [[1, 5, 3, 7, 9, 11], [2, 4, 8, 6, 10, 12]],
+        'index': [[0, 1, 0, 1, 1, 3], [0, 0, 1, 0, 1, 2]],
+        'dim': 1,
+        'sum': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'add': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'mul': [[1 * 3, 5 * 7 * 9, 1, 11], [2 * 4 * 6, 8 * 10, 12, 1]],
+        'mean': [[2, 7, 0, 11], [4, 9, 12, 0]],
+        'min': [[1, 5, 0, 11], [2, 8, 12, 0]],
+        'arg_min': [[0, 1, 6, 5], [0, 2, 5, 6]],
+        'max': [[3, 9, 0, 11], [6, 10, 12, 0]],
+        'arg_max': [[2, 4, 6, 5], [3, 4, 5, 6]],
+    },
+    {
+        'src': [[[1, 2], [5, 6], [3, 4]], [[10, 11], [7, 9], [12, 13]]],
+        'index': [[0, 1, 0], [2, 0, 2]],
+        'dim': 1,
+        'sum': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'add': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'mul': [[[3, 8], [5, 6], [1, 1]], [[7, 9], [1, 1], [120, 11 * 13]]],
+        'mean': [[[2, 3], [5, 6], [0, 0]], [[7, 9], [0, 0], [11, 12]]],
+        'min': [[[1, 2], [5, 6], [0, 0]], [[7, 9], [0, 0], [10, 11]]],
+        'arg_min': [[[0, 0], [1, 1], [3, 3]], [[1, 1], [3, 3], [0, 0]]],
+        'max': [[[3, 4], [5, 6], [0, 0]], [[7, 9], [0, 0], [12, 13]]],
+        'arg_max': [[[2, 2], [1, 1], [3, 3]], [[1, 1], [3, 3], [2, 2]]],
+    },
+    {
+        'src': [[1, 3], [2, 4]],
+        'index': [[0, 0], [0, 0]],
+        'dim': 1,
+        'sum': [[4], [6]],
+        'add': [[4], [6]],
+        'mul': [[3], [8]],
+        'mean': [[2], [3]],
+        'min': [[1], [2]],
+        'arg_min': [[0], [0]],
+        'max': [[3], [4]],
+        'arg_max': [[1], [1]],
+    },
+    {
+        'src': [[[1, 1], [3, 3]], [[2, 2], [4, 4]]],
+        'index': [[0, 0], [0, 0]],
+        'dim': 1,
+        'sum': [[[4, 4]], [[6, 6]]],
+        'add': [[[4, 4]], [[6, 6]]],
+        'mul': [[[3, 3]], [[8, 8]]],
+        'mean': [[[2, 2]], [[3, 3]]],
+        'min': [[[1, 1]], [[2, 2]]],
+        'arg_min': [[[0, 0]], [[0, 0]]],
+        'max': [[[3, 3]], [[4, 4]]],
+        'arg_max': [[[1, 1]], [[1, 1]]],
+    },
+]
+
+
+# Forward check for every scatter_<reduce> function: the eager result must
+# match the expected table entry and the TorchScript-compiled result.
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_forward(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    fn = getattr(torch_scatter, 'scatter_' + reduce)
+    jit = torch.jit.script(fn)
+    out1 = fn(src, index, dim)
+    out2 = jit(src, index, dim)
+    # A tuple result carries (values, arg indices); check the indices too.
+    if isinstance(out1, tuple):
+        out1, arg_out1 = out1
+        out2, arg_out2 = out2
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out1 == arg_expected)
+        # The scripted function must return the same indices as the eager one.
+        assert arg_out1.tolist() == arg_out2.tolist()
+    assert torch.all(out1 == expected)
+    assert out1.tolist() == out2.tolist()
+
+
+# Gradient check for torch_scatter.scatter with double-precision inputs, for
+# every reduction (including 'mul').
+@pytest.mark.parametrize('test,reduce,device',
+                         product(tests, reductions, devices))
+def test_backward(test, reduce, device):
+    src = tensor(test['src'], torch.double, device)
+    src.requires_grad_()
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+
+    # The two None arguments presumably fill the optional `out` and
+    # `dim_size` slots - confirm against the scatter signature.
+    assert gradcheck(torch_scatter.scatter,
+                     (src, index, dim, None, None, reduce))
+
+
+# Checks scatter_<reduce> when the caller passes an output tensor that is
+# pre-filled with -2; the expectation is adjusted per reduction to account
+# for that initial content.
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_out(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    out = torch.full_like(expected, -2)
+
+    getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim, out)
+
+    if reduce == 'sum' or reduce == 'add':
+        # The result is accumulated on top of the -2 already in `out`.
+        expected = expected - 2
+    elif reduce == 'mul':
+        expected = out  # We can not really test this here.
+    elif reduce == 'mean':
+        expected = out  # We can not really test this here.
+    elif reduce == 'min':
+        # Every source value in the tables is larger than -2, so the fill
+        # value is expected to remain in every slot.
+        expected = expected.fill_(-2)
+    elif reduce == 'max':
+        # Only slots that receive no element (expected 0) keep the -2 fill.
+        expected[expected == 0] = -2
+    else:
+        raise ValueError
+
+    assert torch.all(out == expected)
+
+
+# Repeats the scatter forward check with inputs whose memory layout has been
+# transposed, so the operators are exercised on non-default strides.
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_non_contiguous(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    dim = test['dim']
+    expected = tensor(test[reduce], dtype, device)
+
+    # transpose -> contiguous -> transpose keeps shape and values but leaves
+    # the data stored in transposed order. 1-D tensors are left as they are.
+    if src.dim() > 1:
+        src = src.transpose(0, 1).contiguous().transpose(0, 1)
+    if index.dim() > 1:
+        index = index.transpose(0, 1).contiguous().transpose(0, 1)
+
+    out = getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim)
+    # A tuple result carries (values, arg indices); check the indices too.
+    if isinstance(out, tuple):
+        out, arg_out = out
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out == arg_expected)
+    assert torch.all(out == expected)
--- a/test/test_segment.py
+++ b/test/test_segment.py
+from itertools import product
+
+import pytest
+import torch
+from torch.autograd import gradcheck
+import torch_scatter
+
+from .utils import reductions, tensor, dtypes, devices
+
+# Segment test cases. Each case gives `src` and the same grouping written as
+# 'index' (for the *_coo functions) and 'indptr' (for the *_csr functions),
+# followed by the expected output for every reduction name; 'arg_min' /
+# 'arg_max' hold the expected index outputs of the min / max reductions.
+# In these tables, output slots that receive no element expect 0 and an arg
+# value equal to the size of `src` along the reduced dimension.
+tests = [
+    {
+        'src': [1, 2, 3, 4, 5, 6],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'sum': [3, 12, 0, 6],
+        'add': [3, 12, 0, 6],
+        'mean': [1.5, 4, 0, 6],
+        'min': [1, 3, 0, 6],
+        'arg_min': [0, 2, 6, 5],
+        'max': [2, 5, 0, 6],
+        'arg_max': [1, 4, 6, 5],
+    },
+    {
+        'src': [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
+        'index': [0, 0, 1, 1, 1, 3],
+        'indptr': [0, 2, 5, 5, 6],
+        'sum': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'add': [[4, 6], [21, 24], [0, 0], [11, 12]],
+        'mean': [[2, 3], [7, 8], [0, 0], [11, 12]],
+        'min': [[1, 2], [5, 6], [0, 0], [11, 12]],
+        'arg_min': [[0, 0], [2, 2], [6, 6], [5, 5]],
+        'max': [[3, 4], [9, 10], [0, 0], [11, 12]],
+        'arg_max': [[1, 1], [4, 4], [6, 6], [5, 5]],
+    },
+    {
+        'src': [[1, 3, 5, 7, 9, 11], [2, 4, 6, 8, 10, 12]],
+        'index': [[0, 0, 1, 1, 1, 3], [0, 0, 0, 1, 1, 2]],
+        'indptr': [[0, 2, 5, 5, 6], [0, 3, 5, 6, 6]],
+        'sum': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'add': [[4, 21, 0, 11], [12, 18, 12, 0]],
+        'mean': [[2, 7, 0, 11], [4, 9, 12, 0]],
+        'min': [[1, 5, 0, 11], [2, 8, 12, 0]],
+        'arg_min': [[0, 2, 6, 5], [0, 3, 5, 6]],
+        'max': [[3, 9, 0, 11], [6, 10, 12, 0]],
+        'arg_max': [[1, 4, 6, 5], [2, 4, 5, 6]],
+    },
+    {
+        'src': [[[1, 2], [3, 4], [5, 6]], [[7, 9], [10, 11], [12, 13]]],
+        'index': [[0, 0, 1], [0, 2, 2]],
+        'indptr': [[0, 2, 3, 3], [0, 1, 1, 3]],
+        'sum': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'add': [[[4, 6], [5, 6], [0, 0]], [[7, 9], [0, 0], [22, 24]]],
+        'mean': [[[2, 3], [5, 6], [0, 0]], [[7, 9], [0, 0], [11, 12]]],
+        'min': [[[1, 2], [5, 6], [0, 0]], [[7, 9], [0, 0], [10, 11]]],
+        'arg_min': [[[0, 0], [2, 2], [3, 3]], [[0, 0], [3, 3], [1, 1]]],
+        'max': [[[3, 4], [5, 6], [0, 0]], [[7, 9], [0, 0], [12, 13]]],
+        'arg_max': [[[1, 1], [2, 2], [3, 3]], [[0, 0], [3, 3], [2, 2]]],
+    },
+    {
+        'src': [[1, 3], [2, 4]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'sum': [[4], [6]],
+        'add': [[4], [6]],
+        'mean': [[2], [3]],
+        'min': [[1], [2]],
+        'arg_min': [[0], [0]],
+        'max': [[3], [4]],
+        'arg_max': [[1], [1]],
+    },
+    {
+        'src': [[[1, 1], [3, 3]], [[2, 2], [4, 4]]],
+        'index': [[0, 0], [0, 0]],
+        'indptr': [[0, 2], [0, 2]],
+        'sum': [[[4, 4]], [[6, 6]]],
+        'add': [[[4, 4]], [[6, 6]]],
+        'mean': [[[2, 2]], [[3, 3]]],
+        'min': [[[1, 1]], [[2, 2]]],
+        'arg_min': [[[0, 0]], [[0, 0]]],
+        'max': [[[3, 3]], [[4, 4]]],
+        'arg_max': [[[1, 1]], [[1, 1]]],
+    },
+]
+
+
+# Forward check for segment_<reduce>_csr and segment_<reduce>_coo: the eager
+# result must match the expected table entry and the TorchScript-compiled
+# result.
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_forward(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test[reduce], dtype, device)
+
+    # CSR variant, driven by `indptr`.
+    fn = getattr(torch_scatter, 'segment_' + reduce + '_csr')
+    jit = torch.jit.script(fn)
+    out1 = fn(src, indptr)
+    out2 = jit(src, indptr)
+    # A tuple result carries (values, arg indices); check the indices too.
+    if isinstance(out1, tuple):
+        out1, arg_out1 = out1
+        out2, arg_out2 = out2
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out1 == arg_expected)
+        assert arg_out1.tolist() == arg_out2.tolist()
+    assert torch.all(out1 == expected)
+    assert out1.tolist() == out2.tolist()
+
+    # COO variant, driven by `index`.
+    fn = getattr(torch_scatter, 'segment_' + reduce + '_coo')
+    jit = torch.jit.script(fn)
+    out1 = fn(src, index)
+    out2 = jit(src, index)
+    if isinstance(out1, tuple):
+        out1, arg_out1 = out1
+        out2, arg_out2 = out2
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out1 == arg_expected)
+        assert arg_out1.tolist() == arg_out2.tolist()
+    assert torch.all(out1 == expected)
+    assert out1.tolist() == out2.tolist()
+
+
+# Gradient check for segment_csr and segment_coo with double-precision inputs.
+@pytest.mark.parametrize('test,reduce,device',
+                         product(tests, reductions, devices))
+def test_backward(test, reduce, device):
+    src = tensor(test['src'], torch.double, device)
+    src.requires_grad_()
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+
+    # The None arguments presumably fill optional slots (`out`, and for
+    # segment_coo also `dim_size`) - confirm against the function signatures.
+    assert gradcheck(torch_scatter.segment_csr, (src, indptr, None, reduce))
+    assert gradcheck(torch_scatter.segment_coo,
+                     (src, index, None, None, reduce))
+
+
+# Checks the segment functions when the caller passes an output tensor that is
+# pre-filled with -2.
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_out(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test[reduce], dtype, device)
+
+    out = torch.full_like(expected, -2)
+
+    # The CSR result is compared with the unmodified expectation, i.e. the
+    # -2 fill is expected to be fully overwritten.
+    getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr, out)
+    assert torch.all(out == expected)
+
+    # Reset the buffer before reusing it for the COO variant.
+    out.fill_(-2)
+
+    getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index, out)
+
+    # For COO the expectation is adjusted for the initial -2 content.
+    if reduce == 'sum' or reduce == 'add':
+        # The result is accumulated on top of the -2 already in `out`.
+        expected = expected - 2
+    elif reduce == 'mean':
+        expected = out  # We can not really test this here.
+    elif reduce == 'min':
+        # Every source value in the tables is larger than -2, so the fill
+        # value is expected to remain in every slot.
+        expected = expected.fill_(-2)
+    elif reduce == 'max':
+        # Only slots that receive no element (expected 0) keep the -2 fill.
+        expected[expected == 0] = -2
+    else:
+        raise ValueError
+
+    assert torch.all(out == expected)
+
+
+# Repeats the segment forward check with inputs whose memory layout has been
+# transposed, so the operators are exercised on non-default strides.
+@pytest.mark.parametrize('test,reduce,dtype,device',
+                         product(tests, reductions, dtypes, devices))
+def test_non_contiguous(test, reduce, dtype, device):
+    src = tensor(test['src'], dtype, device)
+    index = tensor(test['index'], torch.long, device)
+    indptr = tensor(test['indptr'], torch.long, device)
+    expected = tensor(test[reduce], dtype, device)
+
+    # transpose -> contiguous -> transpose keeps shape and values but leaves
+    # the data stored in transposed order. 1-D tensors are left as they are.
+    if src.dim() > 1:
+        src = src.transpose(0, 1).contiguous().transpose(0, 1)
+    if index.dim() > 1:
+        index = index.transpose(0, 1).contiguous().transpose(0, 1)
+    if indptr.dim() > 1:
+        indptr = indptr.transpose(0, 1).contiguous().transpose(0, 1)
+
+    out = getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr)
+    # A tuple result carries (values, arg indices); check the indices too.
+    if isinstance(out, tuple):
+        out, arg_out = out
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out == arg_expected)
+    assert torch.all(out == expected)
+
+    out = getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index)
+    if isinstance(out, tuple):
+        out, arg_out = out
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
+        assert torch.all(arg_out == arg_expected)
+    assert torch.all(out == expected)
--- a/test/test_zero_tensors.py
+++ b/test/test_zero_tensors.py
+from itertools import product
+
+import pytest
+import torch
+from torch_scatter import scatter, segment_coo, gather_coo
+from torch_scatter import segment_csr, gather_csr
+
+from .utils import reductions, tensor, grad_dtypes, devices
+
+
+# Checks that all five operators accept an input with zero elements and empty
+# index tensors, return a (0, 0, 0, 16) output, and support backward on it.
+@pytest.mark.parametrize('reduce,dtype,device',
+                         product(reductions, grad_dtypes, devices))
+def test_zero_elements(reduce, dtype, device):
+    # Three leading dimensions of size 0, so the tensor holds no elements.
+    x = torch.randn(0, 0, 0, 16, dtype=dtype, device=device,
+                    requires_grad=True)
+    index = tensor([], torch.long, device)
+    indptr = tensor([], torch.long, device)
+
+    out = scatter(x, index, dim=0, dim_size=0, reduce=reduce)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = segment_coo(x, index, dim_size=0, reduce=reduce)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = gather_coo(x, index)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = segment_csr(x, indptr, reduce=reduce)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
+
+    out = gather_csr(x, indptr)
+    out.backward(torch.randn_like(out))
+    assert out.size() == (0, 0, 0, 16)
--- a/test/utils.py
+++ b/test/utils.py
+import torch
+
+# Reduction names shared by the test modules.
+reductions = ['sum', 'add', 'mean', 'min', 'max']
+
+# Dtypes used by the forward tests, and the floating-point subset used where
+# gradients are required.
+dtypes = [torch.half, torch.float, torch.double, torch.int, torch.long]
+grad_dtypes = [torch.float, torch.double]
+
+# Always test on CPU; add the current CUDA device when one is available.
+devices = [torch.device('cpu')]
+if torch.cuda.is_available():
+    devices += [torch.device(f'cuda:{torch.cuda.current_device()}')]
+
+
+# Builds a tensor from `x` on `device` and converts it to `dtype`.
+# Returns None unchanged when `x` is None.
+def tensor(x, dtype, device):
+    return None if x is None else torch.tensor(x, device=device).to(dtype)
--- a/torch_scatter.egg-info/PKG-INFO
+++ b/torch_scatter.egg-info/PKG-INFO
-Metadata-Version: 2.1
-Name: torch-scatter
-Version: 2.0.9
-Summary: PyTorch Extension Library of Optimized Scatter Operations
-Home-page: https://github.com/rusty1s/pytorch_scatter
-Author: Matthias Fey
-Author-email: matthias.fey@tu-dortmund.de
-License: MIT
-Keywords: pytorch,scatter,segment,gather
-Requires-Python: >=3.6
-Provides-Extra: test
-License-File: LICENSE
--- a/torch_scatter.egg-info/SOURCES.txt
+++ b/torch_scatter.egg-info/SOURCES.txt
-LICENSE
-MANIFEST.in
-README.md
-setup.cfg
-setup.py
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/scatter.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/segment_coo.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/segment_csr.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/version.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/cpu/scatter_cpu.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/cpu/segment_coo_cpu.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/cpu/segment_csr_cpu.cpp
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/hip/scatter_hip_hip.hip
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/hip/segment_coo_hip_hip.hip
-/work/home/quyuanhao123/software/test_ocp/torch_scatter-2.0.9/csrc/hip/segment_csr_hip_hip.hip
-csrc/scatter.cpp
-csrc/scatter.h
-csrc/segment_coo.cpp
-csrc/segment_csr.cpp
-csrc/utils.h
-csrc/version.cpp
-csrc/cpu/index_info.h
-csrc/cpu/reducer.h
-csrc/cpu/scatter_cpu.cpp
-csrc/cpu/scatter_cpu.h
-csrc/cpu/segment_coo_cpu.cpp
-csrc/cpu/segment_coo_cpu.h
-csrc/cpu/segment_csr_cpu.cpp
-csrc/cpu/segment_csr_cpu.h
-csrc/cpu/utils.h
-csrc/hip/atomics.cuh
-csrc/hip/index_info.cuh
-csrc/hip/reducer.cuh
-csrc/hip/scatter_hip.h
-csrc/hip/scatter_hip.hip
-csrc/hip/scatter_hip_hip.hip
-csrc/hip/segment_coo_hip.h
-csrc/hip/segment_coo_hip.hip
-csrc/hip/segment_coo_hip_hip.hip
-csrc/hip/segment_csr_hip.h
-csrc/hip/segment_csr_hip.hip
-csrc/hip/segment_csr_hip_hip.hip
-csrc/hip/utils.cuh
-torch_scatter/__init__.py
-torch_scatter/placeholder.py
-torch_scatter/scatter.py
-torch_scatter/segment_coo.py
-torch_scatter/segment_csr.py
-torch_scatter/utils.py
-torch_scatter.egg-info/PKG-INFO
-torch_scatter.egg-info/SOURCES.txt
-torch_scatter.egg-info/dependency_links.txt
-torch_scatter.egg-info/requires.txt
-torch_scatter.egg-info/top_level.txt
-torch_scatter/composite/__init__.py
-torch_scatter/composite/logsumexp.py
-torch_scatter/composite/softmax.py
-torch_scatter/composite/std.py
\ No newline at end of file
--- a/torch_scatter.egg-info/dependency_links.txt
+++ b/torch_scatter.egg-info/dependency_links.txt
-
--- a/torch_scatter.egg-info/top_level.txt
+++ b/torch_scatter.egg-info/top_level.txt
-torch_scatter
--- a/torch_scatter/__init__.py
+++ b/torch_scatter/__init__.py
@@ -7,11 +7,11 @@ import torch
 __version__ = '2.0.9'

 for library in ['_version', '_scatter', '_segment_csr', '_segment_coo']:
-    hip_spec = importlib.machinery.PathFinder().find_spec(
-        f'{library}_hip', [osp.dirname(__file__)])
+    cuda_spec = importlib.machinery.PathFinder().find_spec(
+        f'{library}_cuda', [osp.dirname(__file__)])
    cpu_spec = importlib.machinery.PathFinder().find_spec(
        f'{library}_cpu', [osp.dirname(__file__)])
-    spec = hip_spec or cpu_spec
+    spec = cuda_spec or cpu_spec
    if spec is not None:
        torch.ops.load_library(spec.origin)
    elif os.getenv('BUILD_DOCS', '0') != '1':  # pragma: no cover
@@ -52,6 +52,15 @@ if torch.cuda.is_available() and cuda_version != -1:  # pragma: no cover
        major, minor = int(str(cuda_version)[0]), int(str(cuda_version)[2])
    else:
        major, minor = int(str(cuda_version)[0:2]), int(str(cuda_version)[3])
+    t_major, t_minor = [int(x) for x in torch.version.cuda.split('.')]
+
+    if t_major != major:
+        raise RuntimeError(
+            f'Detected that PyTorch and torch_scatter were compiled with '
+            f'different CUDA versions. PyTorch has CUDA version '
+            f'{t_major}.{t_minor} and torch_scatter has CUDA version '
+            f'{major}.{minor}. Please reinstall the torch_scatter that '
+            f'matches your PyTorch install.')

 from .scatter import scatter_sum, scatter_add, scatter_mul  # noqa
 from .scatter import scatter_mean, scatter_min, scatter_max, scatter  # noqa