Commit c7c514c2 authored by yangzhong's avatar yangzhong

push 2.0.9 version

parent cf967b1f
cmake_minimum_required(VERSION 3.0)
project(torchscatter)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSCATTER_VERSION 2.0.9)

option(WITH_CUDA "Enable CUDA support" OFF)

if(WITH_CUDA)
  enable_language(CUDA)
  add_definitions(-D__CUDA_NO_HALF_OPERATORS__)
  add_definitions(-DWITH_CUDA)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
endif()

find_package(Python3 COMPONENTS Development)
find_package(Torch REQUIRED)

file(GLOB HEADERS csrc/scatter.h)
file(GLOB OPERATOR_SOURCES csrc/cpu/*.h csrc/cpu/*.cpp csrc/*.cpp)
if(WITH_CUDA)
  file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu)
endif()

add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python3::Python)
set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchScatter)
target_include_directories(${PROJECT_NAME} INTERFACE
  $<BUILD_INTERFACE:${HEADERS}>
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

set(TORCHSCATTER_CMAKECONFIG_INSTALL_DIR "share/cmake/TorchScatter" CACHE STRING "install path for TorchScatterConfig.cmake")

configure_package_config_file(cmake/TorchScatterConfig.cmake.in
  "${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfig.cmake"
  INSTALL_DESTINATION ${TORCHSCATTER_CMAKECONFIG_INSTALL_DIR})
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfigVersion.cmake
  VERSION ${TORCHSCATTER_VERSION}
  COMPATIBILITY AnyNewerVersion)

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfig.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfigVersion.cmake
        DESTINATION ${TORCHSCATTER_CMAKECONFIG_INSTALL_DIR})
install(TARGETS ${PROJECT_NAME}
        EXPORT TorchScatterTargets
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
install(EXPORT TorchScatterTargets
        NAMESPACE TorchScatter::
        DESTINATION ${TORCHSCATTER_CMAKECONFIG_INSTALL_DIR})

install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME})
install(FILES
        csrc/cpu/scatter_cpu.h
        csrc/cpu/segment_coo_cpu.h
        csrc/cpu/segment_csr_cpu.h
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu)
if(WITH_CUDA)
  install(FILES
          csrc/cuda/scatter_cuda.h
          csrc/cuda/segment_coo_cuda.h
          csrc/cuda/segment_csr_cuda.h
          DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
endif()

if(WITH_CUDA)
  set_property(TARGET torch_cuda PROPERTY INTERFACE_COMPILE_OPTIONS "")
  set_property(TARGET torch_cpu PROPERTY INTERFACE_COMPILE_OPTIONS "")
endif()
Metadata-Version: 2.1
Name: torch_scatter
Version: 2.0.9
Summary: PyTorch Extension Library of Optimized Scatter Operations
Home-page: https://github.com/rusty1s/pytorch_scatter
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: MIT
Description: UNKNOWN
Keywords: pytorch,scatter,segment,gather
Platform: UNKNOWN
Requires-Python: >=3.6
Provides-Extra: test
# <div align="center"><strong>torch-scatter-2.0.9</strong></div>

## Introduction

torch-scatter is a small extension library for PyTorch that provides highly optimized sparse update (scatter and segment) operations, i.e. reduce operations over a tensor driven by a "group-index" tensor, which are missing from the main PyTorch package. It is a core dependency of graph learning stacks such as PyTorch Geometric.

## Dependencies

+ PyTorch 1.10 or PyTorch 1.13 and the matching torchvision (dtk-22.04.2, dtk-23.04 and dtk-23.10 are recommended)
+ Python 3.7-3.10

### 1. Installing by building from source

#### Preparing the build environment

Two ways to prepare the environment are provided:

1. Use the SourceFind DCU PyTorch base image: download it from [https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch), choosing the image that matches your PyTorch, Python, dtk and OS versions.
2. Use an existing Python environment: install PyTorch and torchvision from the wheel directories [https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch) and [https://cancon.hpccube.com:65024/4/main/vision](https://cancon.hpccube.com:65024/4/main/vision), choosing the wheels that match your Python and dtk versions. Install them with:

```shell
pip install torch*        # the downloaded torch wheel
pip install torchvision*  # the downloaded torchvision wheel
pip install setuptools==59.5.0 wheel
```

#### Building and installing from source

- Download the code:

```shell
git clone http://developer.hpccube.com/codes/aicomponent/torch-scatter  # switch to the branch you need
```

- Build from source (inside the torch-scatter directory):

```shell
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=$ROCM_PATH/rocrand/lib:$LD_LIBRARY_PATH

export FORCE_ONLY_HIP=1
export CC=hipcc
export CXX=hipcc

python setup.py install
```

- Alternatively, compile the python library as a wheel and install it:

```shell
python setup.py bdist_wheel
pip install dist/*.whl
```

#### Notes

+ If downloads via pip are slow, add the Tsinghua PyPI mirror: `-i https://pypi.tuna.tsinghua.edu.cn/simple/`
+ `ROCM_PATH` is the dtk installation path, `/opt/dtk` by default.

## Verification

```python
import torch
from torch_scatter import scatter_max

src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
out, argmax = scatter_max(src, index, dim=-1)

print(out)
# tensor([[0, 0, 4, 3, 2, 0],
#         [2, 4, 3, 0, 0, 0]])

print(argmax)
# tensor([[5, 5, 3, 4, 0, 1],
#         [1, 4, 3, 5, 5, 5]])
```

## Known Issues

- The library has not been adapted for CPU-only environments and supports DCU only; run it on a machine with a DCU card.
- To use the full set of PyG features, additionally run `pip install torch-geometric`.

## References

- [README_ORIGIN](README_ORIGIN.md)
- [https://pypi.org/project/torch-scatter/2.0.9/](https://pypi.org/project/torch-scatter/2.0.9/)
[pypi-image]: https://badge.fury.io/py/torch-scatter.svg
[pypi-url]: https://pypi.python.org/pypi/torch-scatter
[testing-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml
[docs-image]: https://readthedocs.org/projects/pytorch-scatter/badge/?version=latest
[docs-url]: https://pytorch-scatter.readthedocs.io/en/latest/?badge=latest
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_scatter/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_scatter?branch=master
# PyTorch Scatter
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Docs Status][docs-image]][docs-url]
[![Code Coverage][coverage-image]][coverage-url]
<p align="center">
<img width="50%" src="https://raw.githubusercontent.com/rusty1s/pytorch_scatter/master/docs/source/_figures/add.svg?sanitize=true" />
</p>
--------------------------------------------------------------------------------
**[Documentation](https://pytorch-scatter.readthedocs.io)**
This package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for use in [PyTorch](http://pytorch.org/), which are missing in the main package.
Scatter and segment operations can be roughly described as reduce operations based on a given "group-index" tensor.
Segment operations require the "group-index" tensor to be sorted, whereas scatter operations are not subject to these requirements.
The package consists of the following operations with reduction types `"sum"|"mean"|"min"|"max"`:
* [**scatter**](https://pytorch-scatter.readthedocs.io/en/latest/functions/scatter.html) based on arbitrary indices
* [**segment_coo**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_coo.html) based on sorted indices
* [**segment_csr**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_csr.html) based on compressed indices via pointers
In addition, we provide the following **composite functions** which make use of `scatter_*` operations under the hood: `scatter_std`, `scatter_logsumexp`, `scatter_softmax` and `scatter_log_softmax`.
All included operations are broadcastable, work on varying data types, are implemented both for CPU and GPU with corresponding backward implementations, and are fully traceable.
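To make the three entry points and the composite functions concrete, here is a minimal sketch (not part of the upstream README; the tensors and values are invented for illustration) that reduces the same five values with all three operations and applies one composite on top:

```python
import torch
from torch_scatter import scatter, segment_coo, segment_csr, scatter_softmax

# Five values grouped into three buckets. The index is already sorted,
# so all three entry points apply and must produce the same reduction.
src = torch.tensor([1., 2., 3., 4., 5.])
index = torch.tensor([0, 0, 1, 1, 2])    # "group-index" per element
indptr = torch.tensor([0, 2, 4, 5])      # CSR-style pointers into src

out_scatter = scatter(src, index, dim=0, reduce="sum")  # arbitrary indices
out_coo = segment_coo(src, index, reduce="sum")         # sorted indices
out_csr = segment_csr(src, indptr, reduce="sum")        # compressed pointers

assert torch.allclose(out_scatter, out_coo) and torch.allclose(out_scatter, out_csr)
print(out_scatter)                         # tensor([3., 7., 5.])
print(scatter_softmax(src, index, dim=0))  # softmax computed independently per group
```

Because the index is sorted, `segment_coo` and `segment_csr` can exploit the contiguous group layout; the benchmark scripts included further down in this commit compare exactly these code paths.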
## Installation
### Anaconda
**Update:** You can now install `pytorch-scatter` via [Anaconda](https://anaconda.org/pyg/pytorch-scatter) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-scatter -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.10.0
To install the binaries for PyTorch 1.10.0, simply run
```
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 1.9.0/1.9.1
To install the binaries for PyTorch 1.9.0 and 1.9.1, simply run
```
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu111` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1 and PyTorch 1.8.0/1.8.1 (following the same procedure).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-scatter
```
When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
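As a quick smoke test after installation (a minimal sketch, not taken from this repository; the values are arbitrary), import the package and run a single reduction before moving on to the full example below:

```python
import torch
import torch_scatter
from torch_scatter import scatter

print(torch_scatter.__version__)                 # e.g. 2.0.9
src = torch.tensor([1., 2., 3.])
index = torch.tensor([0, 0, 1])
print(scatter(src, index, dim=0, reduce="sum"))  # tensor([3., 3.])
```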
## Example
```py
import torch
from torch_scatter import scatter_max
src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
out, argmax = scatter_max(src, index, dim=-1)
```
```
print(out)
tensor([[0, 0, 4, 3, 2, 0],
[2, 4, 3, 0, 0, 0]])
print(argmax)
tensor([[5, 5, 3, 4, 0, 1],
[1, 4, 3, 5, 5, 5]])
```
## Running tests
```
python setup.py test
```
## C++ API
`torch-scatter` also offers a C++ API that contains C++ equivalents of the Python functions.
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support if needed
cmake ..
make
make install
```
import time
import itertools
import argparse

import torch
from scipy.io import loadmat

from torch_scatter import gather_coo, gather_csr
from scatter_segment import short_rows, long_rows, download, bold


@torch.no_grad()
def correctness(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    dim_size = rowptr.size(0) - 1

    for size in sizes[1:]:
        try:
            x = torch.randn((dim_size, size), device=args.device)
            x = x.squeeze(-1) if size == 1 else x

            out1 = x.index_select(0, row)
            out2 = gather_coo(x, row)
            out3 = gather_csr(x, rowptr)

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()


def time_func(func, x):
    try:
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t = time.perf_counter()

        if not args.with_backward:
            with torch.no_grad():
                for _ in range(iters):
                    func(x)
        else:
            x = x.requires_grad_()
            for _ in range(iters):
                out = func(x)
                torch.autograd.grad(out, x, out, only_inputs=True)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter() - t
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise RuntimeError(e)
        torch.cuda.empty_cache()
        return float('inf')


def timing(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    dim_size = rowptr.size(0) - 1
    avg_row_len = row.size(0) / dim_size

    def select(x):
        return x.index_select(0, row)

    def gather(x):
        return x.gather(0, row.view(-1, 1).expand(-1, x.size(1)))

    def gat_coo(x):
        return gather_coo(x, row)

    def gat_csr(x):
        return gather_csr(x, rowptr)

    t1, t2, t3, t4 = [], [], [], []
    for size in sizes:
        try:
            x = torch.randn((dim_size, size), device=args.device)

            t1 += [time_func(select, x)]
            t2 += [time_func(gather, x)]
            t3 += [time_func(gat_coo, x)]
            t4 += [time_func(gat_csr, x)]

            del x
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()
            for t in (t1, t2, t3, t4):
                t.append(float('inf'))

    ts = torch.tensor([t1, t2, t3, t4])
    winner = torch.zeros_like(ts, dtype=torch.bool)
    winner[ts.argmin(dim=0), torch.arange(len(sizes))] = 1
    winner = winner.tolist()

    name = f'{group}/{name}'
    print(f'{bold(name)} (avg row length: {avg_row_len:.2f}):')
    print('\t'.join([' '] + [f'{size:>5}' for size in sizes]))
    print('\t'.join([bold('SELECT ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t1, winner[0])]))
    print('\t'.join([bold('GAT    ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t2, winner[1])]))
    print('\t'.join([bold('GAT_COO')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t3, winner[2])]))
    print('\t'.join([bold('GAT_CSR')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t4, winner[3])]))
    print()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--with_backward', action='store_true')
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()

    iters = 1 if args.device == 'cpu' else 20
    sizes = [1, 16, 32, 64, 128, 256, 512]
    sizes = sizes[:3] if args.device == 'cpu' else sizes

    for _ in range(10):  # Warmup.
        torch.randn(100, 100, device=args.device).sum()

    for dataset in itertools.chain(short_rows, long_rows):
        download(dataset)
        correctness(dataset)
        timing(dataset)
import time
import os.path as osp
import itertools
import argparse

import wget
import torch
from scipy.io import loadmat

from torch_scatter import scatter, segment_coo, segment_csr

short_rows = [
    ('DIMACS10', 'citationCiteseer'),
    ('SNAP', 'web-Stanford'),
]
long_rows = [
    ('Janna', 'StocF-1465'),
    ('GHS_psdef', 'ldoor'),
]


def download(dataset):
    url = 'https://sparse.tamu.edu/mat/{}/{}.mat'
    for group, name in itertools.chain(long_rows, short_rows):
        if not osp.exists(f'{name}.mat'):
            print(f'Downloading {group}/{name}:')
            wget.download(url.format(group, name))
            print('')


def bold(text, flag=True):
    return f'\033[1m{text}\033[0m' if flag else text


@torch.no_grad()
def correctness(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    dim_size = rowptr.size(0) - 1

    for size in sizes:
        try:
            x = torch.randn((row.size(0), size), device=args.device)
            x = x.squeeze(-1) if size == 1 else x

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='add')
            out2 = segment_coo(x, row, dim_size=dim_size, reduce='add')
            out3 = segment_csr(x, rowptr, reduce='add')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='mean')
            out2 = segment_coo(x, row, dim_size=dim_size, reduce='mean')
            out3 = segment_csr(x, rowptr, reduce='mean')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='min')
            out2 = segment_coo(x, row, reduce='min')
            out3 = segment_csr(x, rowptr, reduce='min')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='max')
            out2 = segment_coo(x, row, reduce='max')
            out3 = segment_csr(x, rowptr, reduce='max')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()


def time_func(func, x):
    try:
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t = time.perf_counter()

        if not args.with_backward:
            with torch.no_grad():
                for _ in range(iters):
                    func(x)
        else:
            x = x.requires_grad_()
            for _ in range(iters):
                out = func(x)
                out = out[0] if isinstance(out, tuple) else out
                torch.autograd.grad(out, x, out, only_inputs=True)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter() - t
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise RuntimeError(e)
        torch.cuda.empty_cache()
        return float('inf')


def timing(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    row2 = row[torch.randperm(row.size(0))]
    dim_size = rowptr.size(0) - 1
    avg_row_len = row.size(0) / dim_size

    def sca1_row(x):
        out = x.new_zeros(dim_size, *x.size()[1:])
        row_tmp = row.view(-1, 1).expand_as(x) if x.dim() > 1 else row
        return out.scatter_add_(0, row_tmp, x)

    def sca1_col(x):
        out = x.new_zeros(dim_size, *x.size()[1:])
        row2_tmp = row2.view(-1, 1).expand_as(x) if x.dim() > 1 else row2
        return out.scatter_add_(0, row2_tmp, x)

    def sca2_row(x):
        return scatter(x, row, dim=0, dim_size=dim_size, reduce=args.reduce)

    def sca2_col(x):
        return scatter(x, row2, dim=0, dim_size=dim_size, reduce=args.reduce)

    def seg_coo(x):
        return segment_coo(x, row, reduce=args.reduce)

    def seg_csr(x):
        return segment_csr(x, rowptr, reduce=args.reduce)

    def dense1(x):
        return getattr(torch, args.reduce)(x, dim=-2)

    def dense2(x):
        return getattr(torch, args.reduce)(x, dim=-1)

    t1, t2, t3, t4, t5, t6, t7, t8 = [], [], [], [], [], [], [], []
    for size in sizes:
        try:
            x = torch.randn((row.size(0), size), device=args.device)
            x = x.squeeze(-1) if size == 1 else x

            t1 += [time_func(sca1_row, x)]
            t2 += [time_func(sca1_col, x)]
            t3 += [time_func(sca2_row, x)]
            t4 += [time_func(sca2_col, x)]
            t5 += [time_func(seg_coo, x)]
            t6 += [time_func(seg_csr, x)]

            del x
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()
            for t in (t1, t2, t3, t4, t5, t6):
                t.append(float('inf'))

        try:
            x = torch.randn((dim_size, int(avg_row_len + 1), size),
                            device=args.device)
            t7 += [time_func(dense1, x)]
            x = x.view(dim_size, size, int(avg_row_len + 1))
            t8 += [time_func(dense2, x)]
            del x
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()
            for t in (t7, t8):
                t.append(float('inf'))

    ts = torch.tensor([t1, t2, t3, t4, t5, t6, t7, t8])
    winner = torch.zeros_like(ts, dtype=torch.bool)
    winner[ts.argmin(dim=0), torch.arange(len(sizes))] = 1
    winner = winner.tolist()

    name = f'{group}/{name}'
    print(f'{bold(name)} (avg row length: {avg_row_len:.2f}):')
    print('\t'.join([' '] + [f'{size:>5}' for size in sizes]))
    print('\t'.join([bold('SCA1_ROW')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t1, winner[0])]))
    print('\t'.join([bold('SCA1_COL')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t2, winner[1])]))
    print('\t'.join([bold('SCA2_ROW')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t3, winner[2])]))
    print('\t'.join([bold('SCA2_COL')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t4, winner[3])]))
    print('\t'.join([bold('SEG_COO ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t5, winner[4])]))
    print('\t'.join([bold('SEG_CSR ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t6, winner[5])]))
    print('\t'.join([bold('DENSE1  ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t7, winner[6])]))
    print('\t'.join([bold('DENSE2  ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t8, winner[7])]))
    print()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--reduce', type=str, required=True,
                        choices=['sum', 'mean', 'min', 'max'])
    parser.add_argument('--with_backward', action='store_true')
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()

    iters = 1 if args.device == 'cpu' else 20
    sizes = [1, 16, 32, 64, 128, 256, 512]
    sizes = sizes[:3] if args.device == 'cpu' else sizes

    for _ in range(10):  # Warmup.
        torch.randn(100, 100, device=args.device).sum()

    for dataset in itertools.chain(short_rows, long_rows):
        download(dataset)
        correctness(dataset)
        timing(dataset)
# TorchScatterConfig.cmake
# --------------------
#
# Exported targets:: Scatter
#
@PACKAGE_INIT@

set(PN TorchScatter)
set(${PN}_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@")
set(${PN}_LIBRARY "")
set(${PN}_DEFINITIONS USING_${PN})

check_required_components(${PN})

if(NOT (CMAKE_VERSION VERSION_LESS 3.0))
  #-----------------------------------------------------------------------------
  # Don't include targets if this file is being picked up by another
  # project which has already built this as a subproject
  #-----------------------------------------------------------------------------
  if(NOT TARGET ${PN}::TorchScatter)
    include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")

    if(NOT TARGET torch_library)
      find_package(Torch REQUIRED)
    endif()
    if(NOT TARGET Python3::Python)
      find_package(Python3 COMPONENTS Development)
    endif()

    target_link_libraries(TorchScatter::TorchScatter INTERFACE ${TORCH_LIBRARIES} Python3::Python)

    if(@WITH_CUDA@)
      target_compile_definitions(TorchScatter::TorchScatter INTERFACE WITH_CUDA)
    endif()
  endif()
endif()
```
./build_conda.sh 3.9 1.9.0 cu111 # python, pytorch and cuda version
```
#!/bin/bash
export PYTHON_VERSION=$1
export TORCH_VERSION=$2
export CUDA_VERSION=$3
export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*"
if [ "${CUDA_VERSION}" = "cpu" ]; then
  export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else
  case $CUDA_VERSION in
    cu113)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*"
      ;;
    cu111)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*"
      ;;
    cu102)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*"
      ;;
    cu101)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*"
      ;;
    *)
      echo "Unrecognized CUDA_VERSION=$CUDA_VERSION"
      exit 1
      ;;
  esac
fi
echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
conda build . -c nvidia -c pytorch -c default -c conda-forge --output-folder "$HOME/conda-bld"
package:
  name: pytorch-scatter
  version: 2.0.9

source:
  path: ../..

requirements:
  build:
    - {{ compiler('c') }}  # [win]

  host:
    - pip
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

  run:
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

build:
  string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }}
  script: pip install .
  script_env:
    - FORCE_CUDA
    - TORCH_CUDA_ARCH_LIST

test:
  imports:
    - torch_scatter

about:
  home: https://github.com/rusty1s/pytorch_scatter
  license: MIT
  summary: PyTorch Extension Library of Optimized Scatter Operations
@@ -135,19 +135,19 @@ static inline __device__ void atomAdd(int32_t *address, int32_t val) {
 static inline __device__ void atomAdd(int64_t *address, int64_t val) {
   AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || TORCH_HIP_VERSION < 10000)
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000)
 static inline __device__ void atomAdd(at::Half *address, at::Half val) {
   AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
 }
 #else
 static inline __device__ void atomAdd(at::Half *address, at::Half val) {
-  AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
+  atomicAdd(reinterpret_cast<__half *>(address), val);
 }
 #endif
 static inline __device__ void atomAdd(float *address, float val) {
   atomicAdd(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
 static inline __device__ void atomAdd(double *address, double val) {
   AtomicAddDecimalImpl<double, sizeof(double)>()(address, val);
 }
......
 #pragma once
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 // We need our own `IndexToOffset` implementation since we do not want to
 // access the last element of the `indexptr`.
......
-#include "hip/hip_runtime.h"
-#include "scatter_hip.h"
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/detail/IndexUtils.cuh>
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include "scatter_cuda.h"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 #include "reducer.cuh"
 #include "utils.cuh"
@@ -64,7 +63,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() == index.dim());
   for (auto i = 0; i < index.dim() - 1; i++)
......
-#include "hip/hip_runtime.h"
-#include "segment_coo_hip.h"
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/detail/IndexUtils.cuh>
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include "segment_coo_cuda.h"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 #include "reducer.cuh"
 #include "utils.cuh"
@@ -158,7 +157,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= index.dim());
@@ -331,7 +330,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= index.dim());
......
@@ -9,7 +9,3 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
 torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
                               torch::optional<torch::Tensor> optional_out);
-template<typename T>
-__device__ T __ldg(const T* ptr) {
-  return *ptr;
-}
-#include "hip/hip_runtime.h"
-#include "segment_csr_hip.h"
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/detail/IndexUtils.cuh>
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include "segment_csr_cuda.h"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 #include "index_info.cuh"
 #include "reducer.cuh"
@@ -103,7 +102,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
   CHECK_CUDA(indptr);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= indptr.dim());
@@ -223,7 +222,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
   CHECK_CUDA(indptr);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= indptr.dim());
......
@@ -9,7 +9,3 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
 torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
                               torch::optional<torch::Tensor> optional_out);
-template<typename T>
-__device__ T __ldg(const T* ptr) {
-  return *ptr;
-}