Commit c7c514c2 authored by yangzhong's avatar yangzhong

push 2.0.9 version

parent cf967b1f
cmake_minimum_required(VERSION 3.0)
project(torchscatter)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSCATTER_VERSION 2.0.9)

option(WITH_CUDA "Enable CUDA support" OFF)

if(WITH_CUDA)
  enable_language(CUDA)
  add_definitions(-D__CUDA_NO_HALF_OPERATORS__)
  add_definitions(-DWITH_CUDA)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
endif()

find_package(Python3 COMPONENTS Development)
find_package(Torch REQUIRED)

file(GLOB HEADERS csrc/scatter.h)
file(GLOB OPERATOR_SOURCES csrc/cpu/*.h csrc/cpu/*.cpp csrc/*.cpp)
if(WITH_CUDA)
  file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu)
endif()

add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python3::Python)
set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchScatter)
target_include_directories(${PROJECT_NAME} INTERFACE
  $<BUILD_INTERFACE:${HEADERS}>
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

set(TORCHSCATTER_CMAKECONFIG_INSTALL_DIR "share/cmake/TorchScatter" CACHE STRING "install path for TorchScatterConfig.cmake")

configure_package_config_file(cmake/TorchScatterConfig.cmake.in
  "${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfig.cmake"
  INSTALL_DESTINATION ${TORCHSCATTER_CMAKECONFIG_INSTALL_DIR})
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfigVersion.cmake
  VERSION ${TORCHSCATTER_VERSION}
  COMPATIBILITY AnyNewerVersion)

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfig.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/TorchScatterConfigVersion.cmake
        DESTINATION ${TORCHSCATTER_CMAKECONFIG_INSTALL_DIR})
install(TARGETS ${PROJECT_NAME}
        EXPORT TorchScatterTargets
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
install(EXPORT TorchScatterTargets
        NAMESPACE TorchScatter::
        DESTINATION ${TORCHSCATTER_CMAKECONFIG_INSTALL_DIR})

install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME})
install(FILES
        csrc/cpu/scatter_cpu.h
        csrc/cpu/segment_coo_cpu.h
        csrc/cpu/segment_csr_cpu.h
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu)
if(WITH_CUDA)
  install(FILES
          csrc/cuda/scatter_cuda.h
          csrc/cuda/segment_coo_cuda.h
          csrc/cuda/segment_csr_cuda.h
          DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
endif()

if(WITH_CUDA)
  set_property(TARGET torch_cuda PROPERTY INTERFACE_COMPILE_OPTIONS "")
  set_property(TARGET torch_cpu PROPERTY INTERFACE_COMPILE_OPTIONS "")
endif()
Metadata-Version: 2.1
Name: torch_scatter
Version: 2.0.9
Summary: PyTorch Extension Library of Optimized Scatter Operations
Home-page: https://github.com/rusty1s/pytorch_scatter
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: MIT
Description: UNKNOWN
Keywords: pytorch,scatter,segment,gather
Platform: UNKNOWN
Requires-Python: >=3.6
Provides-Extra: test
# <div align="center"><strong>torch-scatter-2.0.9</strong></div>

## Introduction

torch-scatter is a small extension library for PyTorch that provides highly optimized sparse update (scatter and segment) operations, i.e. reduce operations over a tensor driven by a "group-index" tensor, which are missing from the main PyTorch package. It is a core dependency of graph learning stacks such as PyTorch Geometric.

## Dependencies

+ PyTorch 1.10 or PyTorch 1.13 and the matching torchvision (dtk-22.04.2, dtk-23.04 and dtk-23.10 are recommended)
+ Python 3.7-3.10

### 1. Installing by building from source

#### Preparing the build environment

Two ways to prepare the environment are provided:

1. Use the SourceFind DCU PyTorch base image: download it from [https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch), choosing the image that matches your PyTorch, Python, dtk and OS versions.
2. Use an existing Python environment: install PyTorch and torchvision from the wheel directories [https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch) and [https://cancon.hpccube.com:65024/4/main/vision](https://cancon.hpccube.com:65024/4/main/vision), choosing the wheels that match your Python and dtk versions. Install them with:

```shell
pip install torch*        # the downloaded torch wheel
pip install torchvision*  # the downloaded torchvision wheel
pip install setuptools==59.5.0 wheel
```

#### Building and installing from source

- Download the code:

```shell
git clone http://developer.hpccube.com/codes/aicomponent/torch-scatter  # switch to the branch you need
```

- Build from source (inside the torch-scatter directory):

```shell
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=$ROCM_PATH/rocrand/lib:$LD_LIBRARY_PATH

export FORCE_ONLY_HIP=1
export CC=hipcc
export CXX=hipcc

python setup.py install
```

- Alternatively, compile the python library as a wheel and install it:

```shell
python setup.py bdist_wheel
pip install dist/*.whl
```

#### Notes

+ If downloads via pip are slow, add the Tsinghua PyPI mirror: `-i https://pypi.tuna.tsinghua.edu.cn/simple/`
+ `ROCM_PATH` is the dtk installation path, `/opt/dtk` by default.

## Verification

```python
import torch
from torch_scatter import scatter_max

src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
out, argmax = scatter_max(src, index, dim=-1)

print(out)
# tensor([[0, 0, 4, 3, 2, 0],
#         [2, 4, 3, 0, 0, 0]])

print(argmax)
# tensor([[5, 5, 3, 4, 0, 1],
#         [1, 4, 3, 5, 5, 5]])
```

## Known Issues

- The library has not been adapted for CPU-only environments and supports DCU only; run it on a machine with a DCU card.
- To use the full set of PyG features, additionally run `pip install torch-geometric`.

## References

- [README_ORIGIN](README_ORIGIN.md)
- [https://pypi.org/project/torch-scatter/2.0.9/](https://pypi.org/project/torch-scatter/2.0.9/)
[pypi-image]: https://badge.fury.io/py/torch-scatter.svg
[pypi-url]: https://pypi.python.org/pypi/torch-scatter
[testing-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml
[docs-image]: https://readthedocs.org/projects/pytorch-scatter/badge/?version=latest
[docs-url]: https://pytorch-scatter.readthedocs.io/en/latest/?badge=latest
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_scatter/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_scatter?branch=master
# PyTorch Scatter
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Docs Status][docs-image]][docs-url]
[![Code Coverage][coverage-image]][coverage-url]
<p align="center">
<img width="50%" src="https://raw.githubusercontent.com/rusty1s/pytorch_scatter/master/docs/source/_figures/add.svg?sanitize=true" />
</p>
--------------------------------------------------------------------------------
**[Documentation](https://pytorch-scatter.readthedocs.io)**
This package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for use in [PyTorch](http://pytorch.org/), which are missing in the main package.
Scatter and segment operations can be roughly described as reduce operations based on a given "group-index" tensor.
Segment operations require the "group-index" tensor to be sorted, whereas scatter operations are not subject to these requirements.
The package consists of the following operations with reduction types `"sum"|"mean"|"min"|"max"`:
* [**scatter**](https://pytorch-scatter.readthedocs.io/en/latest/functions/scatter.html) based on arbitrary indices
* [**segment_coo**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_coo.html) based on sorted indices
* [**segment_csr**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_csr.html) based on compressed indices via pointers
In addition, we provide the following **composite functions** which make use of `scatter_*` operations under the hood: `scatter_std`, `scatter_logsumexp`, `scatter_softmax` and `scatter_log_softmax`.
All included operations are broadcastable, work on varying data types, are implemented both for CPU and GPU with corresponding backward implementations, and are fully traceable.
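To make the three entry points and the composite functions concrete, here is a minimal sketch (not part of the upstream README; the tensors and values are invented for illustration) that reduces the same five values with all three operations and applies one composite on top:

```python
import torch
from torch_scatter import scatter, segment_coo, segment_csr, scatter_softmax

# Five values grouped into three buckets. The index is already sorted,
# so all three entry points apply and must produce the same reduction.
src = torch.tensor([1., 2., 3., 4., 5.])
index = torch.tensor([0, 0, 1, 1, 2])    # "group-index" per element
indptr = torch.tensor([0, 2, 4, 5])      # CSR-style pointers into src

out_scatter = scatter(src, index, dim=0, reduce="sum")  # arbitrary indices
out_coo = segment_coo(src, index, reduce="sum")         # sorted indices
out_csr = segment_csr(src, indptr, reduce="sum")        # compressed pointers

assert torch.allclose(out_scatter, out_coo) and torch.allclose(out_scatter, out_csr)
print(out_scatter)                         # tensor([3., 7., 5.])
print(scatter_softmax(src, index, dim=0))  # softmax computed independently per group
```

Because the index is sorted, `segment_coo` and `segment_csr` can exploit the contiguous group layout; the benchmark scripts included further down in this commit compare exactly these code paths.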
## Installation
### Anaconda
**Update:** You can now install `pytorch-scatter` via [Anaconda](https://anaconda.org/pyg/pytorch-scatter) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-scatter -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.10.0
To install the binaries for PyTorch 1.10.0, simply run
```
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 1.9.0/1.9.1
To install the binaries for PyTorch 1.9.0 and 1.9.1, simply run
```
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.9.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, or `cu111` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1 and PyTorch 1.8.0/1.8.1 (following the same procedure).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-scatter
```
When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
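As a quick smoke test after installation (a minimal sketch, not taken from this repository; the values are arbitrary), import the package and run a single reduction before moving on to the full example below:

```python
import torch
import torch_scatter
from torch_scatter import scatter

print(torch_scatter.__version__)                 # e.g. 2.0.9
src = torch.tensor([1., 2., 3.])
index = torch.tensor([0, 0, 1])
print(scatter(src, index, dim=0, reduce="sum"))  # tensor([3., 3.])
```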
## Example
```py
import torch
from torch_scatter import scatter_max
src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
out, argmax = scatter_max(src, index, dim=-1)
```
```
print(out)
tensor([[0, 0, 4, 3, 2, 0],
[2, 4, 3, 0, 0, 0]])
print(argmax)
tensor([[5, 5, 3, 4, 0, 1],
[1, 4, 3, 5, 5, 5]])
```
## Running tests
```
python setup.py test
```
## C++ API
`torch-scatter` also offers a C++ API that contains C++ equivalents of the Python functions.
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support if needed
cmake ..
make
make install
```
import time
import itertools
import argparse

import torch
from scipy.io import loadmat

from torch_scatter import gather_coo, gather_csr
from scatter_segment import short_rows, long_rows, download, bold


@torch.no_grad()
def correctness(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    dim_size = rowptr.size(0) - 1

    for size in sizes[1:]:
        try:
            x = torch.randn((dim_size, size), device=args.device)
            x = x.squeeze(-1) if size == 1 else x

            out1 = x.index_select(0, row)
            out2 = gather_coo(x, row)
            out3 = gather_csr(x, rowptr)

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()


def time_func(func, x):
    try:
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t = time.perf_counter()

        if not args.with_backward:
            with torch.no_grad():
                for _ in range(iters):
                    func(x)
        else:
            x = x.requires_grad_()
            for _ in range(iters):
                out = func(x)
                torch.autograd.grad(out, x, out, only_inputs=True)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter() - t
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise RuntimeError(e)
        torch.cuda.empty_cache()
        return float('inf')


def timing(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    dim_size = rowptr.size(0) - 1
    avg_row_len = row.size(0) / dim_size

    def select(x):
        return x.index_select(0, row)

    def gather(x):
        return x.gather(0, row.view(-1, 1).expand(-1, x.size(1)))

    def gat_coo(x):
        return gather_coo(x, row)

    def gat_csr(x):
        return gather_csr(x, rowptr)

    t1, t2, t3, t4 = [], [], [], []
    for size in sizes:
        try:
            x = torch.randn((dim_size, size), device=args.device)

            t1 += [time_func(select, x)]
            t2 += [time_func(gather, x)]
            t3 += [time_func(gat_coo, x)]
            t4 += [time_func(gat_csr, x)]

            del x
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()
            for t in (t1, t2, t3, t4):
                t.append(float('inf'))

    ts = torch.tensor([t1, t2, t3, t4])
    winner = torch.zeros_like(ts, dtype=torch.bool)
    winner[ts.argmin(dim=0), torch.arange(len(sizes))] = 1
    winner = winner.tolist()

    name = f'{group}/{name}'
    print(f'{bold(name)} (avg row length: {avg_row_len:.2f}):')
    print('\t'.join([' '] + [f'{size:>5}' for size in sizes]))
    print('\t'.join([bold('SELECT ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t1, winner[0])]))
    print('\t'.join([bold('GAT    ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t2, winner[1])]))
    print('\t'.join([bold('GAT_COO')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t3, winner[2])]))
    print('\t'.join([bold('GAT_CSR')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t4, winner[3])]))
    print()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--with_backward', action='store_true')
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()

    iters = 1 if args.device == 'cpu' else 20
    sizes = [1, 16, 32, 64, 128, 256, 512]
    sizes = sizes[:3] if args.device == 'cpu' else sizes

    for _ in range(10):  # Warmup.
        torch.randn(100, 100, device=args.device).sum()

    for dataset in itertools.chain(short_rows, long_rows):
        download(dataset)
        correctness(dataset)
        timing(dataset)
import time
import os.path as osp
import itertools
import argparse

import wget
import torch
from scipy.io import loadmat

from torch_scatter import scatter, segment_coo, segment_csr

short_rows = [
    ('DIMACS10', 'citationCiteseer'),
    ('SNAP', 'web-Stanford'),
]
long_rows = [
    ('Janna', 'StocF-1465'),
    ('GHS_psdef', 'ldoor'),
]


def download(dataset):
    url = 'https://sparse.tamu.edu/mat/{}/{}.mat'
    for group, name in itertools.chain(long_rows, short_rows):
        if not osp.exists(f'{name}.mat'):
            print(f'Downloading {group}/{name}:')
            wget.download(url.format(group, name))
            print('')


def bold(text, flag=True):
    return f'\033[1m{text}\033[0m' if flag else text


@torch.no_grad()
def correctness(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    dim_size = rowptr.size(0) - 1

    for size in sizes:
        try:
            x = torch.randn((row.size(0), size), device=args.device)
            x = x.squeeze(-1) if size == 1 else x

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='add')
            out2 = segment_coo(x, row, dim_size=dim_size, reduce='add')
            out3 = segment_csr(x, rowptr, reduce='add')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='mean')
            out2 = segment_coo(x, row, dim_size=dim_size, reduce='mean')
            out3 = segment_csr(x, rowptr, reduce='mean')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='min')
            out2 = segment_coo(x, row, reduce='min')
            out3 = segment_csr(x, rowptr, reduce='min')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)

            out1 = scatter(x, row, dim=0, dim_size=dim_size, reduce='max')
            out2 = segment_coo(x, row, reduce='max')
            out3 = segment_csr(x, rowptr, reduce='max')

            assert torch.allclose(out1, out2, atol=1e-4)
            assert torch.allclose(out1, out3, atol=1e-4)
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()


def time_func(func, x):
    try:
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t = time.perf_counter()

        if not args.with_backward:
            with torch.no_grad():
                for _ in range(iters):
                    func(x)
        else:
            x = x.requires_grad_()
            for _ in range(iters):
                out = func(x)
                out = out[0] if isinstance(out, tuple) else out
                torch.autograd.grad(out, x, out, only_inputs=True)

        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.perf_counter() - t
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise RuntimeError(e)
        torch.cuda.empty_cache()
        return float('inf')


def timing(dataset):
    group, name = dataset
    mat = loadmat(f'{name}.mat')['Problem'][0][0][2].tocsr()
    rowptr = torch.from_numpy(mat.indptr).to(args.device, torch.long)
    row = torch.from_numpy(mat.tocoo().row).to(args.device, torch.long)
    row2 = row[torch.randperm(row.size(0))]
    dim_size = rowptr.size(0) - 1
    avg_row_len = row.size(0) / dim_size

    def sca1_row(x):
        out = x.new_zeros(dim_size, *x.size()[1:])
        row_tmp = row.view(-1, 1).expand_as(x) if x.dim() > 1 else row
        return out.scatter_add_(0, row_tmp, x)

    def sca1_col(x):
        out = x.new_zeros(dim_size, *x.size()[1:])
        row2_tmp = row2.view(-1, 1).expand_as(x) if x.dim() > 1 else row2
        return out.scatter_add_(0, row2_tmp, x)

    def sca2_row(x):
        return scatter(x, row, dim=0, dim_size=dim_size, reduce=args.reduce)

    def sca2_col(x):
        return scatter(x, row2, dim=0, dim_size=dim_size, reduce=args.reduce)

    def seg_coo(x):
        return segment_coo(x, row, reduce=args.reduce)

    def seg_csr(x):
        return segment_csr(x, rowptr, reduce=args.reduce)

    def dense1(x):
        return getattr(torch, args.reduce)(x, dim=-2)

    def dense2(x):
        return getattr(torch, args.reduce)(x, dim=-1)

    t1, t2, t3, t4, t5, t6, t7, t8 = [], [], [], [], [], [], [], []
    for size in sizes:
        try:
            x = torch.randn((row.size(0), size), device=args.device)
            x = x.squeeze(-1) if size == 1 else x

            t1 += [time_func(sca1_row, x)]
            t2 += [time_func(sca1_col, x)]
            t3 += [time_func(sca2_row, x)]
            t4 += [time_func(sca2_col, x)]
            t5 += [time_func(seg_coo, x)]
            t6 += [time_func(seg_csr, x)]

            del x
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()
            for t in (t1, t2, t3, t4, t5, t6):
                t.append(float('inf'))

        try:
            x = torch.randn((dim_size, int(avg_row_len + 1), size),
                            device=args.device)
            t7 += [time_func(dense1, x)]
            x = x.view(dim_size, size, int(avg_row_len + 1))
            t8 += [time_func(dense2, x)]
            del x
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise RuntimeError(e)
            torch.cuda.empty_cache()
            for t in (t7, t8):
                t.append(float('inf'))

    ts = torch.tensor([t1, t2, t3, t4, t5, t6, t7, t8])
    winner = torch.zeros_like(ts, dtype=torch.bool)
    winner[ts.argmin(dim=0), torch.arange(len(sizes))] = 1
    winner = winner.tolist()

    name = f'{group}/{name}'
    print(f'{bold(name)} (avg row length: {avg_row_len:.2f}):')
    print('\t'.join([' '] + [f'{size:>5}' for size in sizes]))
    print('\t'.join([bold('SCA1_ROW')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t1, winner[0])]))
    print('\t'.join([bold('SCA1_COL')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t2, winner[1])]))
    print('\t'.join([bold('SCA2_ROW')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t3, winner[2])]))
    print('\t'.join([bold('SCA2_COL')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t4, winner[3])]))
    print('\t'.join([bold('SEG_COO ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t5, winner[4])]))
    print('\t'.join([bold('SEG_CSR ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t6, winner[5])]))
    print('\t'.join([bold('DENSE1  ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t7, winner[6])]))
    print('\t'.join([bold('DENSE2  ')] +
                    [bold(f'{t:.5f}', f) for t, f in zip(t8, winner[7])]))
    print()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--reduce', type=str, required=True,
                        choices=['sum', 'mean', 'min', 'max'])
    parser.add_argument('--with_backward', action='store_true')
    parser.add_argument('--device', type=str, default='cuda')
    args = parser.parse_args()

    iters = 1 if args.device == 'cpu' else 20
    sizes = [1, 16, 32, 64, 128, 256, 512]
    sizes = sizes[:3] if args.device == 'cpu' else sizes

    for _ in range(10):  # Warmup.
        torch.randn(100, 100, device=args.device).sum()

    for dataset in itertools.chain(short_rows, long_rows):
        download(dataset)
        correctness(dataset)
        timing(dataset)
# TorchScatterConfig.cmake
# --------------------
#
# Exported targets:: Scatter
#
@PACKAGE_INIT@

set(PN TorchScatter)
set(${PN}_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@")
set(${PN}_LIBRARY "")
set(${PN}_DEFINITIONS USING_${PN})

check_required_components(${PN})

if(NOT (CMAKE_VERSION VERSION_LESS 3.0))
  #-----------------------------------------------------------------------------
  # Don't include targets if this file is being picked up by another
  # project which has already built this as a subproject
  #-----------------------------------------------------------------------------
  if(NOT TARGET ${PN}::TorchScatter)
    include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")

    if(NOT TARGET torch_library)
      find_package(Torch REQUIRED)
    endif()
    if(NOT TARGET Python3::Python)
      find_package(Python3 COMPONENTS Development)
    endif()

    target_link_libraries(TorchScatter::TorchScatter INTERFACE ${TORCH_LIBRARIES} Python3::Python)

    if(@WITH_CUDA@)
      target_compile_definitions(TorchScatter::TorchScatter INTERFACE WITH_CUDA)
    endif()
  endif()
endif()
```
./build_conda.sh 3.9 1.9.0 cu111 # python, pytorch and cuda version
```
#!/bin/bash
export PYTHON_VERSION=$1
export TORCH_VERSION=$2
export CUDA_VERSION=$3
export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*"
if [ "${CUDA_VERSION}" = "cpu" ]; then
  export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else
  case $CUDA_VERSION in
    cu113)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*"
      ;;
    cu111)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*"
      ;;
    cu102)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*"
      ;;
    cu101)
      export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*"
      ;;
    *)
      echo "Unrecognized CUDA_VERSION=$CUDA_VERSION"
      exit 1
      ;;
  esac
fi
echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
conda build . -c nvidia -c pytorch -c default -c conda-forge --output-folder "$HOME/conda-bld"
package:
  name: pytorch-scatter
  version: 2.0.9

source:
  path: ../..

requirements:
  build:
    - {{ compiler('c') }}  # [win]

  host:
    - pip
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

  run:
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

build:
  string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }}
  script: pip install .
  script_env:
    - FORCE_CUDA
    - TORCH_CUDA_ARCH_LIST

test:
  imports:
    - torch_scatter

about:
  home: https://github.com/rusty1s/pytorch_scatter
  license: MIT
  summary: PyTorch Extension Library of Optimized Scatter Operations
@@ -135,19 +135,19 @@ static inline __device__ void atomAdd(int32_t *address, int32_t val) {
 static inline __device__ void atomAdd(int64_t *address, int64_t val) {
   AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || TORCH_HIP_VERSION < 10000)
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || CUDA_VERSION < 10000)
 static inline __device__ void atomAdd(at::Half *address, at::Half val) {
   AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
 }
 #else
 static inline __device__ void atomAdd(at::Half *address, at::Half val) {
-  AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
+  atomicAdd(reinterpret_cast<__half *>(address), val);
 }
 #endif
 static inline __device__ void atomAdd(float *address, float val) {
   atomicAdd(address, val);
 }
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
 static inline __device__ void atomAdd(double *address, double val) {
   AtomicAddDecimalImpl<double, sizeof(double)>()(address, val);
 }
......
 #pragma once
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 // We need our own `IndexToOffset` implementation since we do not want to
 // access the last element of the `indexptr`.
......
-#include "hip/hip_runtime.h"
-#include "scatter_hip.h"
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/detail/IndexUtils.cuh>
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include "scatter_cuda.h"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 #include "reducer.cuh"
 #include "utils.cuh"
@@ -64,7 +63,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() == index.dim());
   for (auto i = 0; i < index.dim() - 1; i++)
......
-#include "hip/hip_runtime.h"
-#include "segment_coo_hip.h"
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/detail/IndexUtils.cuh>
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include "segment_coo_cuda.h"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 #include "reducer.cuh"
 #include "utils.cuh"
@@ -158,7 +157,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= index.dim());
@@ -331,7 +330,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
   CHECK_CUDA(index);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= index.dim());
......
@@ -9,7 +9,3 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
 torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
                               torch::optional<torch::Tensor> optional_out);
-template<typename T>
-__device__ T __ldg(const T* ptr) {
-  return *ptr;
-}
-#include "hip/hip_runtime.h"
-#include "segment_csr_hip.h"
-#include <ATen/hip/HIPContext.h>
-#include <ATen/hip/detail/IndexUtils.cuh>
-#include <ATen/hip/detail/TensorInfo.cuh>
+#include "segment_csr_cuda.h"
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/cuda/detail/TensorInfo.cuh>
 #include "index_info.cuh"
 #include "reducer.cuh"
@@ -103,7 +102,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
   CHECK_CUDA(indptr);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= indptr.dim());
@@ -223,7 +222,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
   CHECK_CUDA(indptr);
   if (optional_out.has_value())
     CHECK_CUDA(optional_out.value());
-  hipSetDevice(src.get_device());
+  cudaSetDevice(src.get_device());
   CHECK_INPUT(src.dim() >= indptr.dim());
......
@@ -9,7 +9,3 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
 torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
                               torch::optional<torch::Tensor> optional_out);
-template<typename T>
-__device__ T __ldg(const T* ptr) {
-  return *ptr;
-}