Commit 70d03142 authored by limm

support v1.2.2

parent 378d2b88
cmake_minimum_required(VERSION 3.0)
project(torchsplineconv)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSPLINECONV_VERSION 1.2.1)
set(TORCHSPLINECONV_VERSION 1.2.2)
option(WITH_CUDA "Enable CUDA support" OFF)
......@@ -9,7 +9,7 @@ if(WITH_CUDA)
enable_language(CUDA)
add_definitions(-D__CUDA_NO_HALF_OPERATORS__)
add_definitions(-DWITH_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_35 --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
endif()
find_package(Python3 COMPONENTS Development)
......
Metadata-Version: 1.2
Name: torch_spline_conv
Version: 1.2.1
Summary: Implementation of the Spline-Based Convolution Operator of SplineCNN in PyTorch
Home-page: https://github.com/rusty1s/pytorch_spline_conv
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: MIT
Description: UNKNOWN
Keywords: pytorch,geometric-deep-learning,graph-neural-networks,spline-cnn
Platform: UNKNOWN
Requires-Python: >=3.6
# <div align="center"><strong>PyTorch Spline Conv</strong></div>
## Introduction
PyTorch Spline Conv is a PyTorch-based package that implements the spline convolution operator used in graph convolutional neural networks. Graph convolutional neural networks are deep-learning models that operate on graph-structured data and are suited to tasks such as node classification, graph classification, and graph generation. The PyTorch Spline Conv build in the DAS software stack not only keeps the component's core functionality available on DCU accelerator cards, but is also deeply customized and optimized for the DCU hardware architecture, so developers can migrate applications to DCU accelerators and gain performance at very low cost. It currently supports PyTorch 1.13, PyTorch 2.1, and PyTorch 2.3.
PyTorch Spline Conv is a PyTorch-based package that implements the spline convolution operator used in graph convolutional neural networks. Graph convolutional neural networks are deep-learning models that operate on graph-structured data and are suited to tasks such as node classification, graph classification, and graph generation. The PyTorch Spline Conv build in the DAS software stack not only keeps the component's core functionality available on DCU accelerator cards, but is also deeply customized and optimized for the DCU hardware architecture, so developers can migrate applications to DCU accelerators and gain performance at very low cost. It currently supports PyTorch 1.13, PyTorch 2.1, PyTorch 2.4.1, and PyTorch 2.5.1.
### Install via pip
pytorch-spline-conv whl package download directory: [http://10.6.10.68:8000/customized/torch-spline-conv/dtk2310](http://10.6.10.68:8000/customized/torch-spline-conv/dtk2310). Only a Python 3.8 package is currently provided.
pytorch-spline-conv whl package download directory: [https://das.sourcefind.cn:55011/portal/#/home](https://das.sourcefind.cn:55011/portal/#/home).
```shell
pip install torch_spline_conv* (the downloaded torch_spline_conv whl package)
```
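Once the whl package is installed, a quick sanity check can confirm that the operator runs; the following is a minimal sketch that reuses the shapes from the usage example further below (move the tensors to the accelerator with `.cuda()` only if you installed a GPU/DCU build):
```python
import torch
from torch_spline_conv import spline_conv

# Small random graph: 4 nodes with 2 features each, 6 edges with 2D pseudo-coordinates.
x = torch.rand(4, 2)
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]])
pseudo = torch.rand(6, 2)
weight = torch.rand(25, 2, 4)              # kernel_size.prod() x in_channels x out_channels
kernel_size = torch.tensor([5, 5])
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8)

out = spline_conv(x, edge_index, pseudo, weight, kernel_size, is_open_spline)
print(out.shape)  # expected: torch.Size([4, 4])
```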
......@@ -17,23 +17,28 @@ pip install 'urllib3==1.26.14'
pip install pytest
pip install wheel
```
- Download dtk23.10 from Home | 光合开发者社区, extract it under /opt/, and create a soft link, for example
- Download dtk25.04 from Home | 光合开发者社区, extract it under /opt/, and create a soft link, for example
```shell
cd /opt
wget http://10.6.10.68:8000/dtk-release/dtk23.10/CentOS7.6/DTK-23.10-CentOS7.6-x86_64.tar.gz
tar -zxvf DTK-23.10-CentOS7.6-x86_64.tar.gz
ln -s dtk-23.10 dtk
source /opt/dtk/env.sh
cd /opt && ln -s dtk-25.04 dtk
```
- Install pytorch. pytorch whl package download directory: [http://10.6.10.68:8000/debug/pytorch/dtk23.10/hipify](http://10.6.10.68:8000/debug/pytorch/dtk23.10/hipify). Download the version that matches your needs and install it as follows:
- Install pytorch. pytorch whl package download directory: [https://das.sourcefind.cn:55011/portal/#/home](https://das.sourcefind.cn:55011/portal/#/home). Download the version that matches your needs and install it as follows:
```shell
pip install torch* (the downloaded torch whl package)
```
- Install fastpt. fastpt whl package download directory: [https://das.sourcefind.cn:55011/portal/#/home](https://das.sourcefind.cn:55011/portal/#/home). Download the version that matches your needs and install it as follows:
```shell
pip install fastpt* (the downloaded fastpt whl package)
```
#### Download, build, and install from source
```shell
git clone -b 1.2.1-release http://developer.hpccube.com/codes/aicomponent/torch-spline-conv.git
python pymap_script.py /path/to/torch-spline-conv
git clone -b 1.2.2-fastpt http://developer.hpccube.com/codes/aicomponent/torch-spline-conv.git
export FORCE_CUDA=1
source /usr/local/bin/fastpt -C
cd torch-spline-conv
python setup.py bdist_wheel
pip install dist/*.whl
......@@ -43,6 +48,7 @@ pip install dist/*.whl
```shell
cd torch-spline-conv
python setup.py test
pytest
```
## Known Issue
......@@ -52,6 +58,5 @@ find / -name "_version_cpu.so"
cd /torch_spline_conv/torch_spline_conv
ln -s /usr/local/lib/python3.8/site-packages/torch_spline_conv/* .
```
## References
[https://github.com/rusty1s/pytorch_spline_conv](https://github.com/rusty1s/pytorch_spline_conv)
[pypi-image]: https://badge.fury.io/py/torch-spline-conv.svg
[pypi-url]: https://pypi.python.org/pypi/torch-spline-conv
[build-image]: https://travis-ci.org/rusty1s/pytorch_spline_conv.svg?branch=master
[build-url]: https://travis-ci.org/rusty1s/pytorch_spline_conv
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_spline_conv/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_spline_conv?branch=master
# Spline-Based Convolution Operator of SplineCNN
[![PyPI Version][pypi-image]][pypi-url]
[![Build Status][build-image]][build-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This is a PyTorch implementation of the spline-based convolution operator of SplineCNN, as described in our paper:
Matthias Fey, Jan Eric Lenssen, Frank Weichert, Heinrich Müller: [SplineCNN: Fast Geometric Deep Learning with Continuous B-Spline Kernels](https://arxiv.org/abs/1711.08920) (CVPR 2018)
The operator works on all floating point data types and is implemented both for CPU and GPU.
## Installation
### Binaries
We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://pytorch-geometric.com/whl).
#### PyTorch 1.7.0
To install the binaries for PyTorch 1.7.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, `cu102`, or `cu110` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` | `cu110` |
|-------------|-------|--------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | | |
#### PyTorch 1.6.0
To install the binaries for PyTorch 1.6.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.6.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` |
|-------------|-------|--------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0 and PyTorch 1.5.0 (following the same procedure).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-spline-conv
```
When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Usage
```python
from torch_spline_conv import spline_conv
out = spline_conv(x,
                  edge_index,
                  pseudo,
                  weight,
                  kernel_size,
                  is_open_spline,
                  degree=1,
                  norm=True,
                  root_weight=None,
                  bias=None)
```
Applies the spline-based convolution operator
<p align="center">
<img width="50%" src="https://user-images.githubusercontent.com/6945922/38684093-36d9c52e-3e6f-11e8-9021-db054223c6b9.png" />
</p>
over several node features of an input graph.
The kernel function is defined over the weighted B-spline tensor product basis, as shown below for different B-spline degrees.
<p align="center">
<img width="45%" src="https://user-images.githubusercontent.com/6945922/38685443-3a2a0c68-3e72-11e8-8e13-9ce9ad8fe43e.png" />
<img width="45%" src="https://user-images.githubusercontent.com/6945922/38685459-42b2bcae-3e72-11e8-88cc-4b61e41dbd93.png" />
</p>
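For readers who cannot see the images above, a rough sketch of the operator in the notation of the paper (lightly simplified) is:

$$(f \star g)(i) = \frac{1}{|\mathcal{N}(i)|}\sum_{l=1}^{M_{\mathrm{in}}}\sum_{j\in\mathcal{N}(i)} f_l(j)\cdot g_l(\mathbf{u}(i,j)), \qquad g_l(\mathbf{u}) = \sum_{\mathbf{p}\in\mathcal{P}} w_{\mathbf{p},l}\,\prod_{d=1}^{D} N^m_{d,p_d}(u_d),$$

where $\mathbf{u}(i,j)$ are the pseudo-coordinates of edge $(i,j)$, $\mathcal{P}$ indexes the B-spline tensor-product basis functions, and $N^m_{d,p_d}$ is the degree-$m$ B-spline basis in dimension $d$.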
### Parameters
* **x** *(Tensor)* - Input node features of shape `(number_of_nodes x in_channels)`.
* **edge_index** *(LongTensor)* - Graph edges, given by source and target indices, of shape `(2 x number_of_edges)`.
* **pseudo** *(Tensor)* - Edge attributes, i.e., pseudo-coordinates, of shape `(number_of_edges x number_of_edge_attributes)` in the fixed interval [0, 1].
* **weight** *(Tensor)* - Trainable weight parameters of shape `(kernel_size x in_channels x out_channels)`.
* **kernel_size** *(LongTensor)* - Number of trainable weight parameters in each edge dimension.
* **is_open_spline** *(ByteTensor)* - Whether to use open or closed B-spline bases for each dimension.
* **degree** *(int, optional)* - B-spline basis degree. (default: `1`)
* **norm** *(bool, optional)* - Whether to normalize output by node degree. (default: `True`)
* **root_weight** *(Tensor, optional)* - Additional shared trainable parameters for each feature of the root node of shape `(in_channels x out_channels)`. (default: `None`)
* **bias** *(Tensor, optional)* - Optional bias of shape `(out_channels)`. (default: `None`)
### Returns
* **out** *(Tensor)* - Out node features of shape `(number_of_nodes x out_channels)`.
### Example
```python
import torch
from torch_spline_conv import spline_conv
x = torch.rand((4, 2), dtype=torch.float) # 4 nodes with 2 features each
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]]) # 6 edges
pseudo = torch.rand((6, 2), dtype=torch.float) # two-dimensional edge attributes
weight = torch.rand((25, 2, 4), dtype=torch.float) # 25 parameters for in_channels x out_channels
kernel_size = torch.tensor([5, 5]) # 5 parameters in each edge dimension
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8) # only use open B-splines
degree = 1 # B-spline degree of 1
norm = True # Normalize output by node degree.
root_weight = torch.rand((2, 4), dtype=torch.float) # separately weight root nodes
bias = None # do not apply an additional bias
out = spline_conv(x, edge_index, pseudo, weight, kernel_size,
                  is_open_spline, degree, norm, root_weight, bias)
print(out.size())
torch.Size([4, 4]) # 4 nodes with 4 features each
```
## Cite
Please cite our paper if you use this code in your own work:
```
@inproceedings{Fey/etal/2018,
title={{SplineCNN}: Fast Geometric Deep Learning with Continuous {B}-Spline Kernels},
author={Fey, Matthias and Lenssen, Jan Eric and Weichert, Frank and M{\"u}ller, Heinrich},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2018},
}
```
## Running tests
```
python setup.py test
```
## C++ API
`torch-spline-conv` also offers a C++ API that contains C++ equivalents of the Python models.
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support if needed
cmake ..
make
make install
```
[pypi-image]: https://badge.fury.io/py/torch-spline-conv.svg
[pypi-url]: https://pypi.python.org/pypi/torch-spline-conv
[build-image]: https://travis-ci.org/rusty1s/pytorch_spline_conv.svg?branch=master
[build-url]: https://travis-ci.org/rusty1s/pytorch_spline_conv
[testing-image]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_spline_conv/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_spline_conv?branch=master
# Spline-Based Convolution Operator of SplineCNN
[![PyPI Version][pypi-image]][pypi-url]
[![Build Status][build-image]][build-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
......@@ -21,43 +24,54 @@ The operator works on all floating point data types and is implemented both for
## Installation
### Anaconda
**Update:** You can now install `pytorch-spline-conv` via [Anaconda](https://anaconda.org/pyg/pytorch-spline-conv) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-spline-conv -c pyg
```
### Binaries
We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://pytorch-geometric.com/whl).
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.7.0
#### PyTorch 2.0
To install the binaries for PyTorch 1.7.0, simply run
To install the binaries for PyTorch 2.0.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+${CUDA}.html
pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, `cu102`, or `cu110` depending on your PyTorch installation.
where `${CUDA}` should be replaced by either `cpu`, `cu117`, or `cu118` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` | `cu110` |
|-------------|-------|--------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | | |
| | `cpu` | `cu117` | `cu118` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 1.6.0
#### PyTorch 1.13
To install the binaries for PyTorch 1.6.0, simply run
To install the binaries for PyTorch 1.13.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.6.0+${CUDA}.html
pip install torch-spline-conv -f https://data.pyg.org/whl/torch-1.13.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` depending on your PyTorch installation.
where `${CUDA}` should be replaced by either `cpu`, `cu116`, or `cu117` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` |
|-------------|-------|--------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ |
| **macOS** | ✅ | | | |
| | `cpu` | `cu116` | `cu117` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0 and PyTorch 1.5.0 (following the same procedure).
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0 and PyTorch 1.12.0/1.12.1 (following the same procedure).
For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
......@@ -173,7 +187,7 @@ Please cite our paper if you use this code in your own work:
## Running tests
```
python setup.py test
pytest
```
## C++ API
......
```
./build_conda.sh 3.9 2.0.0 cu117 # python, pytorch and cuda version
```
#!/bin/bash
export PYTHON_VERSION=$1
export TORCH_VERSION=$2
export CUDA_VERSION=$3
export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*"
if [ "${CUDA_VERSION}" = "cpu" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else
case $CUDA_VERSION in
cu118)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.8.*"
;;
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.7.*"
;;
cu116)
if [ "${TORCH_VERSION}" = "1.12.0" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
else
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.6.*"
fi
;;
cu115)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*"
;;
cu113)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*"
;;
cu111)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*"
;;
cu102)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*"
;;
cu101)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*"
;;
*)
echo "Unrecognized CUDA_VERSION=$CUDA_VERSION"
exit 1
;;
esac
fi
echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
if [ "${TORCH_VERSION}" = "1.12.0" ] && [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
else
conda build . -c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld"
fi
package:
  name: pytorch-spline-conv
  version: 1.2.2

source:
  path: ../..

requirements:
  build:
    - {{ compiler('c') }}  # [win]

  host:
    - pip
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

  run:
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

build:
  string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }}
  script: pip install .
  script_env:
    - FORCE_CUDA
    - TORCH_CUDA_ARCH_LIST

test:
  imports:
    - torch_spline_conv

about:
  home: https://github.com/rusty1s/pytorch_spline_conv
  license: MIT
  summary: Implementation of the Spline-Based Convolution Operator of SplineCNN in PyTorch
......@@ -75,7 +75,7 @@ spline_basis_fw_cpu(torch::Tensor pseudo, torch::Tensor kernel_size,
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -135,7 +135,7 @@ torch::Tensor spline_basis_bw_cpu(torch::Tensor grad_basis,
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
......
......@@ -21,7 +21,7 @@ torch::Tensor spline_weighting_fw_cpu(torch::Tensor x, torch::Tensor weight,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -71,7 +71,7 @@ torch::Tensor spline_weighting_bw_x_cpu(torch::Tensor grad_out,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -117,7 +117,7 @@ torch::Tensor spline_weighting_bw_weight_cpu(torch::Tensor grad_out,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -163,7 +163,7 @@ torch::Tensor spline_weighting_bw_basis_cpu(torch::Tensor grad_out,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
......
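The hunks above widen the CPU dispatch macros from float/double to also include bfloat16. As a minimal, hedged sketch of what this enables on the CPU path (assuming a build that contains these changes; nothing here is implied about the GPU/DCU path):
```python
import torch
from torch_spline_conv import spline_conv

# Same toy graph as the usage example, but with bfloat16 floating-point inputs.
x = torch.rand(4, 2, dtype=torch.bfloat16)
pseudo = torch.rand(6, 2, dtype=torch.bfloat16)
weight = torch.rand(25, 2, 4, dtype=torch.bfloat16)
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]])
kernel_size = torch.tensor([5, 5])
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8)

out = spline_conv(x, edge_index, pseudo, weight, kernel_size, is_open_spline)
print(out.dtype)  # expected: torch.bfloat16 with the patched CPU kernels
```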
......@@ -3,7 +3,7 @@
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000))
static inline __device__ void atomAdd(double *address, double val) {
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
......
#pragma once
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
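// Fallback when hardware double-precision atomicAdd is unavailable (guarded by the
// architecture / HIP-version check below): emulate it with an atomicCAS loop on the
// 64-bit integer representation of the double.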
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
static inline __device__ void atomAdd(double *address, double val) {
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
atomicAdd(address, val);
}
#endif
#pragma once
#include <torch/extension.h>
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree);
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree);
#include "hip/hip_runtime.h"
#include "basis_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
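// B-spline basis helper: forward() evaluates the basis value of degree 1, 2 or 3 at
// fractional position v for local basis index k_mod; backward() returns its derivative
// with respect to v.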
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
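// Forward basis kernel: one thread per (edge e, spline product s). Each thread
// multiplies the 1-D basis values over all D pseudo-coordinate dimensions into
// basis[e, s] and accumulates the flattened index of the corresponding weight slice
// into weight_index[e, s].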
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_fw_kernel<scalar_t, DEGREE>
<<<BLOCKS(basis.numel()), THREADS, 0, stream>>>(
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
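// Backward basis kernel: one thread per (edge e, dimension d). Each thread applies the
// product rule, combining the basis derivative in dimension d with the forward basis
// values of the remaining dimensions, weighted by the incoming grad_basis.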
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_bw_kernel<scalar_t, DEGREE>
<<<BLOCKS(grad_pseudo.numel()), THREADS, 0, stream>>>(
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#include "hip/hip_runtime.h"
#include "basis_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
hipLaunchKernelGGL(( spline_basis_fw_kernel<scalar_t, DEGREE>)
, dim3(BLOCKS(basis.numel())), dim3(THREADS), 0, stream,
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
hipLaunchKernelGGL(( spline_basis_bw_kernel<scalar_t, DEGREE>)
, dim3(BLOCKS(grad_pseudo.numel())), dim3(THREADS), 0, stream,
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#pragma once
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#define AT_DISPATCH_DEGREE_TYPES(degree, ...) \
[&] { \
switch (degree) { \
case 1: { \
const int64_t DEGREE = 1; \
return __VA_ARGS__(); \
} \
case 2: { \
const int64_t DEGREE = 2; \
return __VA_ARGS__(); \
} \
case 3: { \
const int64_t DEGREE = 3; \
return __VA_ARGS__(); \
} \
default: \
AT_ERROR("Basis degree not implemented"); \
} \
}()
#pragma once
#include <torch/extension.h>
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size);
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index);
#include "hip/hip_runtime.h"
#include "weighting_hip.h"
#include <ATen/hip/HIPContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
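// Forward weighting kernel: one thread per (edge e, output channel m_out). For each
// spline product s, the thread looks up the weight slice via weight_index and
// accumulates basis-weighted contributions from all input channels into out[e, m_out].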
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
spline_weighting_fw_kernel<scalar_t>
<<<BLOCKS(out.numel()), THREADS, 0, stream>>>(
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
spline_weighting_bw_x_kernel<scalar_t>
<<<BLOCKS(grad_x.numel()), THREADS, 0, stream>>>(
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
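// Weight-gradient kernel: one thread per (edge e, output channel m_out). Contributions
// are scatter-added into grad_weight with atomAdd, since several edges can address the
// same weight slice through weight_index.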
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
spline_weighting_bw_weight_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const size_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
spline_weighting_bw_basis_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
#include "hip/hip_runtime.h"
#include "weighting_hip.h"
#include <ATen/hip/HIPContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_fw_kernel<scalar_t>)
, dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_x_kernel<scalar_t>)
, dim3(BLOCKS(grad_x.numel())), dim3(THREADS), 0, stream,
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_weight_kernel<scalar_t>)
, dim3(BLOCKS(grad_out.numel())), dim3(THREADS), 0, stream,
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const size_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_basis_kernel<scalar_t>)
, dim3(BLOCKS(grad_out.numel())), dim3(THREADS), 0, stream,
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
......@@ -2,8 +2,12 @@
#include <torch/script.h>
#ifdef WITH_CUDA
#ifdef USE_ROCM
#include <hip/hip_version.h>
#else
#include <cuda.h>
#endif
#endif
#ifdef _WIN32
#ifdef WITH_CUDA
......@@ -15,11 +19,15 @@ PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
int64_t cuda_version() {
#ifdef WITH_CUDA
#ifdef USE_ROCM
return HIP_VERSION;
#else
return CUDA_VERSION;
#endif
#else
return -1;
#endif
}
static auto registry = torch::RegisterOperators().op(
"torch_spline_conv::cuda_version", &cuda_version);
"torch_spline_conv::cuda_version", [] { return cuda_version(); });