Commit 70d03142 authored by limm

support v1.2.2

parent 378d2b88
cmake_minimum_required(VERSION 3.0)
project(torchsplineconv)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSPLINECONV_VERSION 1.2.1)
set(TORCHSPLINECONV_VERSION 1.2.2)
option(WITH_CUDA "Enable CUDA support" OFF)
......@@ -9,7 +9,7 @@ if(WITH_CUDA)
enable_language(CUDA)
add_definitions(-D__CUDA_NO_HALF_OPERATORS__)
add_definitions(-DWITH_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_35 --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
endif()
find_package(Python3 COMPONENTS Development)
......
Metadata-Version: 1.2
Name: torch_spline_conv
Version: 1.2.1
Summary: Implementation of the Spline-Based Convolution Operator of SplineCNN in PyTorch
Home-page: https://github.com/rusty1s/pytorch_spline_conv
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: MIT
Description: UNKNOWN
Keywords: pytorch,geometric-deep-learning,graph-neural-networks,spline-cnn
Platform: UNKNOWN
Requires-Python: >=3.6
# <div align="center"><strong>PyTorch Spline Conv</strong></div>
## Introduction
PyTorch Spline Conv is a PyTorch-based package that implements the spline convolution operator used in graph convolutional neural networks. Graph convolutional neural networks are deep-learning models that operate on graph-structured data and are suited to tasks such as node classification, graph classification, and graph generation. The PyTorch Spline Conv build in the DAS software stack not only keeps the component's core functionality available on DCU accelerator cards, but is also deeply customized and optimized for the DCU hardware architecture, so developers can migrate applications to DCU accelerators and gain performance at very low cost. It currently supports PyTorch 1.13, PyTorch 2.1, and PyTorch 2.3.
PyTorch Spline Conv is a PyTorch-based package that implements the spline convolution operator used in graph convolutional neural networks. Graph convolutional neural networks are deep-learning models that operate on graph-structured data and are suited to tasks such as node classification, graph classification, and graph generation. The PyTorch Spline Conv build in the DAS software stack not only keeps the component's core functionality available on DCU accelerator cards, but is also deeply customized and optimized for the DCU hardware architecture, so developers can migrate applications to DCU accelerators and gain performance at very low cost. It currently supports PyTorch 1.13, PyTorch 2.1, PyTorch 2.4.1, and PyTorch 2.5.1.
### Install via pip
pytorch-spline-conv whl package download directory: [http://10.6.10.68:8000/customized/torch-spline-conv/dtk2310](http://10.6.10.68:8000/customized/torch-spline-conv/dtk2310). Only a Python 3.8 package is currently provided.
pytorch-spline-conv whl package download directory: [https://das.sourcefind.cn:55011/portal/#/home](https://das.sourcefind.cn:55011/portal/#/home).
```shell
pip install torch_spline_conv* (the downloaded torch_spline_conv whl package)
```
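Once the whl package is installed, a quick sanity check can confirm that the operator runs; the following is a minimal sketch that reuses the shapes from the usage example further below (move the tensors to the accelerator with `.cuda()` only if you installed a GPU/DCU build):
```python
import torch
from torch_spline_conv import spline_conv

# Small random graph: 4 nodes with 2 features each, 6 edges with 2D pseudo-coordinates.
x = torch.rand(4, 2)
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]])
pseudo = torch.rand(6, 2)
weight = torch.rand(25, 2, 4)              # kernel_size.prod() x in_channels x out_channels
kernel_size = torch.tensor([5, 5])
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8)

out = spline_conv(x, edge_index, pseudo, weight, kernel_size, is_open_spline)
print(out.shape)  # expected: torch.Size([4, 4])
```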
......@@ -17,23 +17,28 @@ pip install 'urllib3==1.26.14'
pip install pytest
pip install wheel
```
- Download dtk23.10 from Home | 光合开发者社区, extract it under /opt/, and create a soft link, for example
- Download dtk25.04 from Home | 光合开发者社区, extract it under /opt/, and create a soft link, for example
```shell
cd /opt
wget http://10.6.10.68:8000/dtk-release/dtk23.10/CentOS7.6/DTK-23.10-CentOS7.6-x86_64.tar.gz
tar -zxvf DTK-23.10-CentOS7.6-x86_64.tar.gz
ln -s dtk-23.10 dtk
source /opt/dtk/env.sh
cd /opt && ln -s dtk-25.04 dtk
```
- Install pytorch. pytorch whl package download directory: [http://10.6.10.68:8000/debug/pytorch/dtk23.10/hipify](http://10.6.10.68:8000/debug/pytorch/dtk23.10/hipify). Download the version that matches your needs and install it as follows:
- Install pytorch. pytorch whl package download directory: [https://das.sourcefind.cn:55011/portal/#/home](https://das.sourcefind.cn:55011/portal/#/home). Download the version that matches your needs and install it as follows:
```shell
pip install torch* (the downloaded torch whl package)
```
- Install fastpt. fastpt whl package download directory: [https://das.sourcefind.cn:55011/portal/#/home](https://das.sourcefind.cn:55011/portal/#/home). Download the version that matches your needs and install it as follows:
```shell
pip install fastpt* (the downloaded fastpt whl package)
```
#### Download, build, and install from source
```shell
git clone -b 1.2.1-release http://developer.hpccube.com/codes/aicomponent/torch-spline-conv.git
python pymap_script.py /path/to/torch-spline-conv
git clone -b 1.2.2-fastpt http://developer.hpccube.com/codes/aicomponent/torch-spline-conv.git
export FORCE_CUDA=1
source /usr/local/bin/fastpt -C
cd torch-spline-conv
python setup.py bdist_wheel
pip install dist/*.whl
......@@ -43,6 +48,7 @@ pip install dist/*.whl
```shell
cd torch-spline-conv
python setup.py test
pytest
```
## Known Issue
......@@ -52,6 +58,5 @@ find / -name "_version_cpu.so"
cd /torch_spline_conv/torch_spline_conv
ln -s /usr/local/lib/python3.8/site-packages/torch_spline_conv/* .
```
## References
[https://github.com/rusty1s/pytorch_spline_conv](https://github.com/rusty1s/pytorch_spline_conv)
[pypi-image]: https://badge.fury.io/py/torch-spline-conv.svg
[pypi-url]: https://pypi.python.org/pypi/torch-spline-conv
[build-image]: https://travis-ci.org/rusty1s/pytorch_spline_conv.svg?branch=master
[build-url]: https://travis-ci.org/rusty1s/pytorch_spline_conv
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_spline_conv/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_spline_conv?branch=master
# Spline-Based Convolution Operator of SplineCNN
[![PyPI Version][pypi-image]][pypi-url]
[![Build Status][build-image]][build-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This is a PyTorch implementation of the spline-based convolution operator of SplineCNN, as described in our paper:
Matthias Fey, Jan Eric Lenssen, Frank Weichert, Heinrich Müller: [SplineCNN: Fast Geometric Deep Learning with Continuous B-Spline Kernels](https://arxiv.org/abs/1711.08920) (CVPR 2018)
The operator works on all floating point data types and is implemented both for CPU and GPU.
## Installation
### Binaries
We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://pytorch-geometric.com/whl).
#### PyTorch 1.7.0
To install the binaries for PyTorch 1.7.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, `cu102`, or `cu110` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` | `cu110` |
|-------------|-------|--------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | | |
#### PyTorch 1.6.0
To install the binaries for PyTorch 1.6.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.6.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` |
|-------------|-------|--------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0 and PyTorch 1.5.0 (following the same procedure).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-spline-conv
```
When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Usage
```python
from torch_spline_conv import spline_conv
out = spline_conv(x,
                  edge_index,
                  pseudo,
                  weight,
                  kernel_size,
                  is_open_spline,
                  degree=1,
                  norm=True,
                  root_weight=None,
                  bias=None)
```
Applies the spline-based convolution operator
<p align="center">
<img width="50%" src="https://user-images.githubusercontent.com/6945922/38684093-36d9c52e-3e6f-11e8-9021-db054223c6b9.png" />
</p>
over several node features of an input graph.
The kernel function is defined over the weighted B-spline tensor product basis, as shown below for different B-spline degrees.
<p align="center">
<img width="45%" src="https://user-images.githubusercontent.com/6945922/38685443-3a2a0c68-3e72-11e8-8e13-9ce9ad8fe43e.png" />
<img width="45%" src="https://user-images.githubusercontent.com/6945922/38685459-42b2bcae-3e72-11e8-88cc-4b61e41dbd93.png" />
</p>
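For readers who cannot see the images above, a rough sketch of the operator in the notation of the paper (lightly simplified) is:

$$(f \star g)(i) = \frac{1}{|\mathcal{N}(i)|}\sum_{l=1}^{M_{\mathrm{in}}}\sum_{j\in\mathcal{N}(i)} f_l(j)\cdot g_l(\mathbf{u}(i,j)), \qquad g_l(\mathbf{u}) = \sum_{\mathbf{p}\in\mathcal{P}} w_{\mathbf{p},l}\,\prod_{d=1}^{D} N^m_{d,p_d}(u_d),$$

where $\mathbf{u}(i,j)$ are the pseudo-coordinates of edge $(i,j)$, $\mathcal{P}$ indexes the B-spline tensor-product basis functions, and $N^m_{d,p_d}$ is the degree-$m$ B-spline basis in dimension $d$.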
### Parameters
* **x** *(Tensor)* - Input node features of shape `(number_of_nodes x in_channels)`.
* **edge_index** *(LongTensor)* - Graph edges, given by source and target indices, of shape `(2 x number_of_edges)`.
* **pseudo** *(Tensor)* - Edge attributes, i.e., pseudo-coordinates, of shape `(number_of_edges x number_of_edge_attributes)` in the fixed interval [0, 1].
* **weight** *(Tensor)* - Trainable weight parameters of shape `(kernel_size x in_channels x out_channels)`.
* **kernel_size** *(LongTensor)* - Number of trainable weight parameters in each edge dimension.
* **is_open_spline** *(ByteTensor)* - Whether to use open or closed B-spline bases for each dimension.
* **degree** *(int, optional)* - B-spline basis degree. (default: `1`)
* **norm** *(bool, optional)* - Whether to normalize output by node degree. (default: `True`)
* **root_weight** *(Tensor, optional)* - Additional shared trainable parameters for each feature of the root node of shape `(in_channels x out_channels)`. (default: `None`)
* **bias** *(Tensor, optional)* - Optional bias of shape `(out_channels)`. (default: `None`)
### Returns
* **out** *(Tensor)* - Out node features of shape `(number_of_nodes x out_channels)`.
### Example
```python
import torch
from torch_spline_conv import spline_conv
x = torch.rand((4, 2), dtype=torch.float) # 4 nodes with 2 features each
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]]) # 6 edges
pseudo = torch.rand((6, 2), dtype=torch.float) # two-dimensional edge attributes
weight = torch.rand((25, 2, 4), dtype=torch.float) # 25 parameters for in_channels x out_channels
kernel_size = torch.tensor([5, 5]) # 5 parameters in each edge dimension
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8) # only use open B-splines
degree = 1 # B-spline degree of 1
norm = True # Normalize output by node degree.
root_weight = torch.rand((2, 4), dtype=torch.float) # separately weight root nodes
bias = None # do not apply an additional bias
out = spline_conv(x, edge_index, pseudo, weight, kernel_size,
                  is_open_spline, degree, norm, root_weight, bias)
print(out.size())
torch.Size([4, 4]) # 4 nodes with 4 features each
```
## Cite
Please cite our paper if you use this code in your own work:
```
@inproceedings{Fey/etal/2018,
title={{SplineCNN}: Fast Geometric Deep Learning with Continuous {B}-Spline Kernels},
author={Fey, Matthias and Lenssen, Jan Eric and Weichert, Frank and M{\"u}ller, Heinrich},
booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2018},
}
```
## Running tests
```
python setup.py test
```
## C++ API
`torch-spline-conv` also offers a C++ API that contains C++ equivalents of the Python models.
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support if needed
cmake ..
make
make install
```
[pypi-image]: https://badge.fury.io/py/torch-spline-conv.svg
[pypi-url]: https://pypi.python.org/pypi/torch-spline-conv
[build-image]: https://travis-ci.org/rusty1s/pytorch_spline_conv.svg?branch=master
[build-url]: https://travis-ci.org/rusty1s/pytorch_spline_conv
[testing-image]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_spline_conv/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_spline_conv/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_spline_conv?branch=master
# Spline-Based Convolution Operator of SplineCNN
[![PyPI Version][pypi-image]][pypi-url]
[![Build Status][build-image]][build-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
......@@ -21,43 +24,54 @@ The operator works on all floating point data types and is implemented both for
## Installation
### Anaconda
**Update:** You can now install `pytorch-spline-conv` via [Anaconda](https://anaconda.org/pyg/pytorch-spline-conv) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-spline-conv -c pyg
```
### Binaries
We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://pytorch-geometric.com/whl).
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.7.0
#### PyTorch 2.0
To install the binaries for PyTorch 1.7.0, simply run
To install the binaries for PyTorch 2.0.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+${CUDA}.html
pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.0.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, `cu102`, or `cu110` depending on your PyTorch installation.
where `${CUDA}` should be replaced by either `cpu`, `cu117`, or `cu118` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` | `cu110` |
|-------------|-------|--------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | | |
| | `cpu` | `cu117` | `cu118` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 1.6.0
#### PyTorch 1.13
To install the binaries for PyTorch 1.6.0, simply run
To install the binaries for PyTorch 1.13.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.6.0+${CUDA}.html
pip install torch-spline-conv -f https://data.pyg.org/whl/torch-1.13.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101` or `cu102` depending on your PyTorch installation.
where `${CUDA}` should be replaced by either `cpu`, `cu116`, or `cu117` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` |
|-------------|-------|--------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ |
| **macOS** | ✅ | | | |
| | `cpu` | `cu116` | `cu117` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0 and PyTorch 1.5.0 (following the same procedure).
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0 and PyTorch 1.12.0/1.12.1 (following the same procedure).
For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
......@@ -173,7 +187,7 @@ Please cite our paper if you use this code in your own work:
## Running tests
```
python setup.py test
pytest
```
## C++ API
......
```
./build_conda.sh 3.9 2.0.0 cu117 # python, pytorch and cuda version
```
#!/bin/bash
export PYTHON_VERSION=$1
export TORCH_VERSION=$2
export CUDA_VERSION=$3
export CONDA_PYTORCH_CONSTRAINT="pytorch==${TORCH_VERSION%.*}.*"
if [ "${CUDA_VERSION}" = "cpu" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else
case $CUDA_VERSION in
cu118)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.8.*"
;;
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.7.*"
;;
cu116)
if [ "${TORCH_VERSION}" = "1.12.0" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
else
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.6.*"
fi
;;
cu115)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*"
;;
cu113)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.3.*"
;;
cu111)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.1.*"
;;
cu102)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.2.*"
;;
cu101)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==10.1.*"
;;
*)
echo "Unrecognized CUDA_VERSION=$CUDA_VERSION"
exit 1
;;
esac
fi
echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
if [ "${TORCH_VERSION}" = "1.12.0" ] && [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
else
conda build . -c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld"
fi
package:
  name: pytorch-spline-conv
  version: 1.2.2

source:
  path: ../..

requirements:
  build:
    - {{ compiler('c') }}  # [win]

  host:
    - pip
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

  run:
    - python {{ environ.get('PYTHON_VERSION') }}
    - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}
    - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT') }}

build:
  string: py{{ environ.get('PYTHON_VERSION').replace('.', '') }}_torch_{{ environ['TORCH_VERSION'] }}_{{ environ['CUDA_VERSION'] }}
  script: pip install .
  script_env:
    - FORCE_CUDA
    - TORCH_CUDA_ARCH_LIST

test:
  imports:
    - torch_spline_conv

about:
  home: https://github.com/rusty1s/pytorch_spline_conv
  license: MIT
  summary: Implementation of the Spline-Based Convolution Operator of SplineCNN in PyTorch
......@@ -75,7 +75,7 @@ spline_basis_fw_cpu(torch::Tensor pseudo, torch::Tensor kernel_size,
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -135,7 +135,7 @@ torch::Tensor spline_basis_bw_cpu(torch::Tensor grad_basis,
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
......
......@@ -21,7 +21,7 @@ torch::Tensor spline_weighting_fw_cpu(torch::Tensor x, torch::Tensor weight,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -71,7 +71,7 @@ torch::Tensor spline_weighting_bw_x_cpu(torch::Tensor grad_out,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -117,7 +117,7 @@ torch::Tensor spline_weighting_bw_weight_cpu(torch::Tensor grad_out,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
......@@ -163,7 +163,7 @@ torch::Tensor spline_weighting_bw_basis_cpu(torch::Tensor grad_out,
auto weight_index_data = weight_index.data_ptr<int64_t>();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
......
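The hunks above widen the CPU dispatch macros from float/double to also include bfloat16. As a minimal, hedged sketch of what this enables on the CPU path (assuming a build that contains these changes; nothing here is implied about the GPU/DCU path):
```python
import torch
from torch_spline_conv import spline_conv

# Same toy graph as the usage example, but with bfloat16 floating-point inputs.
x = torch.rand(4, 2, dtype=torch.bfloat16)
pseudo = torch.rand(6, 2, dtype=torch.bfloat16)
weight = torch.rand(25, 2, 4, dtype=torch.bfloat16)
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2]])
kernel_size = torch.tensor([5, 5])
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8)

out = spline_conv(x, edge_index, pseudo, weight, kernel_size, is_open_spline)
print(out.dtype)  # expected: torch.bfloat16 with the patched CPU kernels
```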
......@@ -3,7 +3,7 @@
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000))
static inline __device__ void atomAdd(double *address, double val) {
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
......
#pragma once
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
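// Fallback when hardware double-precision atomicAdd is unavailable (guarded by the
// architecture / HIP-version check below): emulate it with an atomicCAS loop on the
// 64-bit integer representation of the double.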
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
static inline __device__ void atomAdd(double *address, double val) {
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
atomicAdd(address, val);
}
#endif
#pragma once
#include <torch/extension.h>
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree);
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree);
#include "hip/hip_runtime.h"
#include "basis_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
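// B-spline basis helper: forward() evaluates the basis value of degree 1, 2 or 3 at
// fractional position v for local basis index k_mod; backward() returns its derivative
// with respect to v.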
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
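// Forward basis kernel: one thread per (edge e, spline product s). Each thread
// multiplies the 1-D basis values over all D pseudo-coordinate dimensions into
// basis[e, s] and accumulates the flattened index of the corresponding weight slice
// into weight_index[e, s].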
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_fw_kernel<scalar_t, DEGREE>
<<<BLOCKS(basis.numel()), THREADS, 0, stream>>>(
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
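// Backward basis kernel: one thread per (edge e, dimension d). Each thread applies the
// product rule, combining the basis derivative in dimension d with the forward basis
// values of the remaining dimensions, weighted by the incoming grad_basis.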
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_bw_kernel<scalar_t, DEGREE>
<<<BLOCKS(grad_pseudo.numel()), THREADS, 0, stream>>>(
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#include "hip/hip_runtime.h"
#include "basis_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
hipLaunchKernelGGL(( spline_basis_fw_kernel<scalar_t, DEGREE>)
, dim3(BLOCKS(basis.numel())), dim3(THREADS), 0, stream,
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim());
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
hipLaunchKernelGGL(( spline_basis_bw_kernel<scalar_t, DEGREE>)
, dim3(BLOCKS(grad_pseudo.numel())), dim3(THREADS), 0, stream,
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#pragma once
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#define AT_DISPATCH_DEGREE_TYPES(degree, ...) \
[&] { \
switch (degree) { \
case 1: { \
const int64_t DEGREE = 1; \
return __VA_ARGS__(); \
} \
case 2: { \
const int64_t DEGREE = 2; \
return __VA_ARGS__(); \
} \
case 3: { \
const int64_t DEGREE = 3; \
return __VA_ARGS__(); \
} \
default: \
AT_ERROR("Basis degree not implemented"); \
} \
}()
#pragma once
#include <torch/extension.h>
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size);
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index);
#include "hip/hip_runtime.h"
#include "weighting_hip.h"
#include <ATen/hip/HIPContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
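// Forward weighting kernel: one thread per (edge e, output channel m_out). For each
// spline product s, the thread looks up the weight slice via weight_index and
// accumulates basis-weighted contributions from all input channels into out[e, m_out].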
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
spline_weighting_fw_kernel<scalar_t>
<<<BLOCKS(out.numel()), THREADS, 0, stream>>>(
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
spline_weighting_bw_x_kernel<scalar_t>
<<<BLOCKS(grad_x.numel()), THREADS, 0, stream>>>(
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
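// Weight-gradient kernel: one thread per (edge e, output channel m_out). Contributions
// are scatter-added into grad_weight with atomAdd, since several edges can address the
// same weight slice through weight_index.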
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
spline_weighting_bw_weight_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const size_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
spline_weighting_bw_basis_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
#include "hip/hip_runtime.h"
#include "weighting_hip.h"
#include <ATen/hip/HIPContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_fw_kernel<scalar_t>)
, dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_x_kernel<scalar_t>)
, dim3(BLOCKS(grad_x.numel())), dim3(THREADS), 0, stream,
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_weight_kernel<scalar_t>)
, dim3(BLOCKS(grad_out.numel())), dim3(THREADS), 0, stream,
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const size_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_basis_kernel<scalar_t>)
, dim3(BLOCKS(grad_out.numel())), dim3(THREADS), 0, stream,
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
......@@ -2,8 +2,12 @@
#include <torch/script.h>
#ifdef WITH_CUDA
#ifdef USE_ROCM
#include <hip/hip_version.h>
#else
#include <cuda.h>
#endif
#endif
#ifdef _WIN32
#ifdef WITH_CUDA
......@@ -15,11 +19,15 @@ PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
int64_t cuda_version() {
#ifdef WITH_CUDA
#ifdef USE_ROCM
return HIP_VERSION;
#else
return CUDA_VERSION;
#endif
#else
return -1;
#endif
}
static auto registry = torch::RegisterOperators().op(
"torch_spline_conv::cuda_version", &cuda_version);
"torch_spline_conv::cuda_version", [] { return cuda_version(); });