Commit 04d6ec40 authored by limm

push v1.2.1 version

parent 1d2126aa
cmake_minimum_required(VERSION 3.0)
project(torchsplineconv)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSPLINECONV_VERSION 1.2.1)
option(WITH_CUDA "Enable CUDA support" OFF)
if(WITH_CUDA)
enable_language(CUDA)
add_definitions(-D__CUDA_NO_HALF_OPERATORS__)
add_definitions(-DWITH_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_35 --expt-relaxed-constexpr")
endif()
find_package(Python3 COMPONENTS Development)
find_package(Torch REQUIRED)
file(GLOB HEADERS csrc/spline_conv.h)
file(GLOB OPERATOR_SOURCES csrc/cpu/*.h csrc/cpu/*.cpp csrc/*.cpp)
if(WITH_CUDA)
file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu)
endif()
add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python3::Python)
set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchSplineConv)
target_include_directories(${PROJECT_NAME} INTERFACE
$<BUILD_INTERFACE:${HEADERS}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(TORCHSPLINECONV_CMAKECONFIG_INSTALL_DIR "share/cmake/TorchSplineConv" CACHE STRING "install path for TorchSplineConvConfig.cmake")
configure_package_config_file(cmake/TorchSplineConvConfig.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/TorchSplineConvConfig.cmake"
INSTALL_DESTINATION ${TORCHSPLINECONV_CMAKECONFIG_INSTALL_DIR})
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/TorchSplineConvConfigVersion.cmake
VERSION ${TORCHSPLINECONV_VERSION}
COMPATIBILITY AnyNewerVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/TorchSplineConvConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/TorchSplineConvConfigVersion.cmake
DESTINATION ${TORCHSPLINECONV_CMAKECONFIG_INSTALL_DIR})
install(TARGETS ${PROJECT_NAME}
EXPORT TorchSplineConvTargets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
install(EXPORT TorchSplineConvTargets
NAMESPACE TorchSplineConv::
DESTINATION ${TORCHSPLINECONV_CMAKECONFIG_INSTALL_DIR})
install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME})
install(FILES
csrc/cpu/basis_cpu.h
csrc/cpu/weighting_cpu.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu)
if(WITH_CUDA)
install(FILES
csrc/cuda/basis_cuda.h
csrc/cuda/weighting_cuda.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
endif()
if(WITH_CUDA)
set_property(TARGET torch_cuda PROPERTY INTERFACE_COMPILE_OPTIONS "")
set_property(TARGET torch_cpu PROPERTY INTERFACE_COMPILE_OPTIONS "")
endif()
# <div align="center"><strong>torch-spline-conv-1.2.1</strong></div>
## Introduction
torch-spline-conv is a PyTorch-based package that implements the spline convolution operator used in graph convolutional neural networks. Graph convolutional neural networks are deep learning models that operate on graph-structured data and are suited to tasks such as node classification, graph classification, and graph generation.
[pypi-image]: https://badge.fury.io/py/torch-spline-conv.svg
[pypi-url]: https://pypi.python.org/pypi/torch-spline-conv
[build-image]: https://travis-ci.org/rusty1s/pytorch_spline_conv.svg?branch=master
[build-url]: https://travis-ci.org/rusty1s/pytorch_spline_conv
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_spline_conv/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_spline_conv?branch=master
## Dependency Installation
+ pytorch 1.10 or pytorch 1.13 with the matching torchvision (dtk-22.04.2, dtk-23.04, and dtk-23.10 are recommended)
+ python 3.7-3.10
# Spline-Based Convolution Operator of SplineCNN
### 1. Installing from source
[![PyPI Version][pypi-image]][pypi-url]
[![Build Status][build-image]][build-url]
[![Code Coverage][coverage-image]][coverage-url]
#### Preparing the build environment
Two ways to prepare the environment are provided:
--------------------------------------------------------------------------------
1. Use the SourceFind PyTorch base image. Image download address: [https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch); pick the image version that matches your pytorch, python, dtk, and OS versions.
This is a PyTorch implementation of the spline-based convolution operator of SplineCNN, as described in our paper:
Matthias Fey, Jan Eric Lenssen, Frank Weichert, Heinrich Müller: [SplineCNN: Fast Geometric Deep Learning with Continuous B-Spline Kernels](https://arxiv.org/abs/1711.08920) (CVPR 2018)
The operator works on all floating point data types and is implemented both for CPU and GPU.
## Installation
### Binaries
We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://pytorch-geometric.com/whl).
#### PyTorch 1.7.0
To install the binaries for PyTorch 1.7.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, `cu102`, or `cu110` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` | `cu110` |
|-------------|-------|--------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | | |
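For example, a CPU-only install for PyTorch 1.7.0 resolves to:
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
```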
#### PyTorch 1.6.0
To install the binaries for PyTorch 1.6.0, simply run
```
pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.6.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu101`, or `cu102` depending on your PyTorch installation.
| | `cpu` | `cu92` | `cu101` | `cu102` |
|-------------|-------|--------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ❌ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0 and PyTorch 1.5.0 (following the same procedure).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
2. Use an existing Python environment: install pytorch and torchvision from the whl package directories [https://cancon.hpccube.com:65024/4/main/pytorch](https://cancon.hpccube.com:65024/4/main/pytorch) and [https://cancon.hpccube.com:65024/4/main/vision](https://cancon.hpccube.com:65024/4/main/vision), choosing the whl packages that match your python and dtk versions. Install them with:
```shell
pip install torch*        # the downloaded torch whl package
pip install torchvision*  # the downloaded torchvision whl package
pip install setuptools==59.5.0 wheel
```
Then run:
```
pip install torch-spline-conv
```
When running in a docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
#### Building and installing from source
- Download the code:
```shell
git clone http://developer.hpccube.com/codes/aicomponent/torch-spline-conv # switch branches as needed for your build
```
- Build from source (inside the torch-spline-conv directory):
```
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=$ROCM_PATH/rocrand/lib:$LD_LIBRARY_PATH
export FORCE_ONLY_HIP=1
export CC=hipcc
export CXX=hipcc
python setup.py install
```
## Usage
```python
from torch_spline_conv import spline_conv
out = spline_conv(x,
                  edge_index,
                  pseudo,
                  weight,
                  kernel_size,
                  is_open_spline,
                  degree=1,
                  norm=True,
                  root_weight=None,
                  bias=None)
```
#### Notes
+ If downloading with pip install is slow, add the Tsinghua PyPI mirror: `-i https://pypi.tuna.tsinghua.edu.cn/simple/`, as in the example below.
+ `ROCM_PATH` is the dtk installation path, `/opt/dtk` by default.
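For example, installing the prebuilt package through the Tsinghua mirror:
```shell
pip install torch-spline-conv -i https://pypi.tuna.tsinghua.edu.cn/simple/
```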
## Verification
Applies the spline-based convolution operator
<p align="center">
<img width="50%" src="https://user-images.githubusercontent.com/6945922/38684093-36d9c52e-3e6f-11e8-9021-db054223c6b9.png" />
</p>
over several node features of an input graph.
The kernel function is defined over the weighted B-spline tensor product basis, as shown below for different B-spline degrees.
<p align="center">
<img width="45%" src="https://user-images.githubusercontent.com/6945922/38685443-3a2a0c68-3e72-11e8-8e13-9ce9ad8fe43e.png" />
<img width="45%" src="https://user-images.githubusercontent.com/6945922/38685459-42b2bcae-3e72-11e8-88cc-4b61e41dbd93.png" />
</p>
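In symbols, the kernel function can be sketched as a weighted sum over the B-spline tensor product basis (notation paraphrased from the SplineCNN paper):
```latex
% w_p: trainable weights; N^m_{i,p_i}: degree-m B-spline basis functions
% evaluated on the i-th pseudo-coordinate u_i.
g(\mathbf{u}) = \sum_{\mathbf{p} \in \mathcal{P}} w_{\mathbf{p}} \prod_{i=1}^{d} N^{m}_{i,p_i}(u_i)
```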
### Parameters
* **x** *(Tensor)* - Input node features of shape `(number_of_nodes x in_channels)`.
* **edge_index** *(LongTensor)* - Graph edges, given by source and target indices, of shape `(2 x number_of_edges)`.
* **pseudo** *(Tensor)* - Edge attributes, *i.e.* pseudo-coordinates, of shape `(number_of_edges x number_of_edge_attributes)` in the fixed interval [0, 1].
* **weight** *(Tensor)* - Trainable weight parameters of shape `(kernel_size x in_channels x out_channels)`.
* **kernel_size** *(LongTensor)* - Number of trainable weight parameters in each edge dimension.
* **is_open_spline** *(ByteTensor)* - Whether to use open or closed B-spline bases for each dimension.
* **degree** *(int, optional)* - B-spline basis degree. (default: `1`)
* **norm** *(bool, optional)* - Whether to normalize output by node degree. (default: `True`)
* **root_weight** *(Tensor, optional)* - Additional shared trainable parameters for each feature of the root node of shape `(in_channels x out_channels)`. (default: `None`)
* **bias** *(Tensor, optional)* - Optional bias of shape `(out_channels)`. (default: `None`)
### Returns
* **out** *(Tensor)* - Output node features of shape `(number_of_nodes x out_channels)`.
### Example
```python
import torch
from torch_spline_conv import spline_conv

x = torch.rand((4, 2), dtype=torch.float)  # 4 nodes with 2 features each
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3],
                           [1, 0, 2, 1, 3, 2]])  # 6 edges
pseudo = torch.rand((6, 2), dtype=torch.float)  # two-dimensional edge attributes
weight = torch.rand((25, 2, 4), dtype=torch.float)  # 25 parameters for in_channels x out_channels
kernel_size = torch.tensor([5, 5])  # 5 parameters in each edge dimension
is_open_spline = torch.tensor([1, 1], dtype=torch.uint8)  # only use open B-splines
degree = 1  # B-spline degree of 1
norm = True  # normalize output by node degree
root_weight = torch.rand((2, 4), dtype=torch.float)  # separately weight root nodes
bias = None  # do not apply an additional bias

out = spline_conv(x, edge_index, pseudo, weight, kernel_size,
                  is_open_spline, degree, norm, root_weight, bias)

print(out.size())
torch.Size([4, 4]) # 4 nodes with 4 features each
```
## Known Issues
- This library has not been adapted for CPU-only environments and supports DCU only; run it in an environment with a DCU card.
- To use the full set of PyG features, run `pip install torch-geometric`.
## Cite
Please cite our paper if you use this code in your own work:
```
@inproceedings{Fey/etal/2018,
  title={{SplineCNN}: Fast Geometric Deep Learning with Continuous {B}-Spline Kernels},
  author={Fey, Matthias and Lenssen, Jan Eric and Weichert, Frank and M{\"u}ller, Heinrich},
  booktitle={IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2018},
}
```
## References
- [README_ORIGIN](README_ORIGIN.md)
- [https://pypi.org/project/torch-spline-conv/1.2.1/](https://pypi.org/project/torch-spline-conv/1.2.1/)
## Running tests
```
python setup.py test
```
## C++ API
`torch-spline-conv` also offers a C++ API that contains C++ equivalents of the Python models.
```
mkdir build
cd build
# Add -DWITH_CUDA=on to enable CUDA support if needed
cmake ..
make
make install
```
# TorchSplineConvConfig.cmake
# --------------------
#
# Exported targets:: TorchSplineConv
#
@PACKAGE_INIT@
set(PN TorchSplineConv)
set(${PN}_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@")
set(${PN}_LIBRARY "")
set(${PN}_DEFINITIONS USING_${PN})
check_required_components(${PN})
if(NOT (CMAKE_VERSION VERSION_LESS 3.0))
#-----------------------------------------------------------------------------
# Don't include targets if this file is being picked up by another
# project which has already built this as a subproject
#-----------------------------------------------------------------------------
if(NOT TARGET ${PN}::TorchSplineConv)
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
if(NOT TARGET torch_library)
find_package(Torch REQUIRED)
endif()
if(NOT TARGET Python3::Python)
find_package(Python3 COMPONENTS Development)
endif()
target_link_libraries(TorchSplineConv::TorchSplineConv INTERFACE ${TORCH_LIBRARIES} Python3::Python)
if(@WITH_CUDA@)
target_compile_definitions(TorchSplineConv::TorchSplineConv INTERFACE WITH_CUDA)
endif()
endif()
endif()
......@@ -3,12 +3,12 @@
#include "cpu/basis_cpu.h"
#ifdef WITH_HIP
#include "hip/basis_hip.h"
#ifdef WITH_CUDA
#include "cuda/basis_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__basis_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__basis_cpu(void) { return NULL; }
......@@ -19,7 +19,7 @@ std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
if (pseudo.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spline_basis_fw_cuda(pseudo, kernel_size, is_open_spline, degree);
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -33,7 +33,7 @@ torch::Tensor spline_basis_bw(torch::Tensor grad_basis, torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
if (grad_basis.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spline_basis_bw_cuda(grad_basis, pseudo, kernel_size, is_open_spline,
degree);
#else
......
#pragma once
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
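// Native double-precision atomicAdd is unavailable before sm_60 / CUDA 8.0,
// so emulate it with an atomicCAS loop on the value's 64-bit representation.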
static inline __device__ void atomAdd(double *address, double val) {
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
atomicAdd(address, val);
}
#endif
#include "basis_cuda.h"
#include <ATen/cuda/CUDAContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
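// Evaluates the local B-spline basis functions (and their derivatives) of
// degree 1-3 at v; k_mod selects one of the (degree + 1) local functions.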
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
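// One thread per (edge e, tensor-product term s): accumulate the basis
// value b and the flattened weight index wi across all D pseudo dimensions.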
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
cudaSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim() == 1);
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_fw_kernel<scalar_t, DEGREE>
<<<BLOCKS(basis.numel()), THREADS, 0, stream>>>(
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
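// One thread per (edge e, dimension d): differentiate the tensor product
// w.r.t. dimension d and multiply by the forward basis values of the rest.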
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
cudaSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim() == 1);
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_bw_kernel<scalar_t, DEGREE>
<<<BLOCKS(grad_pseudo.numel()), THREADS, 0, stream>>>(
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#pragma once
#include <torch/extension.h>
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree);
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree);
#pragma once
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
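// Dispatch the runtime `degree` to a compile-time constant DEGREE (1-3),
// mirroring the style of ATen's AT_DISPATCH_* macros.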
#define AT_DISPATCH_DEGREE_TYPES(degree, ...) \
[&] { \
switch (degree) { \
case 1: { \
const int64_t DEGREE = 1; \
return __VA_ARGS__(); \
} \
case 2: { \
const int64_t DEGREE = 2; \
return __VA_ARGS__(); \
} \
case 3: { \
const int64_t DEGREE = 3; \
return __VA_ARGS__(); \
} \
default: \
AT_ERROR("Basis degree not implemented"); \
} \
}()
#include "weighting_cuda.h"
#include <ATen/cuda/CUDAContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
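// One thread per (edge e, output channel m_out): sum the spline-weighted
// contributions of all S basis terms over the M_in input channels.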
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
cudaSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
spline_weighting_fw_kernel<scalar_t>
<<<BLOCKS(out.numel()), THREADS, 0, stream>>>(
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
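// One thread per (edge e, input channel m_in), reading the weight tensor
// transposed to (wi, m_out, m_in) by the host for contiguous access.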
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
cudaSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
spline_weighting_bw_x_kernel<scalar_t>
<<<BLOCKS(grad_x.numel()), THREADS, 0, stream>>>(
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
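// Threads iterate over (edge e, output channel m_out); different edges can
// share a weight index wi, so gradients are scattered with atomic adds.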
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
cudaSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
spline_weighting_bw_weight_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
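// One thread per (edge e, output channel m_out); partial products over the
// input channels are scatter-added into grad_basis[e, s] atomically.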
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
cudaSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
spline_weighting_bw_basis_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
#pragma once
#include <torch/extension.h>
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size);
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
......@@ -14,8 +14,8 @@ PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
#endif
int64_t cuda_version() {
#ifdef WITH_HIP
return TORCH_HIP_VERSION;
#ifdef WITH_CUDA
return CUDA_VERSION;
#else
return -1;
#endif
......
......@@ -3,12 +3,12 @@
#include "cpu/weighting_cpu.h"
#ifdef WITH_HIP
#include "hip/weighting_hip.h"
#ifdef WITH_CUDA
#include "cuda/weighting_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__weighting_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__weighting_cpu(void) { return NULL; }
......@@ -19,7 +19,7 @@ torch::Tensor spline_weighting_fw(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
if (x.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spline_weighting_fw_cuda(x, weight, basis, weight_index);
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -33,7 +33,7 @@ torch::Tensor spline_weighting_bw_x(torch::Tensor grad_out,
torch::Tensor weight, torch::Tensor basis,
torch::Tensor weight_index) {
if (grad_out.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spline_weighting_bw_x_cuda(grad_out, weight, basis, weight_index);
#else
AT_ERROR("Not compiled with CUDA support");
......@@ -48,7 +48,7 @@ torch::Tensor spline_weighting_bw_weight(torch::Tensor grad_out,
torch::Tensor weight_index,
int64_t kernel_size) {
if (grad_out.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spline_weighting_bw_weight_cuda(grad_out, x, basis, weight_index,
kernel_size);
#else
......@@ -64,7 +64,7 @@ torch::Tensor spline_weighting_bw_basis(torch::Tensor grad_out, torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
if (grad_out.device().is_cuda()) {
#ifdef WITH_HIP
#ifdef WITH_CUDA
return spline_weighting_bw_basis_cuda(grad_out, x, weight, weight_index);
#else
AT_ERROR("Not compiled with CUDA support");
......
#!/bin/bash
if [ "${TRAVIS_OS_NAME}" = "linux" ]; then
wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
chmod +x miniconda.sh
./miniconda.sh -b
PATH=/home/travis/miniconda3/bin:${PATH}
fi
if [ "${TRAVIS_OS_NAME}" = "osx" ]; then
wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
chmod +x miniconda.sh
./miniconda.sh -b
PATH=/Users/travis/miniconda3/bin:${PATH}
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ]; then
choco install openssl.light
choco install miniconda3
PATH=/c/tools/miniconda3/Scripts:$PATH
fi
conda update --yes conda
conda create --yes -n test python="${PYTHON_VERSION}"
#!/bin/bash
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cpu" ]; then
export TOOLKIT=cpuonly
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu92" ]; then
export CUDA_SHORT=9.2
export CUDA=9.2.148-1
export UBUNTU_VERSION=ubuntu1604
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu101" ]; then
export IDX=cu101
export CUDA_SHORT=10.1
export CUDA=10.1.243-1
export UBUNTU_VERSION=ubuntu1804
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu102" ]; then
export CUDA_SHORT=10.2
export CUDA=10.2.89-1
export UBUNTU_VERSION=ubuntu1804
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu110" ]; then
export CUDA_SHORT=11.0
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cpu" ]; then
export TOOLKIT=cpuonly
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu92" ]; then
export CUDA_SHORT=9.2
export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod2/local_installers2
export CUDA_FILE=cuda_${CUDA_SHORT}.148_win10
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu101" ]; then
export CUDA_SHORT=10.1
export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.105_418.96_win10.exe
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu102" ]; then
export CUDA_SHORT=10.2
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.89_441.22_win10.exe
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu110" ]; then
export CUDA_SHORT=11.0
export CUDA_URL=https://developer.download.nvidia.com/compute/cuda/${CUDA_SHORT}.2/local_installers
export CUDA_FILE=cuda_${CUDA_SHORT}.2_451.48_win10.exe
export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
fi
if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "$IDX" = "cpu" ]; then
export TOOLKIT=""
fi
if [ "${IDX}" = "cpu" ]; then
export FORCE_ONLY_CPU=1
else
export FORCE_CUDA=1
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" != "cpu" ] && [ "${IDX}" != "cu110" ]; then
INSTALLER="cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb"
wget -nv "http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}"
sudo dpkg -i "${INSTALLER}"
wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
sudo apt-key add 7fa2af80.pub
sudo apt update -qq
sudo apt install "cuda-core-${CUDA_SHORT/./-}" "cuda-nvcc-${CUDA_SHORT/./-}" "cuda-libraries-dev-${CUDA_SHORT/./-}"
sudo apt clean
CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
nvcc --version
# Fix cublas on CUDA 10.1:
if [ -d "/usr/local/cuda-10.2/targets/x86_64-linux/include" ]; then
sudo cp -r /usr/local/cuda-10.2/targets/x86_64-linux/include/* "${CUDA_HOME}/include/"
fi
if [ -d "/usr/local/cuda-10.2/targets/x86_64-linux/lib" ]; then
sudo cp -r /usr/local/cuda-10.2/targets/x86_64-linux/lib/* "${CUDA_HOME}/lib/"
fi
fi
if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" = "cu110" ]; then
wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget -nv https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda-repo-ubuntu1804-11-0-local_11.0.3-450.51.06-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu1804-11-0-local_11.0.3-450.51.06-1_amd64.deb
sudo apt-key add /var/cuda-repo-ubuntu1804-11-0-local/7fa2af80.pub
sudo apt update -qq
sudo apt install cuda-nvcc-11-0 cuda-libraries-dev-11-0
sudo apt clean
CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
PATH=${CUDA_HOME}/bin:${PATH}
nvcc --version
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" != "cpu" ]; then
# Install NVIDIA drivers, see:
# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip"
7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32"
# Install CUDA:
wget -nv "${CUDA_URL}/${CUDA_FILE}"
PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v${CUDA_SHORT}
PATH=${CUDA_HOME}/bin:$PATH
PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH
nvcc --version
fi
#!/bin/bash
if [ "${TRAVIS_OS_NAME}" = "linux" ]; then
sudo add-apt-repository ppa:ubuntu-toolchain-r/test --yes
sudo apt update
sudo apt install gcc-7 g++-7 --yes
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 60 \
--slave /usr/bin/g++ g++ /usr/bin/g++-7
sudo update-alternatives --config gcc
gcc --version
g++ --version
fi
#!/bin/bash
# https://github.com/pytorch/pytorch/commit/d2e16dd888a9b5fd55bd475d4fcffb70f388d4f0
if [ "${TRAVIS_OS_NAME}" = "windows" ]; then
echo "Fix nvcc for PyTorch"
sed -i.bak -e 's/CONSTEXPR_EXCEPT_WIN_CUDA/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/api/module.h
sed -i.bak -e 's/return \*(this->value)/return \*((type\*)this->value)/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/pybind11/cast.h
fi
if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${TORCH_VERSION}" = "1.7.0" ]; then
echo "Fix nvcc for PyTorch 1.7.0"
sed -i.bak '/static constexpr Symbol Kind/d' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/ir/ir.h
fi
......@@ -2,12 +2,7 @@
description-file = README.md
[aliases]
test = pytest
test=pytest
[tool:pytest]
addopts = --cov
[egg_info]
tag_build =
tag_date = 0
......@@ -8,16 +8,14 @@ import torch
from torch.utils.cpp_extension import BuildExtension
from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
WITH_HIP = torch.cuda.is_available() and CUDA_HOME is not None
suffices = ['cpu', 'cuda'] if WITH_HIP else ['cpu']
WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
if os.getenv('FORCE_CUDA', '0') == '1':
suffices = ['cuda', 'cpu']
if os.getenv('FORCE_ONLY_HIP', '0') == '1':
suffices = ['hip']
if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
suffices = ['cuda']
if os.getenv('FORCE_ONLY_CPU', '0') == '1':
suffices = ['cpu']
ROCM_PATH = os.getenv('ROCM_PATH')
HIPLIB = osp.join(ROCM_PATH, 'hipsparse', 'include')
BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
......@@ -33,12 +31,12 @@ def get_extensions():
extra_compile_args = {'cxx': ['-O2']}
extra_link_args = ['-s']
if suffix == 'hip':
define_macros += [('WITH_HIP', None)]
hipcc_flags = os.getenv('HIPCC_FLAGS', '')
hipcc_flags = [] if hipcc_flags == '' else hipcc_flags.split(' ')
hipcc_flags += ['-arch=sm_35', '--expt-relaxed-constexpr', '-O2']
extra_compile_args['hipcc'] = hipcc_flags
if suffix == 'cuda':
define_macros += [('WITH_CUDA', None)]
nvcc_flags = os.getenv('NVCC_FLAGS', '')
nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
nvcc_flags += ['-arch=sm_35', '--expt-relaxed-constexpr', '-O2']
extra_compile_args['nvcc'] = nvcc_flags
name = main.split(os.sep)[-1][:-4]
sources = [main]
......@@ -47,16 +45,15 @@ def get_extensions():
if osp.exists(path):
sources += [path]
path = osp.join(extensions_dir, 'hip', f'{name}_hip.hip')
if suffix == 'hip' and osp.exists(path):
path = osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')
if suffix == 'cuda' and osp.exists(path):
sources += [path]
Extension = CppExtension if suffix == 'cpu' else CUDAExtension
define_macros += [('TORCH_HIP_VERSION', 10000), ('__HIP__', None), ('__HCC__', None)]
extension = Extension(
f'torch_spline_conv._{name}_{suffix}',
sources,
include_dirs=[extensions_dir, HIPLIB],
include_dirs=[extensions_dir],
define_macros=define_macros,
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
......
from itertools import product
import pytest
import torch
from torch_spline_conv import spline_basis
from .utils import dtypes, devices, tensor
tests = [{
'pseudo': [[0], [0.0625], [0.25], [0.75], [0.9375], [1]],
'kernel_size': [5],
'is_open_spline': [1],
'basis': [[1, 0], [0.75, 0.25], [1, 0], [1, 0], [0.25, 0.75], [1, 0]],
'weight_index': [[0, 1], [0, 1], [1, 2], [3, 4], [3, 4], [4, 0]],
}, {
'pseudo': [[0], [0.0625], [0.25], [0.75], [0.9375], [1]],
'kernel_size': [4],
'is_open_spline': [0],
'basis': [[1, 0], [0.75, 0.25], [1, 0], [1, 0], [0.25, 0.75], [1, 0]],
'weight_index': [[0, 1], [0, 1], [1, 2], [3, 0], [3, 0], [0, 1]],
}, {
'pseudo': [[0.125, 0.5], [0.5, 0.5], [0.75, 0.125]],
'kernel_size': [5, 5],
'is_open_spline': [1, 1],
'basis': [[0.5, 0.5, 0, 0], [1, 0, 0, 0], [0.5, 0, 0.5, 0]],
'weight_index': [[10, 11, 15, 16], [12, 13, 17, 18], [3, 4, 8, 9]]
}]
@pytest.mark.parametrize('test,dtype,device', product(tests, dtypes, devices))
def test_spline_basis_forward(test, dtype, device):
    pseudo = tensor(test['pseudo'], dtype, device)
    kernel_size = tensor(test['kernel_size'], torch.long, device)
    is_open_spline = tensor(test['is_open_spline'], torch.uint8, device)
    degree = 1
    basis, weight_index = spline_basis(pseudo, kernel_size, is_open_spline,
                                       degree)
    assert basis.tolist() == test['basis']
    assert weight_index.tolist() == test['weight_index']