".github/scripts/unittest-linux/run_test.sh" did not exist on "6bc591ec7995481a483079a8b71f5c923e8e0ace"
Commit b73cc994 authored by quyuanhao123's avatar quyuanhao123
Browse files

Initial commit

parents
Pipeline #193 failed with stages
in 0 seconds
#pragma once
#include <torch/extension.h>
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree);
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree);
#include "hip/hip_runtime.h"
#include "basis_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
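// Basis<scalar_t, degree>: evaluates the k_mod-th segment of the B-spline
// basis of the given degree (1, 2 or 3) at the local coordinate v in [0, 1);
// backward() returns the derivative of that segment with respect to v.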
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
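// Forward kernel: one thread per (edge e, basis product s). Each thread walks
// over the D pseudo-coordinate dimensions, multiplying the per-dimension basis
// values into basis[e, s] and accumulating the flattened index of the
// corresponding kernel weight into weight_index[e, s].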
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim() == 1);
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_fw_kernel<scalar_t, DEGREE>
<<<BLOCKS(basis.numel()), THREADS, 0, stream>>>(
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
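// Backward kernel: one thread per (edge e, dimension d). For each of the S
// basis products it multiplies the basis derivative in dimension d with the
// forward basis values of all other dimensions (chain rule), then rescales by
// the same factor that was applied to the pseudo coordinate in the forward pass.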
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim() == 1);
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
spline_basis_bw_kernel<scalar_t, DEGREE>
<<<BLOCKS(grad_pseudo.numel()), THREADS, 0, stream>>>(
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#include "hip/hip_runtime.h"
#include "basis_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t, int64_t degree> struct Basis {
static inline __device__ scalar_t forward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 1. - v - k_mod + 2. * v * k_mod;
} else if (degree == 2) {
if (k_mod == 0)
return 0.5 * v * v - v + 0.5;
else if (k_mod == 1)
return -v * v + v + 0.5;
else
return 0.5 * v * v;
} else if (degree == 3) {
if (k_mod == 0)
return (1. - v) * (1. - v) * (1. - v) / 6.;
else if (k_mod == 1)
return (3. * v * v * v - 6. * v * v + 4.) / 6.;
else if (k_mod == 2)
return (-3. * v * v * v + 3. * v * v + 3. * v + 1.) / 6.;
else
return v * v * v / 6.;
} else {
return (scalar_t)-1.;
}
}
static inline __device__ scalar_t backward(scalar_t v, int64_t k_mod) {
if (degree == 1) {
return 2 * k_mod - 1;
} else if (degree == 2) {
if (k_mod == 0)
return v - 1.;
else if (k_mod == 1)
return -2. * v + 1.;
else
return v;
} else if (degree == 3) {
if (k_mod == 0)
return (-v * v + 2. * v - 1.) / 2.;
else if (k_mod == 1)
return (3. * v * v - 4. * v) / 2.;
else if (k_mod == 2)
return (-3. * v * v + 2. * v + 1.) / 2.;
else
return v * v / 2.;
} else {
return (scalar_t)-1.;
}
}
};
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_fw_kernel(const scalar_t *pseudo, const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *basis,
int64_t *weight_index, int64_t E, int64_t D, int64_t S,
int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / S;
const int64_t s = thread_idx % S;
if (thread_idx < numel) {
int64_t k = s, wi = 0, wi_offset = 1;
scalar_t b = (scalar_t)1.;
for (int64_t d = 0; d < D; d++) {
const int64_t k_mod = k % (degree + 1);
k /= degree + 1;
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
wi += (((int64_t)v + k_mod) % kernel_size[d]) * wi_offset;
wi_offset *= kernel_size[d];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
b *= v;
}
basis[thread_idx] = b;
weight_index[thread_idx] = wi;
}
}
std::tuple<torch::Tensor, torch::Tensor>
spline_basis_fw_cuda(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree) {
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(pseudo.get_device());
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim() == 1);
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = (int64_t)(powf(degree + 1, D) + 0.5);
auto basis = at::empty({E, S}, pseudo.options());
auto weight_index = at::empty({E, S}, kernel_size.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_fw", [&] {
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
hipLaunchKernelGGL(( spline_basis_fw_kernel<scalar_t, DEGREE>)
, dim3(BLOCKS(basis.numel())), dim3(THREADS), 0, stream,
pseudo_data, kernel_size_data, is_open_spline_data, basis_data,
weight_index_data, E, D, S, basis.numel());
});
});
return std::make_tuple(basis, weight_index);
}
template <typename scalar_t, int64_t degree>
__global__ void
spline_basis_bw_kernel(const scalar_t *grad_basis, const scalar_t *pseudo,
const int64_t *kernel_size,
const uint8_t *is_open_spline, scalar_t *grad_pseudo,
int64_t E, int64_t D, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / D;
const int64_t d = thread_idx % D;
if (thread_idx < numel) {
scalar_t g = (scalar_t)0., tmp;
for (ptrdiff_t s = 0; s < S; s++) {
int64_t k_mod = (s / (int64_t)(powf(degree + 1, d) + 0.5)) % (degree + 1);
scalar_t v = pseudo[e * D + d];
v *= kernel_size[d] - degree * is_open_spline[d];
v -= floor(v);
v = Basis<scalar_t, degree>::backward(v, k_mod);
tmp = v;
for (int64_t d_it = 1; d_it < D; d_it++) {
const int64_t d_new = d_it - (d >= d_it);
k_mod = (s / (int64_t)(powf(degree + 1, d_new) + 0.5)) % (degree + 1);
v = pseudo[e * D + d_new];
v *= kernel_size[d_new] - degree * is_open_spline[d_new];
v -= floor(v);
v = Basis<scalar_t, degree>::forward(v, k_mod);
tmp *= v;
}
g += tmp * grad_basis[e * S + s];
}
g *= kernel_size[d] - degree * is_open_spline[d];
grad_pseudo[thread_idx] = g;
}
}
torch::Tensor spline_basis_bw_cuda(torch::Tensor grad_basis,
torch::Tensor pseudo,
torch::Tensor kernel_size,
torch::Tensor is_open_spline,
int64_t degree) {
CHECK_CUDA(grad_basis);
CHECK_CUDA(pseudo);
CHECK_CUDA(kernel_size);
CHECK_CUDA(is_open_spline);
hipSetDevice(grad_basis.get_device());
CHECK_INPUT(grad_basis.size(0) == pseudo.size(0));
CHECK_INPUT(kernel_size.dim() == 1);
CHECK_INPUT(pseudo.size(1) == kernel_size.numel());
CHECK_INPUT(is_open_spline.dim() == 1);
CHECK_INPUT(pseudo.size(1) == is_open_spline.numel());
auto E = pseudo.size(0);
auto D = pseudo.size(1);
auto S = grad_basis.size(1);
auto grad_pseudo = at::empty({E, D}, pseudo.options());
auto kernel_size_data = kernel_size.data_ptr<int64_t>();
auto is_open_spline_data = is_open_spline.data_ptr<uint8_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(pseudo.scalar_type(), "basis_bw", [&] {
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
auto pseudo_data = pseudo.data_ptr<scalar_t>();
auto grad_pseudo_data = grad_pseudo.data_ptr<scalar_t>();
AT_DISPATCH_DEGREE_TYPES(degree, [&] {
hipLaunchKernelGGL(( spline_basis_bw_kernel<scalar_t, DEGREE>)
, dim3(BLOCKS(grad_pseudo.numel())), dim3(THREADS), 0, stream,
grad_basis_data, pseudo_data, kernel_size_data,
is_open_spline_data, grad_pseudo_data, E, D, S,
grad_pseudo.numel());
});
});
return grad_pseudo;
}
#pragma once
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
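// Dispatches a runtime spline degree (1, 2 or 3) to the compile-time constant
// DEGREE so that Basis<scalar_t, DEGREE> can be instantiated per degree.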
#define AT_DISPATCH_DEGREE_TYPES(degree, ...) \
[&] { \
switch (degree) { \
case 1: { \
const int64_t DEGREE = 1; \
return __VA_ARGS__(); \
} \
case 2: { \
const int64_t DEGREE = 2; \
return __VA_ARGS__(); \
} \
case 3: { \
const int64_t DEGREE = 3; \
return __VA_ARGS__(); \
} \
default: \
AT_ERROR("Basis degree not implemented"); \
} \
}()
#pragma once
#include <torch/extension.h>
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index);
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size);
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index);
#include "hip/hip_runtime.h"
#include "weighting_hip.h"
#include <ATen/hip/HIPContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
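// Forward kernel: one thread per output entry (e, m_out). Accumulates, over
// the S active basis terms and M_in input channels,
// basis[e, s] * x[e, m_in] * weight[weight_index[e, s], m_in, m_out].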
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
spline_weighting_fw_kernel<scalar_t>
<<<BLOCKS(out.numel()), THREADS, 0, stream>>>(
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
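// Backward w.r.t. x: one thread per (e, m_in). The caller passes weight
// transposed to [K, M_out, M_in] so that consecutive threads (consecutive
// m_in) read consecutive weight entries (coalesced access).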
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
spline_weighting_bw_x_kernel<scalar_t>
<<<BLOCKS(grad_x.numel()), THREADS, 0, stream>>>(
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
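// Backward w.r.t. weight: one thread per (e, m_out). Scatters contributions
// into grad_weight with atomAdd, since several edges can reference the same
// kernel weight.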
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
spline_weighting_bw_weight_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
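// Backward w.r.t. basis: one thread per (e, m_out); partial sums over the
// input channels are combined across m_out threads with atomAdd into
// grad_basis[e, s].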
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const size_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
spline_weighting_bw_basis_kernel<scalar_t>
<<<BLOCKS(grad_out.numel()), THREADS, 0, stream>>>(
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
#include "hip/hip_runtime.h"
#include "weighting_hip.h"
#include <ATen/hip/HIPContext.h>
#include "atomics.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t>
__global__ void
spline_weighting_fw_kernel(const scalar_t *x, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *out, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (ptrdiff_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
scalar_t tmp = weight[wi * M_in * M_out + m_in * M_out + m_out];
tmp *= b * x[e * M_in + m_in];
v += tmp;
}
}
out[thread_idx] = v;
}
}
torch::Tensor spline_weighting_fw_cuda(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(x.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
auto E = x.size(0);
auto M_in = x.size(1);
auto M_out = weight.size(2);
auto S = basis.size(1);
auto out = at::empty({E, M_out}, x.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_fw", [&] {
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_fw_kernel<scalar_t>)
, dim3(BLOCKS(out.numel())), dim3(THREADS), 0, stream,
x_data, weight_data, basis_data, weight_index_data, out_data, E,
M_in, M_out, S, out.numel());
});
return out;
}
template <typename scalar_t>
__global__ void
spline_weighting_bw_x_kernel(const scalar_t *grad_out, const scalar_t *weight,
const scalar_t *basis, const int64_t *weight_index,
scalar_t *grad_x, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_in;
const int64_t m_in = thread_idx % M_in;
if (thread_idx < numel) {
scalar_t v = (scalar_t)0.;
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_out = 0; m_out < M_out; m_out++) {
scalar_t tmp = weight[wi * M_out * M_in + m_out * M_in + m_in];
tmp *= b * grad_out[e * M_out + m_out];
v += tmp;
}
}
grad_x[thread_idx] = v;
}
}
torch::Tensor spline_weighting_bw_x_cuda(torch::Tensor grad_out,
torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(weight);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = weight.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_x = at::zeros({E, M_in}, grad_out.options());
weight = weight.transpose(1, 2).contiguous(); // Contiguous memory-access.
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(grad_out.scalar_type(), "weighting_bw_x", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_x_data = grad_x.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_x_kernel<scalar_t>)
, dim3(BLOCKS(grad_x.numel())), dim3(THREADS), 0, stream,
grad_out_data, weight_data, basis_data, weight_index_data,
grad_x_data, E, M_in, M_out, S, grad_x.numel());
});
return grad_x;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_weight_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *basis,
const int64_t *weight_index, scalar_t *grad_weight, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
auto g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
const scalar_t b = basis[e * S + s];
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
auto v = g * b * x[e * M_in + m_in];
atomAdd(&grad_weight[wi * M_in * M_out + m_in * M_out + m_out], v);
}
}
}
}
torch::Tensor spline_weighting_bw_weight_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(basis);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = basis.size(1);
auto grad_weight = at::zeros({kernel_size, M_in, M_out}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_weight", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto basis_data = basis.data_ptr<scalar_t>();
auto grad_weight_data = grad_weight.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_weight_kernel<scalar_t>)
, dim3(BLOCKS(grad_out.numel())), dim3(THREADS), 0, stream,
grad_out_data, x_data, basis_data, weight_index_data,
grad_weight_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_weight;
}
template <typename scalar_t>
__global__ void spline_weighting_bw_basis_kernel(
const scalar_t *grad_out, const scalar_t *x, const scalar_t *weight,
const int64_t *weight_index, scalar_t *grad_basis, int64_t E, int64_t M_in,
int64_t M_out, int64_t S, int64_t numel) {
const size_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t e = thread_idx / M_out;
const int64_t m_out = thread_idx % M_out;
if (thread_idx < numel) {
const scalar_t g = grad_out[e * M_out + m_out];
for (int64_t s = 0; s < S; s++) {
scalar_t v = (scalar_t)0.;
const int64_t wi = weight_index[e * S + s];
for (int64_t m_in = 0; m_in < M_in; m_in++) {
const scalar_t w = weight[wi * M_in * M_out + m_in * M_out + m_out];
v += g * w * x[e * M_in + m_in];
}
atomAdd(&grad_basis[e * S + s], v);
}
}
}
torch::Tensor spline_weighting_bw_basis_cuda(torch::Tensor grad_out,
torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
CHECK_CUDA(grad_out);
CHECK_CUDA(x);
CHECK_CUDA(weight);
CHECK_CUDA(weight_index);
hipSetDevice(grad_out.get_device());
CHECK_INPUT(x.size(1) == weight.size(1));
CHECK_INPUT(grad_out.size(1) == weight.size(2));
auto E = grad_out.size(0);
auto M_in = x.size(1);
auto M_out = grad_out.size(1);
auto S = weight_index.size(1);
auto grad_basis = at::zeros({E, S}, grad_out.options());
auto weight_index_data = weight_index.data_ptr<int64_t>();
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "weighting_bw_basis", [&] {
auto grad_out_data = grad_out.data_ptr<scalar_t>();
auto x_data = x.data_ptr<scalar_t>();
auto weight_data = weight.data_ptr<scalar_t>();
auto grad_basis_data = grad_basis.data_ptr<scalar_t>();
hipLaunchKernelGGL(( spline_weighting_bw_basis_kernel<scalar_t>)
, dim3(BLOCKS(grad_out.numel())), dim3(THREADS), 0, stream,
grad_out_data, x_data, weight_data, weight_index_data,
grad_basis_data, E, M_in, M_out, S, grad_out.numel());
});
return grad_basis;
}
#pragma once
#include <torch/extension.h>
int64_t cuda_version();
std::tuple<torch::Tensor, torch::Tensor>
spline_basis(torch::Tensor pseudo, torch::Tensor kernel_size,
torch::Tensor is_open_spline, int64_t degree);
torch::Tensor spline_weighting(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis, torch::Tensor weight_index);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_HIP
#include <hip/hip_runtime.h>
#endif
#ifdef _WIN32
#ifdef WITH_HIP
PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
#endif
#endif
int64_t cuda_version() {
#ifdef WITH_HIP
return TORCH_HIP_VERSION;
#else
return -1;
#endif
}
static auto registry = torch::RegisterOperators().op(
"torch_spline_conv::cuda_version", &cuda_version);
#include <Python.h>
#include <torch/script.h>
#include "cpu/weighting_cpu.h"
#ifdef WITH_HIP
#include "hip/weighting_hip.h"
#endif
#ifdef _WIN32
#ifdef WITH_HIP
PyMODINIT_FUNC PyInit__weighting_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__weighting_cpu(void) { return NULL; }
#endif
#endif
torch::Tensor spline_weighting_fw(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
if (x.device().is_cuda()) {
#ifdef WITH_HIP
return spline_weighting_fw_cuda(x, weight, basis, weight_index);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return spline_weighting_fw_cpu(x, weight, basis, weight_index);
}
}
torch::Tensor spline_weighting_bw_x(torch::Tensor grad_out,
torch::Tensor weight, torch::Tensor basis,
torch::Tensor weight_index) {
if (grad_out.device().is_cuda()) {
#ifdef WITH_HIP
return spline_weighting_bw_x_cuda(grad_out, weight, basis, weight_index);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return spline_weighting_bw_x_cpu(grad_out, weight, basis, weight_index);
}
}
torch::Tensor spline_weighting_bw_weight(torch::Tensor grad_out,
torch::Tensor x, torch::Tensor basis,
torch::Tensor weight_index,
int64_t kernel_size) {
if (grad_out.device().is_cuda()) {
#ifdef WITH_HIP
return spline_weighting_bw_weight_cuda(grad_out, x, basis, weight_index,
kernel_size);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return spline_weighting_bw_weight_cpu(grad_out, x, basis, weight_index,
kernel_size);
}
}
torch::Tensor spline_weighting_bw_basis(torch::Tensor grad_out, torch::Tensor x,
torch::Tensor weight,
torch::Tensor weight_index) {
if (grad_out.device().is_cuda()) {
#ifdef WITH_HIP
return spline_weighting_bw_basis_cuda(grad_out, x, weight, weight_index);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return spline_weighting_bw_basis_cpu(grad_out, x, weight, weight_index);
}
}
using torch::autograd::AutogradContext;
using torch::autograd::Variable;
using torch::autograd::variable_list;
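// Autograd wrapper: forward() saves (x, weight, basis, weight_index) and
// backward() dispatches to the three gradient functions above; weight_index is
// an integer tensor and receives no gradient.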
class SplineWeighting : public torch::autograd::Function<SplineWeighting> {
public:
static variable_list forward(AutogradContext *ctx, Variable x,
Variable weight, Variable basis,
Variable weight_index) {
auto out = spline_weighting_fw(x, weight, basis, weight_index);
ctx->save_for_backward({x, weight, basis, weight_index});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto x = saved[0], weight = saved[1], basis = saved[2],
weight_index = saved[3];
auto grad_x = Variable();
if (torch::autograd::any_variable_requires_grad({x})) {
grad_x = spline_weighting_bw_x(grad_out, weight, basis, weight_index);
}
auto grad_weight = Variable();
if (torch::autograd::any_variable_requires_grad({weight})) {
grad_weight = spline_weighting_bw_weight(grad_out, x, basis, weight_index,
weight.size(0));
}
auto grad_basis = Variable();
if (torch::autograd::any_variable_requires_grad({basis})) {
grad_basis = spline_weighting_bw_basis(grad_out, x, weight, weight_index);
}
return {grad_x, grad_weight, grad_basis, Variable()};
}
};
torch::Tensor spline_weighting(torch::Tensor x, torch::Tensor weight,
torch::Tensor basis,
torch::Tensor weight_index) {
x = x.contiguous();
weight = weight.contiguous();
return SplineWeighting::apply(x, weight, basis, weight_index)[0];
}
static auto registry = torch::RegisterOperators().op(
"torch_spline_conv::spline_weighting", &spline_weighting);
#!/bin/bash
source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10_py39_dtk22.10
module purge
module load compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 #compiler/dtk/22.10.1
module list
source ~/dtk-22.10.1/env.sh
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/gflags-2.1.2-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=/public/software/apps/DeepLearning/PyTorch_Lib/glog-build/include:$CPLUS_INCLUDE_PATH
export C_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$C_INCLUDE_PATH
export CPLUS_INCLUDE_PATH=$ROCM_PATH/rocrand/include:$CPLUS_INCLUDE_PATH
export LD_LIBRARY_PATH=$ROCM_PATH/rocrand/lib:$LD_LIBRARY_PATH
export FORCE_ONLY_HIP=1
export CC=hipcc
export CXX=hipcc
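# With this environment sourced, the extension is then typically built with the
# setup.py below, e.g. (hypothetical invocation): python setup.py install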
[metadata]
description-file = README.md
[aliases]
test = pytest
[tool:pytest]
addopts = --cov
[egg_info]
tag_build =
tag_date = 0
import os
import glob
import os.path as osp
from itertools import product
from setuptools import setup, find_packages
import torch
from torch.utils.cpp_extension import BuildExtension
from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
WITH_HIP = torch.cuda.is_available() and CUDA_HOME is not None
suffices = ['cpu', 'cuda'] if WITH_HIP else ['cpu']
if os.getenv('FORCE_CUDA', '0') == '1':
suffices = ['cuda', 'cpu']
if os.getenv('FORCE_ONLY_HIP', '0') == '1':
suffices = ['hip']
if os.getenv('FORCE_ONLY_CPU', '0') == '1':
suffices = ['cpu']
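# With FORCE_ONLY_HIP=1 only the 'hip' suffix is built: each top-level .cpp in
# csrc is paired with its csrc/hip/{name}_hip.hip counterpart in
# get_extensions() below and compiled as a CUDAExtension.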
ROCM_PATH = os.getenv('ROCM_PATH')
HIPLIB = osp.join(ROCM_PATH, 'hipsparse', 'include')
BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
def get_extensions():
extensions = []
extensions_dir = osp.join(osp.dirname(osp.abspath(__file__)), 'csrc')
main_files = glob.glob(osp.join(extensions_dir, '*.cpp'))
for main, suffix in product(main_files, suffices):
define_macros = []
extra_compile_args = {'cxx': ['-O2']}
extra_link_args = ['-s']
if suffix == 'hip':
define_macros += [('WITH_HIP', None)]
hipcc_flags = os.getenv('HIPCC_FLAGS', '')
hipcc_flags = [] if hipcc_flags == '' else hipcc_flags.split(' ')
hipcc_flags += ['-arch=sm_35', '--expt-relaxed-constexpr', '-O2']
extra_compile_args['hipcc'] = hipcc_flags
name = main.split(os.sep)[-1][:-4]
sources = [main]
path = osp.join(extensions_dir, 'cpu', f'{name}_cpu.cpp')
if osp.exists(path):
sources += [path]
path = osp.join(extensions_dir, 'hip', f'{name}_hip.hip')
if suffix == 'hip' and osp.exists(path):
sources += [path]
Extension = CppExtension if suffix == 'cpu' else CUDAExtension
define_macros += [('TORCH_HIP_VERSION', 10000), ('__HIP__', None), ('__HCC__', None)]
extension = Extension(
f'torch_spline_conv._{name}_{suffix}',
sources,
include_dirs=[extensions_dir, HIPLIB],
define_macros=define_macros,
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
)
extensions += [extension]
return extensions
install_requires = []
setup_requires = ['pytest-runner']
tests_require = ['pytest', 'pytest-cov']
setup(
name='torch_spline_conv',
version='1.2.1',
author='Matthias Fey',
author_email='matthias.fey@tu-dortmund.de',
url='https://github.com/rusty1s/pytorch_spline_conv',
description=('Implementation of the Spline-Based Convolution Operator of '
'SplineCNN in PyTorch'),
keywords=[
'pytorch',
'geometric-deep-learning',
'graph-neural-networks',
'spline-cnn',
],
license='MIT',
python_requires='>=3.6',
install_requires=install_requires,
setup_requires=setup_requires,
tests_require=tests_require,
ext_modules=get_extensions() if not BUILD_DOCS else [],
cmdclass={
'build_ext': BuildExtension.with_options(no_python_abi_suffix=True)
},
packages=find_packages(),
)
Metadata-Version: 2.1
Name: torch-spline-conv
Version: 1.2.1
Summary: Implementation of the Spline-Based Convolution Operator of SplineCNN in PyTorch
Home-page: https://github.com/rusty1s/pytorch_spline_conv
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: MIT
Keywords: pytorch,geometric-deep-learning,graph-neural-networks,spline-cnn
Requires-Python: >=3.6
License-File: LICENSE
LICENSE
MANIFEST.in
README.md
setup.cfg
setup.py
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/basis.cpp
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/version.cpp
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/weighting.cpp
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/cpu/basis_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/cpu/weighting_cpu.cpp
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/hip/basis_hip_hip.hip
/work/home/quyuanhao123/software/test_ocp/torch_spline_conv-1.2.1/csrc/hip/weighting_hip_hip.hip
csrc/basis.cpp
csrc/spline_conv.h
csrc/version.cpp
csrc/weighting.cpp
csrc/cpu/basis_cpu.cpp
csrc/cpu/basis_cpu.h
csrc/cpu/utils.h
csrc/cpu/weighting_cpu.cpp
csrc/cpu/weighting_cpu.h
csrc/hip/atomics.cuh
csrc/hip/basis_hip.h
csrc/hip/basis_hip.hip
csrc/hip/basis_hip_hip.hip
csrc/hip/utils.cuh
csrc/hip/weighting_hip.h
csrc/hip/weighting_hip.hip
csrc/hip/weighting_hip_hip.hip
torch_spline_conv/__init__.py
torch_spline_conv/basis.py
torch_spline_conv/conv.py
torch_spline_conv/weighting.py
torch_spline_conv.egg-info/PKG-INFO
torch_spline_conv.egg-info/SOURCES.txt
torch_spline_conv.egg-info/dependency_links.txt
torch_spline_conv.egg-info/top_level.txt
import importlib
import os.path as osp
import torch
__version__ = '1.2.1'
suffix = 'hip' if torch.cuda.is_available() else 'cpu'
for library in ['_version', '_basis', '_weighting']:
torch.ops.load_library(importlib.machinery.PathFinder().find_spec(
f'{library}_{suffix}', [osp.dirname(__file__)]).origin)
if torch.cuda.is_available(): # pragma: no cover
cuda_version = torch.ops.torch_spline_conv.cuda_version()
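# cuda_version encodes major/minor, e.g. 9020 -> 9.2 and 10020 -> 10.2;
# -1 means the extension was built without GPU support.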
if cuda_version == -1:
major = minor = 0
elif cuda_version < 10000:
major, minor = int(str(cuda_version)[0]), int(str(cuda_version)[2])
else:
major, minor = int(str(cuda_version)[0:2]), int(str(cuda_version)[3])
from .basis import spline_basis # noqa
from .weighting import spline_weighting # noqa
from .conv import spline_conv # noqa
__all__ = [
'spline_basis',
'spline_weighting',
'spline_conv',
'__version__',
]
from typing import Tuple
import torch
@torch.jit.script
def spline_basis(pseudo: torch.Tensor, kernel_size: torch.Tensor,
is_open_spline: torch.Tensor,
degree: int) -> Tuple[torch.Tensor, torch.Tensor]:
return torch.ops.torch_spline_conv.spline_basis(pseudo, kernel_size,
is_open_spline, degree)
from typing import Optional
import torch
from .basis import spline_basis
from .weighting import spline_weighting
@torch.jit.script
def spline_conv(x: torch.Tensor, edge_index: torch.Tensor,
pseudo: torch.Tensor, weight: torch.Tensor,
kernel_size: torch.Tensor, is_open_spline: torch.Tensor,
degree: int = 1, norm: bool = True,
root_weight: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
r"""Applies the spline-based convolution operator :math:`(f \star g)(i) =
\frac{1}{|\mathcal{N}(i)|} \sum_{l=1}^{M_{in}} \sum_{j \in \mathcal{N}(i)}
f_l(j) \cdot g_l(u(i, j))` over several node features of an input graph.
The kernel function :math:`g_l` is defined over the weighted B-spline
tensor product basis for a single input feature map :math:`l`.
Args:
x (:class:`Tensor`): Input node features of shape
(number_of_nodes x in_channels).
edge_index (:class:`LongTensor`): Graph edges, given by source and
target indices, of shape (2 x number_of_edges).
pseudo (:class:`Tensor`): Edge attributes, i.e. pseudo coordinates,
of shape (number_of_edges x number_of_edge_attributes) in the
fixed interval [0, 1].
weight (:class:`Tensor`): Trainable weight parameters of shape
(kernel_size x in_channels x out_channels).
kernel_size (:class:`LongTensor`): Number of trainable weight
parameters in each edge dimension.
is_open_spline (:class:`ByteTensor`): Whether to use open or closed
B-spline bases for each dimension.
degree (int, optional): B-spline basis degree. (default: :obj:`1`)
norm (bool, optional): Whether to normalize output by node degree.
(default: :obj:`True`)
root_weight (:class:`Tensor`, optional): Additional shared trainable
parameters for each feature of the root node of shape
(in_channels x out_channels). (default: :obj:`None`)
bias (:class:`Tensor`, optional): Optional bias of shape
(out_channels). (default: :obj:`None`)
:rtype: :class:`Tensor`
"""
x = x.unsqueeze(-1) if x.dim() == 1 else x
pseudo = pseudo.unsqueeze(-1) if pseudo.dim() == 1 else pseudo
row, col = edge_index[0], edge_index[1]
N, E, M_out = x.size(0), row.size(0), weight.size(2)
# Weight each node.
basis, weight_index = spline_basis(pseudo, kernel_size, is_open_spline,
degree)
out = spline_weighting(x[col], weight, basis, weight_index)
# Convert E x M_out to N x M_out features.
row_expanded = row.unsqueeze(-1).expand_as(out)
out = x.new_zeros((N, M_out)).scatter_add_(0, row_expanded, out)
# Normalize out by node degree (if wished).
if norm:
ones = torch.ones(E, dtype=x.dtype, device=x.device)
deg = out.new_zeros(N).scatter_add_(0, row, ones)
out = out / deg.unsqueeze(-1).clamp_(min=1)
# Weight root node separately (if wished).
if root_weight is not None:
out = out + torch.matmul(x, root_weight)
# Add bias (if wished).
if bias is not None:
out = out + bias
return out
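# Usage sketch for spline_conv (hypothetical shapes and values; assumes the
# built torch_spline_conv extension is importable):
#
#   import torch
#   from torch_spline_conv import spline_conv
#
#   x = torch.rand(4, 8)                               # 4 nodes, 8 input channels
#   edge_index = torch.tensor([[0, 1, 1, 2, 3],        # source nodes
#                              [1, 0, 2, 3, 1]])       # target nodes
#   pseudo = torch.rand(edge_index.size(1), 2)         # 2-D edge attributes in [0, 1]
#   kernel_size = torch.tensor([5, 5])                 # 5 basis functions per dimension
#   is_open_spline = torch.tensor([1, 1], dtype=torch.uint8)
#   weight = torch.rand(25, 8, 16)                     # 25 = 5 * 5 kernel weights
#
#   out = spline_conv(x, edge_index, pseudo, weight, kernel_size,
#                     is_open_spline, degree=1, norm=True)  # -> shape [4, 16]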