Commit e2e17501 authored by Jan Eric Lenssen's avatar Jan Eric Lenssen
Browse files

first version bp_to_adj, not working atm

parent 8383b476
import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source template for degree-1 (linear) B-spline basis evaluation.
# Placeholders (${Dtype}, ${k_max}, ${dim}, ${K}) are substituted by
# `load_kernel`. One thread per (edge, basis-combination) pair: it writes the
# product of per-dimension basis weights into `amount` and the flattened
# kernel-table index into `index`.
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx];
value *= kernel_size[d_idx] - is_open_spline[d_idx];
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# CUDA source template for degree-2 (quadratic) B-spline basis evaluation.
# Same output contract as the linear variant; each dimension contributes one
# of three quadratic basis segments selected by `k_idx_mod`.
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# CUDA source template for degree-3 (cubic) B-spline basis evaluation.
# BUGFIX: the original read `CUDA_KERNEL_LOOP(idx, num_threads})` — the stray
# `}` is a CUDA syntax error that made the kernel fail to compile.
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 4;
k_idx /= 4;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (3 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= (1 - frac) * (1 - frac) * (1 - frac) / 6.0;
else if (k_idx_mod == 1)
a *= (3 * frac * frac * frac - 6 * frac * frac + 4) / 6.0;
else if (k_idx_mod == 2)
a *= (-3 * frac * frac * frac + 3 * frac * frac + 3 * frac + 1) / 6.0;
else a *= frac * frac * frac / 6.0;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
def get_basis_kernel(k_max, K, dim, degree):
    """Compile and return the CUDA basis kernel for the given spline degree.

    Degrees 2 and 3 select the quadratic/cubic templates; everything else
    falls back to the linear kernel.
    """
    sources = {3: _spline_kernel_cubic, 2: _spline_kernel_quadratic}
    source = sources.get(degree, _spline_kernel_linear)
    # Compile on the currently selected CUDA device.
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'spline_kernel',
            source,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
def compute_spline_basis(input, kernel_size, is_open_spline, K, basis_kernel):
    """Evaluate B-spline basis products for all edges via `basis_kernel`.

    Returns a pair `(amount, index)` of shape `[num_edges, 2**dim]` holding
    the basis weights and flattened kernel indices per edge.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda

    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    if input.dim() < 2:
        input = input.unsqueeze(1)
    num_edges, dim = input.size()
    k_max = 2**dim

    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()

    with torch.cuda.device_of(input):
        basis_kernel(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(), num_threads
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    return amount, index
import torch
if torch.cuda.is_available():
from .edgewise_spline_weighting_gpu import EdgewiseSplineWeightingGPU
def edgewise_spline_weighting(input, weight, amount, index, k_fw, k_bw):
    """Dispatch edgewise spline weighting to the GPU implementation."""
    if not input.is_cuda:
        # No CPU fallback exists for this operation.
        raise NotImplementedError
    K, M_in, M_out = weight.size()
    op = EdgewiseSplineWeightingGPU(amount, index, K, M_in, M_out, k_fw, k_bw)
    return op(input, weight)
import torch
from torch.autograd import Function
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source template for the forward edgewise weighting pass. One thread
# per (edge, output-channel); it accumulates `amount * weight * feature` over
# all basis combinations and input channels into `output`.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
const ${Dtype}* amount, const long* index) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} result = 0.0;
${Dtype} w;
${Dtype} f;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
f = input[e_idx * ${M_in} + m_in_idx];
result += b * w * f;
}
}
output[idx] = result;
}
}
'''
# CUDA source template for the backward weighting pass. Accumulates (via
# atomicAdd) gradients w.r.t. the input features, the weights, and the basis
# products ("b") into `grad_input`, `grad_weight` and `grad_b`.
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
${Dtype}* grad_b, const ${Dtype}* input, const ${Dtype}* weight,
const ${Dtype}* amount, const long* index) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
${Dtype} b_grad;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
//Calculate gradient of B
b_grad = f * w * g;
atomicAdd(&(grad_b[k]), b_grad);
}
}
}
}
'''
# Linear B-spline basis kernel (template), same math as `_spline_kernel_linear`
# elsewhere in the project but with `${num_threads}` substituted at compile
# time instead of passed as an argument.
_spline_kernel = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# NOTE(review): this "backwards" kernel is byte-identical to `_spline_kernel`
# above — presumably a placeholder for a real basis-gradient kernel in this
# WIP commit; confirm before relying on it for backpropagation.
_spline_kernel_backwards = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
class EdgewiseSplineWeightingGPU(Function):
    """Legacy autograd op: spline basis computation + edgewise weighting.

    The forward pass first evaluates the linear B-spline basis for the edge
    values, then applies the edgewise weighting kernel. The backward pass
    accumulates gradients w.r.t. the input features, the weights and the
    basis products.

    BUGFIX (backward): the original re-created `grad_b` as zeros *after* the
    kernel had filled it (discarding the computed gradients) and then
    returned an undefined name `grad_u`, raising a NameError.
    """

    def __init__(self, kernel_size, is_open_spline, K):
        super(EdgewiseSplineWeightingGPU, self).__init__()
        assert kernel_size.is_cuda and is_open_spline.is_cuda
        self.kernel_size = kernel_size
        self.is_open_spline = is_open_spline
        # Total number of kernel weight rows (product of kernel sizes).
        self.K = K

    def forward(self, input, values, weight):
        assert input.is_cuda and weight.is_cuda
        self.save_for_backward(input, weight)

        # --- Basis computation -------------------------------------------
        values = values.unsqueeze(1) if len(values.size()) < 2 else values
        num_edges, dim = values.size()
        k_max = 2 ** dim  # Number of linear basis combinations per edge.

        amount = values.new(num_edges, k_max)
        index = values.new(num_edges, k_max).long()
        num_threads = amount.numel()

        with torch.cuda.device_of(input):
            f = load_kernel(
                'spline_kernel',
                _spline_kernel,
                Dtype=Dtype(input),
                num_threads=num_threads,
                k_max=k_max,
                dim=dim,
                K=self.K)
            f(block=(cuda_num_threads, 1, 1),
              grid=(get_blocks(num_threads), 1, 1),
              args=[
                  values.data_ptr(),
                  amount.data_ptr(),
                  index.data_ptr(),
                  self.kernel_size.data_ptr(),
                  self.is_open_spline.data_ptr()
              ],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # Stash intermediates for backward (not inputs/outputs, so they
        # cannot go through save_for_backward).
        self.amount = amount
        self.index = index

        # --- Edgewise weighting ------------------------------------------
        _, M_in, M_out = weight.size()
        k_max = self.amount.size(1)
        output = input.new(input.size(0), M_out)
        num_threads = output.numel()

        with torch.cuda.device_of(input):
            f = load_kernel(
                'edgewise_spline_weighting_forward_kernel',
                _edgewise_spline_weighting_forward_kernel,
                Dtype=Dtype(input),
                num_threads=num_threads,
                M_in=M_in,
                M_out=M_out,
                k_max=k_max)
            f(block=(cuda_num_threads, 1, 1),
              grid=(get_blocks(num_threads), 1, 1),
              args=[
                  input.data_ptr(),
                  weight.data_ptr(),
                  output.data_ptr(),
                  self.amount.data_ptr(),
                  self.index.data_ptr()
              ],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        return output

    def backward(self, grad_output):
        input, weight = self.saved_tensors
        K, M_in, M_out = weight.size()
        E, k_max = self.amount.size()

        grad_input = grad_output.new(input.size(0), M_in).fill_(0)
        grad_weight = grad_output.new(K, M_in, M_out).fill_(0)
        grad_b = grad_output.new(E, k_max).fill_(0)
        num_threads = grad_output.numel()

        with torch.cuda.device_of(grad_output):
            f = load_kernel(
                'edgewise_spline_weighting_backward_kernel',
                _edgewise_spline_weighting_backward_kernel,
                Dtype=Dtype(input),
                num_threads=num_threads,
                M_in=M_in,
                M_out=M_out,
                k_max=k_max,
                K=K)
            f(block=(cuda_num_threads, 1, 1),
              grid=(get_blocks(num_threads), 1, 1),
              args=[
                  grad_output.data_ptr(),
                  grad_input.data_ptr(),
                  grad_weight.data_ptr(),
                  grad_b.data_ptr(),
                  input.data_ptr(),
                  weight.data_ptr(),
                  self.amount.data_ptr(),
                  self.index.data_ptr()
              ],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # NOTE(review): forward takes (input, values, weight), so autograd
        # expects gradients in that order; `grad_b` is the gradient w.r.t.
        # the basis products `amount` (shape [E, k_max]), not w.r.t.
        # `values` — chaining it through the basis functions is still TODO.
        return grad_input, grad_weight, grad_b
import torch
from torch.autograd import Function
from ....utils.cuda import (cuda_num_threads, Stream, load_kernel, kernel_loop,
get_blocks)
# Forward weighting kernel template; same computation as the other forward
# variant in this commit, but `num_threads` is a runtime argument rather than
# a compile-time substitution, so the compiled kernel is reusable across
# differently sized launches.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
const ${Dtype}* amount, const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} result = 0.0;
${Dtype} w;
${Dtype} f;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
f = input[e_idx * ${M_in} + m_in_idx];
result += b * w * f;
}
}
output[idx] = result;
}
}
'''
# Backward weighting kernel template (runtime `num_threads`): accumulates
# input and weight gradients via atomicAdd; does NOT produce basis gradients
# (compare the `grad_b` variant elsewhere in this commit).
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
const ${Dtype}* input, const ${Dtype}* weight, const ${Dtype}* amount,
const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * g;
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
}
}
}
}
'''
def get_forward_kernel(M_in, M_out, k_max):
    """Compile the forward edgewise-weighting CUDA kernel for fixed sizes."""
    # Anchor tensor selects the CUDA device the kernel is compiled on.
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            _edgewise_spline_weighting_forward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
def get_backward_kernel(M_in, M_out, k_max, K):
    """Compile the backward edgewise-weighting CUDA kernel for fixed sizes."""
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            _edgewise_spline_weighting_backward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
class EdgewiseSplineWeightingGPU(Function):
    """Edgewise spline weighting with precompiled CUDA kernels.

    Unlike the sibling implementation that compiles kernels inside
    `forward`, this variant receives already-compiled forward/backward
    kernels (`k_fw`, `k_bw`) plus precomputed basis products (`amount`)
    and kernel indices (`index`).
    """

    def __init__(self, amount, index, K, M_in, M_out, k_fw, k_bw):
        super(EdgewiseSplineWeightingGPU, self).__init__()
        assert amount.is_cuda and index.is_cuda
        # Precomputed per-edge basis products and flattened kernel indices.
        self.amount = amount
        self.index = index
        self.M_in = M_in
        self.M_out = M_out
        self.K = K
        # Precompiled CUDA kernels for forward/backward.
        self.f_fw = k_fw
        self.f_bw = k_bw

    def forward(self, input, weight):
        """Weight per-edge features: returns an [|E| x M_out] tensor."""
        assert input.is_cuda and weight.is_cuda
        self.save_for_backward(input, weight)
        output = input.new(input.size(0), self.M_out)
        # One thread per output element.
        num_threads = output.numel()
        with torch.cuda.device_of(input):
            self.f_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr(),
                    self.amount.data_ptr(),
                    self.index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return output

    def backward(self, grad_output):
        """Return gradients w.r.t. `input` and `weight` (accumulated on GPU)."""
        input, weight = self.saved_tensors
        grad_input = grad_output.new(input.size(0), self.M_in).fill_(0)
        grad_weight = grad_output.new(self.K, self.M_in, self.M_out).fill_(0)
        num_threads = grad_output.numel()
        with torch.cuda.device_of(grad_output):
            self.f_bw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    grad_output.data_ptr(),
                    grad_input.data_ptr(),
                    grad_weight.data_ptr(),
                    input.data_ptr(),
                    weight.data_ptr(),
                    self.amount.data_ptr(),
                    self.index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return grad_input, grad_weight
import torch
if torch.cuda.is_available():
from .compute_spline_basis import compute_spline_basis
def spline(input, kernel_size, is_open_spline, K, degree, basis_kernel):
    """Compute the B-spline basis for `input` on the GPU."""
    if not input.is_cuda:
        # There is no CPU implementation of the basis computation.
        raise NotImplementedError()
    return compute_spline_basis(input, kernel_size, is_open_spline, K,
                                basis_kernel)
...@@ -2,23 +2,22 @@ import torch ...@@ -2,23 +2,22 @@ import torch
from torch.autograd import Variable from torch.autograd import Variable
import time import time
from .spline import spline from .spline_conv_gpu import SplineConvGPU
from .edgewise_spline_weighting import edgewise_spline_weighting
def spline_conv( def spline_conv(
adj, # Tensor adj, # Pytorch Tensor (!bp_to_adj) or Pytorch Variable (bp_to_adj)
input, # Variable input, # Pytorch Variable
weight, # Variable weight, # Pytorch Variable
kernel_size, kernel_size, # Rest tensors or python variables
is_open_spline, is_open_spline,
K, K,
forward_kernel, weighting_kernel,
backward_kernel, weighting_backward_kernel,
basis_kernel, basis_kernel,
basis_backward_kernel=None,
degree=1, degree=1,
bias=None): bias=None,
bp_to_adj=False):
if input.dim() == 1: if input.dim() == 1:
input = input.unsqueeze(1) input = input.unsqueeze(1)
...@@ -29,12 +28,14 @@ def spline_conv( ...@@ -29,12 +28,14 @@ def spline_conv(
# Get features for every end vertex with shape [|E| x M_in]. # Get features for every end vertex with shape [|E| x M_in].
output = input[col] output = input[col]
# Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out]. # Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out].
if output.is_cuda:
amount, index = spline(values, kernel_size, is_open_spline, K, degree, output = SplineConvGPU(kernel_size, is_open_spline, K, degree,
basis_kernel) basis_kernel, basis_backward_kernel,
weighting_kernel, weighting_backward_kernel,
output = edgewise_spline_weighting(output, weight[:-1], amount, index, bp_to_adj)(output, weight[:-1], values)
forward_kernel, backward_kernel) else:
# CPU Implementation not available
raise NotImplementedError()
# Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix to # Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix to
# [n x M_out] feature matrix. # [n x M_out] feature matrix.
......
import torch
from torch.autograd import Variable
from .spline import spline
from .edgewise_spline_weighting_bp2adj_gpu import EdgewiseSplineWeightingGPU
def spline_conv_bp2adj(
        adj,  # Tensor (sparse)
        input,  # Variable
        weight,  # Variable
        kernel_size,
        is_open_spline,
        K,
        degree=1,
        bias=None):
    """Spline convolution that also backpropagates to the adjacency values.

    BUGFIX: `EdgewiseSplineWeightingGPU.forward` has the signature
    `(input, values, weight)`, but the original call passed
    `(output, weight[:-1], values)`, swapping the last two arguments.
    """
    values = adj._values()
    row, col = adj._indices()

    # Get features for every end vertex with shape [|E| x M_in].
    output = input[col]

    # Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out].
    output = EdgewiseSplineWeightingGPU(kernel_size, is_open_spline, K)(
        output, values, weight[:-1])

    # Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix
    # to [n x M_out] feature matrix.
    zero = output.data.new(adj.size(1), output.size(1)).fill_(0.0)
    zero = Variable(zero) if not torch.is_tensor(output) else zero
    r = row.view(-1, 1).expand(row.size(0), output.size(1))
    output = zero.scatter_add_(0, Variable(r), output)

    # Weighten root node features by multiplying with root weight.
    output += torch.mm(input, weight[-1])

    # Normalize output by (clamped) vertex degree.
    ones = values.new(values.size(0)).fill_(1)
    zero = values.new(output.size(0)).fill_(0)
    degree = zero.scatter_add_(0, row, ones)
    degree = torch.clamp(degree, min=1)
    output = output / Variable(degree.view(-1, 1))

    if bias is not None:
        output += bias

    return output
import torch
from torch.autograd import Function
from ....utils.cuda import (cuda_num_threads, Stream, load_kernel, kernel_loop,
get_blocks)
# Forward edgewise weighting kernel template (runtime `num_threads`); one
# thread per (edge, output-channel), summing `amount * weight * feature`
# over basis combinations and input channels.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
const ${Dtype}* amount, const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} result = 0.0;
${Dtype} w;
${Dtype} f;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
f = input[e_idx * ${M_in} + m_in_idx];
result += b * w * f;
}
}
output[idx] = result;
}
}
'''
# Backward weighting kernel template (no adjacency gradients); accumulates
# input and weight gradients with atomicAdd.
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
const ${Dtype}* input, const ${Dtype}* weight, const ${Dtype}* amount,
const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * g;
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
}
}
}
}
'''
# Backward weighting kernel for the bp-to-adjacency path: additionally
# accumulates d(output)/d(amount) into `grad_amount`.
# BUGFIXES vs. the original:
#  * `adj_g` was never declared (CUDA compile error) and never reset per
#    basis combination — it is now declared and zeroed at the top of each
#    `k_idx` iteration.
#  * `grad_amount[e_idx,k_idx]` used the C comma operator (indexing only by
#    `k_idx`); the flattened index `k = e_idx * k_max + k_idx` is used now.
_edgewise_spline_weighting_backward_kernel_bp2adj = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
${Dtype}* grad_amount, const ${Dtype}* input, const ${Dtype}* weight,
const ${Dtype}* amount, const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
${Dtype} adj_g;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
adj_g = 0.0;
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * g;
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
// Calculate B-spline basis tensor product gradient
adj_g += g * f * w;
}
atomicAdd(&(grad_amount[k]), adj_g);
}
}
}
'''
def get_weighting_forward_kernel(M_in, M_out, k_max, bt_to_adj=False):
    """Compile the forward weighting kernel for fixed `M_in`/`M_out`/`k_max`.

    `bt_to_adj` is kept for interface symmetry with
    `get_weighting_backward_kernel`; the forward pass is the same whether or
    not adjacency gradients are required, so the original's if/else (both
    branches selecting the identical source) has been collapsed.
    """
    kernel = _edgewise_spline_weighting_forward_kernel
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f_fw = load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
    return f_fw
def get_weighting_backward_kernel(M_in, M_out, k_max, K, bt_to_adj=False):
    """Compile the backward weighting kernel.

    When `bt_to_adj` is set, the variant that also produces gradients
    w.r.t. the basis products is used.
    """
    source = (_edgewise_spline_weighting_backward_kernel_bp2adj
              if bt_to_adj else _edgewise_spline_weighting_backward_kernel)
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            source,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
# Linear basis kernel (bp2adj variant): clamps value == 1.0 to stay inside
# the last interval, and accumulates the flattened index with `K` built up
# per dimension (K *= kernel_size at the end of each iteration) instead of
# dividing down from ${K}.
# BUGFIX: `int K = 1.0;` initialized an int with a float literal; now `1`.
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = 1;
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx];
if(value==1.0)
value -= 0.000001;
value *= kernel_size[d_idx] - is_open_spline[d_idx];
frac = value - floor(value);
a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += ((1 - k_idx_mod) * bot + k_idx_mod * top) * K;
K *= kernel_size[d_idx];
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# Quadratic basis kernel template (duplicate of the variant earlier in this
# commit); per dimension one of three quadratic segments is selected by
# `k_idx_mod`.
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# Cubic basis kernel template.
# BUGFIX: `CUDA_KERNEL_LOOP(idx, num_threads})` — the stray `}` is a CUDA
# syntax error that made the kernel fail to compile.
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 4;
k_idx /= 4;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (3 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= (1 - frac) * (1 - frac) * (1 - frac) / 6.0;
else if (k_idx_mod == 1)
a *= (3 * frac * frac * frac - 6 * frac * frac + 4) / 6.0;
else if (k_idx_mod == 2)
a *= (-3 * frac * frac * frac + 3 * frac * frac + 3 * frac + 1) / 6.0;
else a *= frac * frac * frac / 6.0;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# Backward kernel for the linear basis: one thread per (edge, dimension);
# recovers each per-dimension factor by dividing the stored product `amount`
# by this dimension's factor (`residual`) and chains it with `grad_amount`.
# BUGFIXES vs. the original (all CUDA compile errors):
#  * missing `;` after the `quotient` and `a_idx` declarations,
#  * `residual` was never declared.
# NOTE(review): `residual` can be exactly 0, making the division produce
# inf/nan — an epsilon or branch may be needed; confirm against the math.
_spline_kernel_linear_backward = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, const ${Dtype}* grad_amount, ${Dtype}* amount,
${Dtype}* grad_adj, const long* kernel_size, const long* is_open_spline,
int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${dim};
int d_idx = idx % ${dim};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} residual;
${Dtype} grad_out = 0.0;
long i = 0;
int quotient = (int)pow(2.0,(float)d_idx);
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k_idx_mod = (k_idx/quotient) % 2;
value = input[e_idx * ${dim} + d_idx];
value *= kernel_size[d_idx] - is_open_spline[d_idx];
frac = value - floor(value);
residual = (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
int a_idx = e_idx*${k_max} + k_idx;
grad_out += grad_amount[a_idx]*amount[a_idx]/residual;
}
grad_adj[e_idx* ${dim} + d_idx] = grad_out;
}
}
'''
def get_basis_kernel(k_max, K, dim, degree, bt_to_adj=False):
    """Compile the forward B-spline basis kernel for `degree`.

    NOTE(review): `bt_to_adj` is currently unused here; it is kept for
    interface symmetry with the weighting-kernel factories.
    """
    by_degree = {3: _spline_kernel_cubic, 2: _spline_kernel_quadratic}
    source = by_degree.get(degree, _spline_kernel_linear)
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'spline_kernel',
            source,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
def get_basis_backward_kernel(k_max, K, dim, degree):
    """Compile the backward (gradient) B-spline basis kernel.

    Only the linear (degree-1) backward kernel exists. The original silently
    fell back to the *forward* quadratic/cubic kernels for degrees 2/3,
    whose parameter lists do not match the backward launch arguments —
    launching them would read/write the wrong buffers. Fail loudly instead.
    """
    if degree == 2 or degree == 3:
        raise NotImplementedError(
            'Basis backward kernel is only implemented for degree=1.')
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f = load_kernel(
            'spline_kernel',
            _spline_kernel_linear_backward,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
    return f
class SplineConvGPU(Function):
    """Fused basis computation + edgewise weighting with precompiled kernels.

    When `bp_to_adj` is set, the backward pass additionally produces
    gradients w.r.t. the adjacency (pseudo-coordinate) values by chaining
    the weighting gradient (`grad_amount`) through the basis backward
    kernel.

    BUGFIXES vs. the original:
      * non-bp backward returned only two gradients for three forward
        inputs (input, weight, adj_values); it now returns `None` for
        `adj_values`.
      * the basis backward launch sized `num_threads` from
        `grad_amount.numel()` (E * k_max), but the kernel iterates one
        thread per (edge, dim) element of `grad_adj` — the oversized launch
        wrote out of bounds. It now uses `grad_adj.numel()`.

    NOTE(review): `save_for_backward` is called with intermediates
    (`amount`, `index`) that are neither inputs nor outputs — legacy
    autograd may reject this; confirm on the targeted torch version.
    """

    def __init__(self, kernel_size, is_open_spline, K, degree,
                 basis_kernel, basis_backward_kernel,
                 weighting_kernel, weighting_backward_kernel,
                 bp_to_adj=False):
        super(SplineConvGPU, self).__init__()
        self.degree = degree
        self.f_weighting_fw = weighting_kernel
        self.f_weighting_bw = weighting_backward_kernel
        self.kernel_size = kernel_size
        self.is_open_spline = is_open_spline
        self.f_basis_fw = basis_kernel
        self.f_basis_bw = basis_backward_kernel
        self.bp_to_adj = bp_to_adj

    def forward(self, input, weight, adj_values):
        assert input.is_cuda and weight.is_cuda
        self.K, self.M_in, self.M_out = weight.size()

        # --- Compute B-spline basis tensor products ----------------------
        adj_values = adj_values.unsqueeze(1) if len(adj_values.size()) < 2 \
            else adj_values
        num_edges, dim = adj_values.size()
        k_max = 2 ** dim
        amount = adj_values.new(num_edges, k_max)
        index = adj_values.new(num_edges, k_max).long()
        num_threads = amount.numel()
        with torch.cuda.device_of(input):
            self.f_basis_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    adj_values.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(),
                    self.kernel_size.data_ptr(),
                    self.is_open_spline.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # --- Weight features ---------------------------------------------
        output = input.new(input.size(0), self.M_out)
        num_threads = output.numel()
        with torch.cuda.device_of(input):
            self.f_weighting_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # adj_values is only needed in backward when we chain into the basis.
        if self.bp_to_adj:
            self.save_for_backward(input, weight, adj_values, amount, index)
        else:
            self.save_for_backward(input, weight, amount, index)
        return output

    def backward(self, grad_output):
        grad_input = grad_output.new(grad_output.size(0), self.M_in).fill_(0)
        grad_weight = grad_output.new(self.K, self.M_in, self.M_out).fill_(0)
        num_threads = grad_output.numel()

        if self.bp_to_adj:
            input, weight, adj_values, amount, index = self.saved_tensors
            grad_amount = grad_output.new(amount.size(0),
                                          amount.size(1)).fill_(0)
            # Weighting backward: fills grad_input/grad_weight/grad_amount.
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        grad_amount.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))

            # Basis backward: one thread per (edge, dim) entry of grad_adj.
            grad_adj = grad_amount.new(grad_amount.size(0),
                                       self.kernel_size.size(0)).fill_(0)
            num_threads = grad_adj.numel()
            with torch.cuda.device_of(grad_amount):
                self.f_basis_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        adj_values.data_ptr(),
                        grad_amount.data_ptr(),
                        amount.data_ptr(),
                        grad_adj.data_ptr(),
                        self.kernel_size.data_ptr(),
                        self.is_open_spline.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))
            return grad_input, grad_weight, grad_adj
        else:
            input, weight, amount, index = self.saved_tensors
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))
            # Three forward inputs -> three gradients; adjacency gets None
            # when we do not backprop to it.
            return grad_input, grad_weight, None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment