"encoding/kernel/generic/syncbn_kernel.h" did not exist on "aa9af7fd12da3fe58d17fb758851eb8022af290f"
Commit 952a4cad authored by rusty1s

removed old version

parent b189dba7
'''
import unittest
import torch
from torch.autograd import Variable, gradcheck
from numpy.testing import assert_equal

from .spline import spline

if torch.cuda.is_available():
    from .edgewise_spline_weighting_gpu import EdgewiseSplineWeightingGPU


class EdgewiseSplineWeightingGPUTest(unittest.TestCase):
    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_forward(self):
        input = [[0.25, 0.125], [0.25, 0.375], [0.75, 0.625], [0.75, 0.875]]
        input = torch.cuda.FloatTensor(input)
        kernel_size = torch.cuda.LongTensor([3, 4])
        is_open_spline = torch.cuda.LongTensor([1, 0])
        amount, index = spline(input, kernel_size, is_open_spline, 12, 1)

        input = torch.cuda.FloatTensor([[1, 2], [3, 4], [5, 6], [7, 8]])
        weight = torch.arange(0.5, 0.5 * 25, step=0.5).view(12, 2, 1).cuda()
        input, weight = Variable(input), Variable(weight)

        op = EdgewiseSplineWeightingGPU(amount, index)
        out = op(input, weight)

        expected_out = [
            [0.25 * (1 * (0.5 + 1.5 + 4.5 + 5.5) + 2 * (1 + 2 + 5 + 6))],
            [0.25 * (3 * (1.5 + 2.5 + 5.5 + 6.5) + 4 * (2 + 3 + 6 + 7))],
            [0.25 * (5 * (6.5 + 7.5 + 10.5 + 11.5) + 6 * (7 + 8 + 11 + 12))],
            [0.25 * (7 * (4.5 + 7.5 + 8.5 + 11.5) + 8 * (5 + 8 + 9 + 12))],
        ]

        assert_equal(out.cpu().data.numpy(), expected_out)

    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_backward(self):
        input = [[0.25, 0.125], [0.25, 0.375], [0.75, 0.625], [0.75, 0.875]]
        input = torch.cuda.DoubleTensor(input)
        kernel_size = torch.cuda.LongTensor([3, 4])
        is_open_spline = torch.cuda.LongTensor([1, 0])
        amount, index = spline(input, kernel_size, is_open_spline, 12, 1)

        input = torch.randn(4, 2).double().cuda()
        weight = torch.randn(12, 2, 1).double().cuda()
        input = Variable(input, requires_grad=True)
        weight = Variable(weight, requires_grad=True)

        op = EdgewiseSplineWeightingGPU(amount, index)
        test = gradcheck(op, (input, weight), eps=1e-6, atol=1e-4)
        self.assertTrue(test)
'''
import torch
from torch.autograd import Variable

from .spline_conv_gpu import SplineConvGPU


def spline_conv(
        adj,  # PyTorch tensor (!bp_to_adj) or PyTorch Variable (bp_to_adj)
        input,  # PyTorch Variable
        weight,  # PyTorch Variable
        kernel_size,  # Remaining arguments: tensors or plain Python values
        is_open_spline,
        K,
        weighting_kernel,
        weighting_backward_kernel,
        basis_kernel,
        basis_backward_kernel=None,
        degree=1,
        bias=None):

    if input.dim() == 1:
        input = input.unsqueeze(1)

    values = adj['values']
    row, col = adj['indices']

    # Get features for every end vertex with shape [|E| x M_in].
    output = input[col]

    # Backpropagate to the adjacency values iff they are wrapped as Variables.
    bp_to_adj = not torch.is_tensor(values)

    # Convert the [|E| x M_in] feature matrix to an [|E| x M_out] one.
    if output.is_cuda:
        if bp_to_adj:
            output = SplineConvGPU(kernel_size, is_open_spline, K, degree,
                                   basis_kernel, basis_backward_kernel,
                                   weighting_kernel, weighting_backward_kernel,
                                   bp_to_adj)(output, weight[:-1], values)
        else:
            output = SplineConvGPU(kernel_size, is_open_spline, K, degree,
                                   basis_kernel, basis_backward_kernel,
                                   weighting_kernel, weighting_backward_kernel,
                                   bp_to_adj, values)(output, weight[:-1])
    else:
        # A CPU implementation is not available.
        raise NotImplementedError()

    # Convolution via `scatter_add`. Converts the [|E| x M_out] feature matrix
    # to an [n x M_out] feature matrix.
    zero = output.data.new(adj['size'][1], output.size(1)).fill_(0.0)
    zero = Variable(zero) if not torch.is_tensor(output) else zero
    r = row.view(-1, 1).expand(row.size(0), output.size(1))
    output = zero.scatter_add_(0, Variable(r), output)

    # Weight root node features by multiplying them with the root weight.
    output += torch.mm(input, weight[-1])

    # Normalize output by node degree.
    ones = output.data.new(values.size(0)).fill_(1)
    zero = output.data.new(output.size(0)).fill_(0)
    degree = zero.scatter_add_(0, row, ones)
    degree = torch.clamp(degree, min=1)
    output = output / Variable(degree.view(-1, 1))

    if bias is not None:
        output += bias

    return output
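
# Minimal usage sketch (illustrative only, not exercised by the tests): how
# `spline_conv` might be driven with the kernel getters from
# `.spline_conv_gpu` and the adjacency-dict layout used in the tests
# (`indices`, `values`, `size`). Hyper-parameters mirror the backward test
# below: K = 9 kernel weight matrices, 2 input features, 1 output feature.
# The graph, features, and weights here are made-up example data.
def _spline_conv_example():
    from .spline_conv_gpu import (get_basis_kernel, get_basis_backward_kernel,
                                  get_weighting_forward_kernel,
                                  get_weighting_backward_kernel)

    M_in, M_out, dim, degree, K = 2, 1, 2, 1, 9
    k_max = (degree + 1)**dim

    # Four edges from node 0 to nodes 1..4, with pseudo-coordinates in [0, 1].
    edges = torch.cuda.LongTensor([[0, 0, 0, 0], [1, 2, 3, 4]])
    pseudo = Variable(torch.cuda.FloatTensor(4, dim).uniform_(0, 1))
    adj = {'indices': edges, 'values': pseudo, 'size': torch.Size([5, 5, dim])}

    x = Variable(torch.cuda.FloatTensor(5, M_in).uniform_(-1, 1))
    # K weight matrices for the spline kernel plus one root weight matrix.
    w = Variable(torch.cuda.FloatTensor(K + 1, M_in, M_out).uniform_(-1, 1))

    fw_k = get_weighting_forward_kernel(M_in, M_out, k_max)
    bw_k = get_weighting_backward_kernel(M_in, M_out, k_max, K, bp_to_adj=True)
    basis_fw_k = get_basis_kernel(k_max, K, dim, degree)
    basis_bw_k = get_basis_backward_kernel(k_max, K, dim, degree)

    kernel_size = torch.cuda.LongTensor([3, 3])
    is_open_spline = torch.cuda.LongTensor([1, 0])

    out = spline_conv(adj, x, w, kernel_size, is_open_spline, K, fw_k, bw_k,
                      basis_fw_k, basis_bw_k, degree=degree)
    print(out.size())  # [5 x M_out]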
from __future__ import division

import unittest
import torch
from torch.autograd import Variable, gradcheck

from .spline_conv_gpu import (get_basis_kernel, get_basis_backward_kernel,
                              get_weighting_forward_kernel,
                              get_weighting_backward_kernel, SplineConvGPU)


class SplineConvTest(unittest.TestCase):
    '''
    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_forward_gpu(self):
        edges = torch.LongTensor([[0, 0, 0, 0], [1, 2, 3, 4]])
        values = [[0.25, 0.125], [0.25, 0.375], [0.75, 0.625], [0.75, 0.875]]
        values = torch.FloatTensor(values).double()
        adj = {'indices': edges.cuda(), 'values': Variable(values.cuda()),
               'size': torch.Size([5, 5, 2])}
        kernel_size = torch.cuda.LongTensor([3, 4])
        is_open_spline = torch.cuda.LongTensor([1, 0])

        input = torch.DoubleTensor([[9, 10], [1, 2], [3, 4], [5, 6], [7, 8]])
        weight = torch.arange(0.5, 0.5 * 27, step=0.5).view(13, 2, 1).double()
        input, weight = input.cuda(), weight.cuda()
        input, weight = Variable(input), Variable(weight)

        row, col = adj['indices']
        output = input[col]

        K = 12
        in_features = 2
        out_features = 1
        degree = 1
        dim = 2
        k_max = (degree + 1)**dim
        fw_k = get_weighting_forward_kernel(in_features, out_features, k_max)
        bw_k = get_weighting_backward_kernel(in_features, out_features, k_max,
                                             K, True)
        basis_fw_k = get_basis_kernel(k_max, K, dim, degree)
        basis_bw_k = get_basis_backward_kernel(k_max, K, dim, degree)

        # output = spline_conv(
        #     adj, input, weight, kernel_size, is_open_spline, K, fw_k, bw_k,
        #     basis_fw_k, basis_bw_k, bp_to_adj=True)

        values = adj['values']
        output = SplineConvGPU(kernel_size, is_open_spline, K, degree,
                               basis_fw_k, basis_bw_k, fw_k, bw_k,
                               bp_to_adj=True)(output, weight, values)

        zero = output.data.new(adj['size'][1], output.size(1)).fill_(0.0)
        zero = Variable(zero) if not torch.is_tensor(output) else zero
        r = row.view(-1, 1).expand(row.size(0), output.size(1))
        output = zero.scatter_add_(0, Variable(r), output)

        # Weight root node features by multiplying them with the root weight.
        output += torch.mm(input, weight[-1])

        # Normalize output by node degree.
        ones = values.data.new(values.size(0)).fill_(1)
        zero = values.data.new(output.size(0)).fill_(0)
        degree = zero.scatter_add_(0, row, ones)
        degree = torch.clamp(degree, min=1)
        output = output / Variable(degree.view(-1, 1))

        expected_output = [
            [(12.5 * 9 + 13 * 10 + 266) / 4],
            [12.5 * 1 + 13 * 2],
            [12.5 * 3 + 13 * 4],
            [12.5 * 5 + 13 * 6],
            [12.5 * 7 + 13 * 8],
        ]

        assert_almost_equal(output.cpu().data.numpy(), expected_output, 1)

    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_backward(self):
        kernel_size = torch.cuda.LongTensor([3, 4])
        is_open_spline = torch.cuda.LongTensor([1, 1])

        input = torch.randn(4, 2).double().cuda()
        weight = torch.randn(12, 2, 1).double().cuda()
        values = torch.FloatTensor(4, 2).uniform_(0, 1).double().cuda()
        print(values)
        input = Variable(input, requires_grad=True)
        weight = Variable(weight, requires_grad=True)
        values = Variable(values, requires_grad=True)

        K = 12
        in_features = 2
        out_features = 1
        degree = 1
        dim = 2
        k_max = (degree + 1)**dim
        fw_k = get_weighting_forward_kernel(in_features, out_features, k_max)
        bw_k = get_weighting_backward_kernel(in_features, out_features, k_max,
                                             K, bp_to_adj=True)
        basis_fw_k = get_basis_kernel(k_max, K, dim, degree)
        basis_bw_k = get_basis_backward_kernel(k_max, K, dim, degree)

        op = SplineConvGPU(kernel_size, is_open_spline, K, degree,
                           basis_fw_k, basis_bw_k, fw_k, bw_k, bp_to_adj=True)
        print(op(input, weight, values))
        # test = gradcheck(op, (input, weight, values), eps=1e-6, atol=1e-4)
        # self.assertTrue(test)
    '''

    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_backward(self):
        input = torch.randn(4, 2).double().cuda()
        weight = torch.randn(9, 2, 1).double().cuda()
        values = torch.FloatTensor(4, 2).uniform_(0, 1).double().cuda()
        print(values)
        input = Variable(input, requires_grad=True)
        weight = Variable(weight, requires_grad=True)
        values = Variable(values, requires_grad=True)

        K = 9
        in_features = 2
        out_features = 1
        degree = 1
        dim = 2
        k_max = (degree + 1)**dim
        kernel_size = torch.cuda.LongTensor([3, 3])
        is_open_spline = torch.cuda.LongTensor([1, 0])

        fw_k = get_weighting_forward_kernel(
            in_features, out_features, k_max, dtype='double')
        bw_k = get_weighting_backward_kernel(
            in_features, out_features, k_max, K, True, dtype='double')
        basis_fw_k = get_basis_kernel(k_max, K, dim, degree, dtype='double')
        basis_bw_k = get_basis_backward_kernel(
            k_max, K, dim, degree, dtype='double')

        op = SplineConvGPU(
            kernel_size,
            is_open_spline,
            K,
            degree,
            basis_fw_k,
            basis_bw_k,
            fw_k,
            bw_k,
            bp_to_adj=True)

        test = gradcheck(op, (input, weight, values), eps=1e-6, atol=1e-4)
        self.assertTrue(test)
import torch
from torch.autograd import Function

from ....utils.cuda import (cuda_num_threads, Stream, load_kernel, kernel_loop,
                            get_blocks)

_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
    const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
    const ${Dtype}* amount, const long* index, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${M_out};
    const int m_out_idx = idx % ${M_out};

    ${Dtype} result = 0.0;
    ${Dtype} w;
    ${Dtype} f;
    int k;
    ${Dtype} b;
    long c;
    long w_idx;

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k = e_idx * ${k_max} + k_idx;
      b = amount[k];
      c = index[k];

      for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
        w_idx = c * ${M_out} * ${M_in} +
                m_in_idx * ${M_out} +
                m_out_idx;

        w = weight[w_idx];
        f = input[e_idx * ${M_in} + m_in_idx];

        result += b * w * f;
      }
    }

    output[idx] = result;
  }
}
'''
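
# Illustrative, unused reference implementation: a minimal plain-PyTorch
# sketch of what the forward weighting kernel above computes. For every edge
# e it accumulates, over the k_max non-zero basis products,
# amount[e, k] * (input[e] @ weight[index[e, k]]). Assumes `input` is
# [|E| x M_in], `weight` is [K x M_in x M_out] and `amount`/`index` are
# [|E| x k_max]; the function name is made up for exposition.
def _edgewise_spline_weighting_reference(input, weight, amount, index):
    num_edges, k_max = amount.size()
    output = 0
    for k in range(k_max):
        w = weight[index[:, k]]                          # [|E| x M_in x M_out]
        f = torch.bmm(input.unsqueeze(1), w).squeeze(1)  # [|E| x M_out]
        output = output + amount[:, k].unsqueeze(1) * f
    return output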
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
    const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
    const ${Dtype}* input, const ${Dtype}* weight, const ${Dtype}* amount,
    const long* index, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${M_out};
    const int m_out_idx = idx % ${M_out};

    ${Dtype} w;
    ${Dtype} g;
    ${Dtype} f;
    ${Dtype} w_grad;
    int k;
    ${Dtype} b;
    long c;
    long w_idx;

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k = e_idx * ${k_max} + k_idx;
      b = amount[k];
      c = index[k];

      for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
        w_idx = c * ${M_out} * ${M_in} +
                m_in_idx * ${M_out} +
                m_out_idx;

        w = weight[w_idx];

        // Calculate the input gradient.
        g = grad_output[e_idx * ${M_out} + m_out_idx];
        atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
        // This is inefficient: the `reduce_sum` shouldn't be done like this.
        // Looping over `M_out` would be better and would avoid the `atomicAdd`.

        // Calculate the weight gradient.
        f = input[e_idx * ${M_in} + m_in_idx];
        w_grad = f * b * g;
        atomicAdd(&(grad_weight[w_idx]), w_grad);
        // Not very efficient either, but unavoidable here.
      }
    }
  }
}
'''
_edgewise_spline_weighting_backward_kernel_bp2adj = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
    const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
    ${Dtype}* grad_amount, const ${Dtype}* input, const ${Dtype}* weight,
    const ${Dtype}* amount, const long* index, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${M_out};
    const int m_out_idx = idx % ${M_out};

    ${Dtype} w;
    ${Dtype} g;
    ${Dtype} f;
    ${Dtype} w_grad;
    int k;
    ${Dtype} b;
    long c;
    long w_idx;

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k = e_idx * ${k_max} + k_idx;
      b = amount[k];
      c = index[k];
      ${Dtype} adj_g = 0.0;

      for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
        w_idx = c * ${M_out} * ${M_in} +
                m_in_idx * ${M_out} +
                m_out_idx;

        w = weight[w_idx];

        // Calculate the input gradient.
        g = grad_output[e_idx * ${M_out} + m_out_idx];
        atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
        // This is inefficient: the `reduce_sum` shouldn't be done like this.
        // Looping over `M_out` would be better and would avoid the `atomicAdd`.

        // Calculate the weight gradient.
        f = input[e_idx * ${M_in} + m_in_idx];
        w_grad = f * b * g;
        atomicAdd(&(grad_weight[w_idx]), w_grad);
        // Not very efficient either, but unavoidable here.

        // Calculate the gradient of the B-spline basis tensor product.
        adj_g += g * f * w;
      }

      atomicAdd(&(grad_amount[e_idx * ${k_max} + k_idx]), adj_g);
    }
  }
}
'''
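
# Illustrative, unused reference implementation: the gradients accumulated by
# the backward kernels above, written as a minimal plain-PyTorch sketch. With
# out[e] = sum_k b_ek * input[e] @ W_{c_ek}, the chain rule gives
#   grad_input[e] += b_ek * W_{c_ek} @ grad_out[e]
#   grad_W_{c_ek} += b_ek * outer(input[e], grad_out[e])
#   grad_b_ek      = grad_out[e] . (input[e] @ W_{c_ek})
# (grad_b is only produced by the bp2adj variant). The function name is made
# up for exposition.
def _edgewise_spline_weighting_backward_reference(grad_out, input, weight,
                                                  amount, index):
    num_edges, k_max = amount.size()
    grad_input = torch.zeros_like(input)
    grad_weight = torch.zeros_like(weight)
    grad_amount = torch.zeros_like(amount)
    for e in range(num_edges):
        for k in range(k_max):
            b, c = amount[e, k], int(index[e, k])
            w = weight[c]  # [M_in x M_out]
            grad_input[e] += b * torch.mv(w, grad_out[e])
            grad_weight[c] += b * torch.ger(input[e], grad_out[e])
            grad_amount[e, k] = torch.dot(grad_out[e],
                                          torch.mv(w.t(), input[e]))
    return grad_input, grad_weight, grad_amount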
def get_weighting_forward_kernel(M_in, M_out, k_max, dtype='float'):
    cuda_tensor = torch.FloatTensor([1]).cuda()
    kernel = _edgewise_spline_weighting_forward_kernel

    with torch.cuda.device_of(cuda_tensor):
        f_fw = load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            kernel,
            Dtype=dtype,
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)

    return f_fw


def get_weighting_backward_kernel(M_in,
                                  M_out,
                                  k_max,
                                  K,
                                  bp_to_adj=False,
                                  dtype='float'):
    cuda_tensor = torch.FloatTensor([1]).cuda()

    if bp_to_adj:
        kernel = _edgewise_spline_weighting_backward_kernel_bp2adj
    else:
        kernel = _edgewise_spline_weighting_backward_kernel

    with torch.cuda.device_of(cuda_tensor):
        f_bw = load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            kernel,
            Dtype=dtype,
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)

    return f_bw
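
# Note on the template parameters handed to `load_kernel`, as used in the
# tests: `k_max` is the number of non-zero B-spline products per edge,
# (degree + 1)**dim, and `K` is the total number of kernel weight matrices,
# i.e. the product of the entries of `kernel_size`. For example, for a
# two-dimensional linear spline with kernel_size = [3, 4]:
#
#     dim, degree = 2, 1
#     k_max = (degree + 1)**dim  # 4
#     K = 3 * 4                  # 12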
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, ${Dtype}* amount, long* index,
    const long* kernel_size, const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${k_max};
    int k_idx = idx % ${k_max};

    int K = ${K};
    int k_idx_mod;
    int bot;
    int top;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} a = 1.0;
    long i = 0;

    for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
      K /= kernel_size[d_idx];
      k_idx_mod = k_idx % 2;
      k_idx >>= 1;

      value = input[e_idx * ${dim} + d_idx];
      value *= kernel_size[d_idx] - is_open_spline[d_idx];
      frac = value - floor(value);

      a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;

      bot = int(floor(value));
      top = (bot + 1) % kernel_size[d_idx];
      bot %= kernel_size[d_idx];

      i += ((1 - k_idx_mod) * bot + k_idx_mod * top) * K;
    }

    amount[idx] = a;
    index[idx] = i;
  }
}
'''
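
# Illustrative, unused reference implementation: a minimal pure-Python sketch
# of the linear basis kernel above, computing `amount` and `index` for a
# single pseudo-coordinate vector `u` (one edge). `kernel_size` and
# `is_open_spline` are assumed to be Python lists of length `dim`; the
# function name is made up for exposition.
import math


def _linear_basis_reference(u, kernel_size, is_open_spline):
    dim = len(kernel_size)
    K = 1
    for ks in kernel_size:
        K *= ks
    amount, index = [], []
    for k_idx in range(2**dim):
        K_rem, a, i, bits = K, 1.0, 0, k_idx
        for d in range(dim):
            K_rem //= kernel_size[d]
            bit = bits % 2  # Which of the two neighboring knots to pick.
            bits >>= 1
            value = u[d] * (kernel_size[d] - is_open_spline[d])
            frac = value - math.floor(value)
            a *= (1 - bit) * (1 - frac) + bit * frac
            bot = int(math.floor(value)) % kernel_size[d]
            top = (int(math.floor(value)) + 1) % kernel_size[d]
            i += ((1 - bit) * bot + bit * top) * K_rem
        amount.append(a)
        index.append(i)
    return amount, index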
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, ${Dtype}* amount, long* index,
    const long* kernel_size, const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${k_max};
    int k_idx = idx % ${k_max};

    int K = ${K};
    int k_idx_mod;
    int pos;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} a = 1.0;
    long i = 0;

    for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
      K /= kernel_size[d_idx];
      k_idx_mod = k_idx % 3;
      k_idx /= 3;

      value = input[e_idx * ${dim} + d_idx] *
              (kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
      frac = value - floor(value);

      if (k_idx_mod == 0) a *= 0.5 * (1 - frac) * (1 - frac);
      else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
      else a *= 0.5 * frac * frac;

      pos = int(floor(value)) + k_idx_mod;
      pos %= kernel_size[d_idx];

      i += pos * K;
    }

    amount[idx] = a;
    index[idx] = i;
  }
}
'''
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, ${Dtype}* amount, long* index,
    const long* kernel_size, const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${k_max};
    int k_idx = idx % ${k_max};

    int K = ${K};
    int k_idx_mod;
    int pos;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} a = 1.0;
    long i = 0;

    for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
      K /= kernel_size[d_idx];
      k_idx_mod = k_idx % 4;
      k_idx /= 4;

      value = input[e_idx * ${dim} + d_idx] *
              (kernel_size[d_idx] - (3 * is_open_spline[d_idx]));
      frac = value - floor(value);

      if (k_idx_mod == 0) a *= (1 - frac) * (1 - frac) * (1 - frac) / 6.0;
      else if (k_idx_mod == 1)
        a *= (3 * frac * frac * frac - 6 * frac * frac + 4) / 6.0;
      else if (k_idx_mod == 2)
        a *= (-3 * frac * frac * frac + 3 * frac * frac + 3 * frac + 1) / 6.0;
      else a *= frac * frac * frac / 6.0;

      pos = int(floor(value)) + k_idx_mod;
      pos %= kernel_size[d_idx];

      i += pos * K;
    }

    amount[idx] = a;
    index[idx] = i;
  }
}
'''
# This is the efficient backward version: it reuses `amount`, but the division
# can hit zero and may be numerically unstable. There is no fix for this yet,
# so the less efficient version 2 below is used instead.
_spline_kernel_linear_backward = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, const ${Dtype}* grad_amount, ${Dtype}* amount,
    ${Dtype}* grad_adj, const long* kernel_size, const long* is_open_spline,
    int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${dim};
    int d_idx = idx % ${dim};

    int k_idx_mod;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} grad_out = 0.0;
    int quotient = (int)pow(2.0, (double)d_idx);

    value = input[e_idx * ${dim} + d_idx];
    value *= kernel_size[d_idx] - is_open_spline[d_idx];
    frac = value - floor(value);

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k_idx_mod = (k_idx / quotient) % 2;
      int a_idx = e_idx * ${k_max} + k_idx;
      ${Dtype} residual = -(1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
      grad_out += grad_amount[a_idx] * amount[a_idx] / residual;
    }

    grad_adj[idx] = grad_out * (kernel_size[d_idx] - is_open_spline[d_idx]);
  }
}

/*
    ${Dtype} a = -(1 - k_idx_mod) + k_idx_mod;
    for (int d_it = 0; d_it < ${dim}; d_it++) {
      if (d_it != d_idx) {
        value = input[e_idx * ${dim} + d_it];
        value *= kernel_size[d_it] - is_open_spline[d_it];
        frac = value - floor(value);
        a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
      }
    }
    grad_out += a * grad_amount[a_idx];
*/
'''
# This is the less efficient backward version: it recomputes the per-dimension
# factors instead of reusing `amount`.
_spline_kernel_linear_backward2 = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, const ${Dtype}* grad_amount, ${Dtype}* amount,
    ${Dtype}* grad_adj, const long* kernel_size, const long* is_open_spline,
    int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${dim};
    int d_idx = idx % ${dim};

    int k_idx_mod;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} grad_out = 0.0;
    int quotient = (int)pow(2.0, (double)d_idx);

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k_idx_mod = (k_idx / quotient) % 2;
      int a_idx = e_idx * ${k_max} + k_idx;
      ${Dtype} a = -(1 - k_idx_mod) + k_idx_mod;

      for (int d_it = 0; d_it < ${dim}; d_it++) {
        if (d_it != d_idx) {
          int quotient = (int)pow(2.0, (double)d_it);
          k_idx_mod = (k_idx / quotient) % 2;
          value = input[e_idx * ${dim} + d_it];
          value *= kernel_size[d_it] - is_open_spline[d_it];
          frac = value - floor(value);
          a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
        }
      }

      grad_out += a * grad_amount[a_idx];
    }

    grad_adj[idx] = grad_out * (kernel_size[d_idx] - is_open_spline[d_idx]);
  }
}
'''
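
# Illustrative, unused reference implementation: a minimal pure-Python sketch
# of the derivative accumulated by the backward kernel above. For the bit of
# `k_idx` belonging to dimension d, the factor
# (1 - bit) * (1 - frac) + bit * frac is replaced by its derivative
# -(1 - bit) + bit, all other dimensions keep their basis factor, and the sum
# is scaled by the chain-rule factor (kernel_size[d] - is_open_spline[d]).
# `u` and `grad_amount` describe a single edge; the name is made up.
def _linear_basis_backward_reference(u, grad_amount, kernel_size,
                                     is_open_spline):
    dim = len(kernel_size)
    grad_u = [0.0] * dim
    for d in range(dim):
        for k_idx in range(2**dim):
            bit_d = (k_idx >> d) % 2
            a = -(1 - bit_d) + bit_d
            for d_it in range(dim):
                if d_it == d:
                    continue
                bit = (k_idx >> d_it) % 2
                value = u[d_it] * (kernel_size[d_it] - is_open_spline[d_it])
                frac = value - math.floor(value)
                a *= (1 - bit) * (1 - frac) + bit * frac
            grad_u[d] += a * grad_amount[k_idx]
        grad_u[d] *= kernel_size[d] - is_open_spline[d]
    return grad_u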
def get_basis_kernel(k_max, K, dim, degree, dtype='float'):
    if degree == 3:
        _spline_kernel = _spline_kernel_cubic
    elif degree == 2:
        _spline_kernel = _spline_kernel_quadratic
    else:
        _spline_kernel = _spline_kernel_linear

    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=dtype,
            k_max=k_max,
            dim=dim,
            K=K)
    return f


def get_basis_backward_kernel(k_max, K, dim, degree, dtype='float'):
    if degree == 3:
        _spline_kernel = _spline_kernel_linear_backward2
    elif degree == 2:
        _spline_kernel = _spline_kernel_linear_backward2
    else:
        _spline_kernel = _spline_kernel_linear_backward2

    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=dtype,
            k_max=k_max,
            dim=dim,
            K=K)
    return f
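
# Note: independent of `degree`, the backward basis kernel currently always
# falls back to `_spline_kernel_linear_backward2`, the slower variant that
# recomputes the per-dimension factors instead of dividing the cached `amount`
# (see the stability note above). Backpropagation to the pseudo-coordinates is
# therefore only meaningful for linear splines, which `SplineConvGPU.backward`
# below also enforces.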
class SplineConvGPU(Function):
    def __init__(self,
                 kernel_size,
                 is_open_spline,
                 K,
                 degree,
                 basis_kernel,
                 basis_backward_kernel,
                 weighting_kernel,
                 weighting_backward_kernel,
                 bp_to_adj=False,
                 adj_values=None):

        super(SplineConvGPU, self).__init__()
        self.degree = degree
        self.f_weighting_fw = weighting_kernel
        self.f_weighting_bw = weighting_backward_kernel
        self.kernel_size = kernel_size
        self.is_open_spline = is_open_spline
        self.f_basis_fw = basis_kernel
        self.f_basis_bw = basis_backward_kernel
        self.bp_to_adj = bp_to_adj
        self.adj_values = adj_values

    def forward(self, input, weight, adj_values=None):
        assert input.is_cuda and weight.is_cuda

        self.K, self.M_in, self.M_out = weight.size()

        # If bp_to_adj is False, the pseudo-coordinates were passed at
        # construction time.
        if adj_values is None:
            adj_values = self.adj_values

        # Compute the B-spline basis tensor products.
        if adj_values.dim() < 2:
            adj_values = adj_values.unsqueeze(1)

        if self.bp_to_adj:
            self.save_for_backward(input, weight, adj_values)
            # adj_values = torch.clamp(adj_values, min=0.0, max=1.0)
        else:
            self.save_for_backward(input, weight)

        num_edges, dim = adj_values.size()
        k_max = (self.degree + 1)**dim
        amount = adj_values.new(num_edges, k_max)
        index = adj_values.new(num_edges, k_max).long()
        num_threads = amount.numel()

        with torch.cuda.device_of(input):
            self.f_basis_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    adj_values.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(),
                    self.kernel_size.data_ptr(),
                    self.is_open_spline.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # Weight the features.
        output = input.new(input.size(0), self.M_out)
        num_threads = output.numel()

        with torch.cuda.device_of(input):
            self.f_weighting_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        self.amount = amount
        self.index = index

        return output

    def backward(self, grad_output):
        grad_input = grad_output.new(grad_output.size(0), self.M_in).fill_(0)
        grad_weight = grad_output.new(self.K, self.M_in, self.M_out).fill_(0)
        num_threads = grad_output.numel()

        if self.bp_to_adj:
            if self.degree == 2 or self.degree == 3:
                print('Backward to u for degree > 1 is not implemented!')
                raise NotImplementedError

            input, weight, adj_values = self.saved_tensors
            # adj_values = torch.clamp(adj_values, min=0.0, max=1.0)
            amount = self.amount
            index = self.index
            grad_amount = grad_output.new(amount.size(0),
                                          amount.size(1)).fill_(0)

            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        grad_amount.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            grad_adj = grad_amount.new(
                grad_amount.size(0), self.kernel_size.size(0)).fill_(0)
            num_threads = grad_adj.numel()

            with torch.cuda.device_of(grad_amount):
                self.f_basis_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        adj_values.data_ptr(),
                        grad_amount.data_ptr(),
                        amount.data_ptr(),
                        grad_adj.data_ptr(),
                        self.kernel_size.data_ptr(),
                        self.is_open_spline.data_ptr(), num_threads
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            return grad_input, grad_weight, grad_adj

        else:
            input, weight = self.saved_tensors
            amount = self.amount
            index = self.index

            # The kernel compiled without `bp_to_adj` takes no `grad_amount`
            # argument.
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            return grad_input, grad_weight, None
'''
import unittest
import torch
from numpy.testing import assert_equal, assert_almost_equal

if torch.cuda.is_available():
    from .compute_spline_basis import compute_spline_basis
    from .compute_spline_basis import get_basis_kernel


class SplineCubicGPUTest(unittest.TestCase):
    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_open_spline(self):
        input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
        kernel_size = torch.cuda.LongTensor([7])
        is_open_spline = torch.cuda.LongTensor([1])
        k_max = 4
        K = 7
        dim = 1
        basis_kernel = get_basis_kernel(k_max, K, dim, 3)
        a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 7,
                                      basis_kernel)

        a2 = [
            [0.1667, 0.6667, 0.1667, 0],
            [0.0853, 0.6307, 0.2827, 0.0013],
            [0.1667, 0.6667, 0.1667, 0],
            [0.1667, 0.6667, 0.1667, 0],
            [0.1667, 0.6667, 0.1667, 0],
            [0.0013, 0.2827, 0.6307, 0.0853],
            [0.1667, 0.6667, 0.1667, 0],
        ]
        i2 = [[0, 1, 2, 3], [0, 1, 2, 3], [1, 2, 3, 4], [2, 3, 4, 5],
              [3, 4, 5, 6], [3, 4, 5, 6], [4, 5, 6, 0]]

        assert_almost_equal(a1.cpu().numpy(), a2, 4)
        assert_equal(i1.cpu().numpy(), i2)

    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_closed_spline(self):
        input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
        kernel_size = torch.cuda.LongTensor([4])
        is_open_spline = torch.cuda.LongTensor([0])
        k_max = 4
        K = 4
        dim = 1
        basis_kernel = get_basis_kernel(k_max, K, dim, 3)
        a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4,
                                      basis_kernel)

        a2 = [
            [0.1667, 0.6667, 0.1667, 0],
            [0.0853, 0.6307, 0.2827, 0.0013],
            [0.1667, 0.6667, 0.1667, 0],
            [0.1667, 0.6667, 0.1667, 0],
            [0.1667, 0.6667, 0.1667, 0],
            [0.0013, 0.2827, 0.6307, 0.0853],
            [0.1667, 0.6667, 0.1667, 0],
        ]
        i2 = [[0, 1, 2, 3], [0, 1, 2, 3], [1, 2, 3, 0], [2, 3, 0, 1],
              [3, 0, 1, 2], [3, 0, 1, 2], [0, 1, 2, 3]]

        assert_almost_equal(a1.cpu().numpy(), a2, 4)
        assert_equal(i1.cpu().numpy(), i2)
'''
'''
import unittest
import torch
from numpy.testing import assert_equal, assert_almost_equal

if torch.cuda.is_available():
    from .compute_spline_basis import compute_spline_basis
    from .compute_spline_basis import get_basis_kernel


class SplineLinearGPUTest(unittest.TestCase):
    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_open_spline(self):
        input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
        kernel_size = torch.cuda.LongTensor([5])
        is_open_spline = torch.cuda.LongTensor([1])
        k_max = 2
        K = 5
        dim = 1
        basis_kernel = get_basis_kernel(k_max, K, dim, 1)
        a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 5,
                                      basis_kernel)

        a2 = [[0, 1], [0.2, 0.8], [0, 1], [0, 1], [0, 1], [0.8, 0.2], [0, 1]]
        i2 = [[1, 0], [1, 0], [2, 1], [3, 2], [4, 3], [4, 3], [0, 4]]

        assert_almost_equal(a1.cpu().numpy(), a2, 2)
        assert_equal(i1.cpu().numpy(), i2)

    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_closed_spline(self):
        input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
        kernel_size = torch.cuda.LongTensor([4])
        is_open_spline = torch.cuda.LongTensor([0])
        k_max = 2
        K = 4
        dim = 1
        basis_kernel = get_basis_kernel(k_max, K, dim, 1)
        a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4,
                                      basis_kernel)

        a2 = [[0, 1], [0.2, 0.8], [0, 1], [0, 1], [0, 1], [0.8, 0.2], [0, 1]]
        i2 = [[1, 0], [1, 0], [2, 1], [3, 2], [0, 3], [0, 3], [1, 0]]

        assert_almost_equal(a1.cpu().numpy(), a2, 2)
        assert_equal(i1.cpu().numpy(), i2)
'''
'''
import unittest
import torch
from numpy.testing import assert_equal, assert_almost_equal

if torch.cuda.is_available():
    from .compute_spline_basis import compute_spline_basis
    from .compute_spline_basis import get_basis_kernel


class SplineQuadraticGPUTest(unittest.TestCase):
    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_open_spline(self):
        input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
        kernel_size = torch.cuda.LongTensor([6])
        is_open_spline = torch.cuda.LongTensor([1])
        k_max = 3
        K = 6
        dim = 1
        basis_kernel = get_basis_kernel(k_max, K, dim, 2)
        a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 6,
                                      basis_kernel)

        a2 = [[0.5, 0.5, 0], [0.32, 0.66, 0.02], [0.5, 0.5, 0], [0.5, 0.5, 0],
              [0.5, 0.5, 0], [0.02, 0.66, 0.32], [0.5, 0.5, 0]]
        i2 = [[0, 1, 2], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5], [3, 4, 5],
              [4, 5, 0]]

        assert_almost_equal(a1.cpu().numpy(), a2, 2)
        assert_equal(i1.cpu().numpy(), i2)

    @unittest.skipIf(not torch.cuda.is_available(), 'no GPU')
    def test_closed_spline(self):
        input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
        kernel_size = torch.cuda.LongTensor([4])
        is_open_spline = torch.cuda.LongTensor([0])
        k_max = 3
        K = 4
        dim = 1
        basis_kernel = get_basis_kernel(k_max, K, dim, 2)
        a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4,
                                      basis_kernel)

        a2 = [[0.5, 0.5, 0], [0.32, 0.66, 0.02], [0.5, 0.5, 0], [0.5, 0.5, 0],
              [0.5, 0.5, 0], [0.02, 0.66, 0.32], [0.5, 0.5, 0]]
        i2 = [[0, 1, 2], [0, 1, 2], [1, 2, 3], [2, 3, 0], [3, 0, 1], [3, 0, 1],
              [0, 1, 2]]

        assert_almost_equal(a1.cpu().numpy(), a2, 2)
        assert_equal(i1.cpu().numpy(), i2)
'''