Commit 764cd87d authored by Jan Eric Lenssen's avatar Jan Eric Lenssen
Browse files

splineconv forward now approx. 10 times faster

parent 4b92ba74
......@@ -3,13 +3,105 @@ import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source for the degree-1 (linear) B-spline basis kernel.  One thread per
# (edge, k) pair: k is decomposed bit by bit (k_idx % 2, k_idx >>= 1) into a
# lower/upper knot choice per dimension; the product of the linear basis
# weights is written to amount[idx] and the flattened kernel index to
# index[idx].  num_threads is a runtime kernel argument rather than a
# compile-time template value, so one compiled kernel serves any edge count.
# The ${...} placeholders are substituted by load_kernel.
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# CUDA source for the degree-2 (quadratic) B-spline basis kernel.  One thread
# per (edge, k) pair: k is decomposed base-3 (k_idx % 3, k_idx /= 3) into a
# knot offset per dimension; the product of the quadratic basis weights is
# written to amount[idx] and the flattened kernel index to index[idx].
# num_threads is a runtime kernel argument, so the compiled kernel does not
# depend on the number of edges.  ${...} placeholders come from load_kernel.
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
......@@ -52,35 +144,47 @@ const long* kernel_size, const long* is_open_spline) {
}
'''
def get_basis_kernel(k_max, K, dim, degree):
    """Compile and return the CUDA spline-basis kernel for ``degree``.

    Compiling once up front (instead of on every forward call) is the point
    of this refactor.

    Args:
        k_max: number of non-zero basis products per edge, ``(degree+1)**dim``.
        K: total kernel size (product of per-dimension kernel sizes).
        dim: dimensionality of the edge pseudo-coordinates.
        degree: B-spline degree; 3 -> cubic, 2 -> quadratic, anything else
            falls back to the linear kernel.

    Returns:
        The loaded, callable CUDA kernel function.
    """
    if degree == 3:
        kernel_src = _spline_kernel_cubic
    elif degree == 2:
        kernel_src = _spline_kernel_quadratic
    else:
        # Default: linear basis.
        kernel_src = _spline_kernel_linear
    # A throwaway CUDA tensor merely establishes the device context for
    # compilation.
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        # NOTE(review): Dtype is hard-coded to 'float'; double-precision
        # input would need a separate compilation — confirm callers only
        # pass FloatTensors.
        f = load_kernel(
            'spline_kernel',
            kernel_src,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
    return f
def compute_spline_basis(input, kernel_size, is_open_spline, K, basis_kernel,
                         k_max=None):
    """Compute spline basis weights and kernel indices for every edge.

    Launches the precompiled ``basis_kernel`` (see ``get_basis_kernel``) over
    one thread per (edge, k) pair.

    Args:
        input: CUDA tensor of pseudo-coordinates, ``[num_edges]`` or
            ``[num_edges, dim]``.
        kernel_size: CUDA LongTensor, kernel size per dimension.
        is_open_spline: CUDA LongTensor of 0/1 flags per dimension.
        K: total kernel size (kept for interface compatibility with callers).
        basis_kernel: compiled CUDA kernel returned by ``get_basis_kernel``.
        k_max: basis products per edge, i.e. ``(degree+1)**dim``.  Defaults
            to ``2**dim`` (linear splines) for backward compatibility.
            NOTE(review): the previous hard-coded ``2**dim`` under-allocates
            ``amount``/``index`` when ``basis_kernel`` was compiled for
            quadratic (``3**dim``) or cubic (``4**dim``) splines, causing
            out-of-bounds writes — pass ``k_max`` explicitly for degree > 1.

    Returns:
        Tuple ``(amount, index)``, each of shape ``[num_edges, k_max]``:
        basis weights and flattened kernel indices.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda
    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    input = input.unsqueeze(1) if len(input.size()) < 2 else input
    num_edges, dim = input.size()
    if k_max is None:
        k_max = 2 ** dim
    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()
    with torch.cuda.device_of(input):
        basis_kernel(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(),
                num_threads,
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return amount, index
......@@ -4,11 +4,10 @@ if torch.cuda.is_available():
from .edgewise_spline_weighting_gpu import EdgewiseSplineWeightingGPU
def edgewise_spline_weighting(input, weight, amount, index, k_fw, k_bw):
    """Apply edgewise spline weighting on the GPU.

    Args:
        input: ``[|E| x M_in]`` CUDA feature tensor (one row per edge).
        weight: ``[K x M_in x M_out]`` weight tensor.
        amount: ``[|E| x k_max]`` spline basis weights.
        index: ``[|E| x k_max]`` flattened kernel indices.
        k_fw: precompiled forward CUDA kernel (``get_forward_kernel``).
        k_bw: precompiled backward CUDA kernel (``get_backward_kernel``).

    Returns:
        ``[|E| x M_out]`` weighted edge features.

    Raises:
        NotImplementedError: if ``input`` is not a CUDA tensor.
    """
    if not input.is_cuda:
        raise NotImplementedError
    K, M_in, M_out = weight.size()
    # Note: the GPU op no longer needs k_max here; its kernels were already
    # compiled with it baked in.
    return EdgewiseSplineWeightingGPU(amount, index, K, M_in, M_out,
                                      k_fw, k_bw)(input, weight)
......@@ -95,9 +95,34 @@ const long* index, int num_threads) {
}
'''
def get_forward_kernel(M_in, M_out, k_max):
    """Compile the edgewise spline-weighting forward CUDA kernel once.

    Args:
        M_in: number of input feature channels.
        M_out: number of output feature channels.
        k_max: number of basis products per edge.

    Returns:
        The loaded, callable forward kernel.
    """
    # A dummy tensor selects the active CUDA device for compilation.
    dummy = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(dummy):
        kernel = load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            _edgewise_spline_weighting_forward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
    return kernel
def get_backward_kernel(M_in, M_out, k_max, K):
    """Compile the edgewise spline-weighting backward CUDA kernel once.

    Args:
        M_in: number of input feature channels.
        M_out: number of output feature channels.
        k_max: number of basis products per edge.
        K: total kernel size.

    Returns:
        The loaded, callable backward kernel.
    """
    # A dummy tensor selects the active CUDA device for compilation.
    dummy = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(dummy):
        kernel = load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            _edgewise_spline_weighting_backward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
    return kernel
class EdgewiseSplineWeightingGPU(Function):
def __init__(self, amount, index, K, M_in, M_out, k_max):
def __init__(self, amount, index, K, M_in, M_out, k_fw, k_bw):
super(EdgewiseSplineWeightingGPU, self).__init__()
assert amount.is_cuda and index.is_cuda
self.amount = amount
......@@ -105,23 +130,9 @@ class EdgewiseSplineWeightingGPU(Function):
self.M_in = M_in
self.M_out = M_out
self.K = K
with torch.cuda.device_of(amount):
self.f_fw = load_kernel(
'edgewise_spline_weighting_forward_kernel',
_edgewise_spline_weighting_forward_kernel,
Dtype=Dtype(amount),
M_in=M_in,
M_out=M_out,
k_max=k_max)
self.f_bw = load_kernel(
'edgewise_spline_weighting_backward_kernel',
_edgewise_spline_weighting_backward_kernel,
Dtype=Dtype(amount),
M_in=M_in,
M_out=M_out,
k_max=k_max,
K=K)
self.f_fw = k_fw
self.f_bw = k_bw
def forward(self, input, weight):
assert input.is_cuda and weight.is_cuda
......
import torch
if torch.cuda.is_available():
from .spline_linear_gpu import spline_linear_gpu
from .compute_spline_basis import compute_spline_basis
from .spline_quadratic_gpu import spline_quadratic_gpu
from .spline_cubic_gpu import spline_cubic_gpu
def spline(input, kernel_size, is_open_spline, K, degree, basis_kernel):
    """Dispatch spline basis computation to the GPU implementation.

    ``degree`` no longer drives dispatch — the degree is baked into
    ``basis_kernel`` (see ``get_basis_kernel``) — but the parameter is kept
    so existing call sites remain valid.

    Returns:
        ``(amount, index)`` from ``compute_spline_basis``.

    Raises:
        NotImplementedError: for CPU tensors.
    """
    if not input.is_cuda:
        raise NotImplementedError()
    return compute_spline_basis(input, kernel_size, is_open_spline, K,
                                basis_kernel)
......@@ -13,8 +13,11 @@ def spline_conv(
kernel_size,
is_open_spline,
K,
forward_kernel,
backward_kernel,
basis_kernel,
degree=1,
bias=None):
bias=None,):
if input.dim() == 1:
input = input.unsqueeze(1)
......@@ -25,8 +28,8 @@ def spline_conv(
# Get features for every end vertex with shape [|E| x M_in].
output = input[col]
# Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out].
amount, index = spline(values, kernel_size, is_open_spline, K, degree)
output = edgewise_spline_weighting(output, weight[:-1], amount, index)
amount, index = spline(values, kernel_size, is_open_spline, K, degree, basis_kernel)
output = edgewise_spline_weighting(output, weight[:-1], amount, index, forward_kernel, backward_kernel)
# Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix to
# [n x M_out] feature matrix.
......
......@@ -4,7 +4,8 @@ import torch
from numpy.testing import assert_equal, assert_almost_equal
if torch.cuda.is_available():
from .spline_cubic_gpu import spline_cubic_gpu
from .compute_spline_basis import compute_spline_basis
from .compute_spline_basis import get_basis_kernel
class SplineQuadraticGPUTest(unittest.TestCase):
......@@ -13,8 +14,11 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([7])
is_open_spline = torch.cuda.LongTensor([1])
a1, i1 = spline_cubic_gpu(input, kernel_size, is_open_spline, 7)
k_max = 4
K = 7
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 3)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 7, basis_kernel)
a2 = [
[0.1667, 0.6667, 0.1667, 0],
......@@ -36,8 +40,11 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([4])
is_open_spline = torch.cuda.LongTensor([0])
a1, i1 = spline_cubic_gpu(input, kernel_size, is_open_spline, 4)
k_max = 4
K = 4
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 3)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4, basis_kernel)
a2 = [
[0.1667, 0.6667, 0.1667, 0],
......
import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source for the linear (degree-1) B-spline basis kernel used by
# spline_linear_gpu.  One thread per (edge, k) pair: k is decomposed bit by
# bit into a lower/upper knot choice per dimension; the product of the linear
# basis weights is written to amount[idx] and the flattened kernel index to
# index[idx].  All ${...} placeholders — including num_threads — are baked in
# at compile time by load_kernel.
_spline_kernel = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
def spline_linear_gpu(input, kernel_size, is_open_spline, K):
    """Evaluate linear B-spline bases for all edges on the GPU.

    Compiles the kernel on every call (num_threads is a template value).

    Returns:
        ``(amount, index)`` of shape ``[num_edges, 2**dim]``: basis weights
        and flattened kernel indices.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda
    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    if len(input.size()) < 2:
        input = input.unsqueeze(1)
    num_edges, dim = input.size()
    # Two linear basis functions contribute per dimension.
    k_max = 2 ** dim
    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()
    with torch.cuda.device_of(input):
        kernel_fn = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=Dtype(input),
            num_threads=num_threads,
            k_max=k_max,
            dim=dim,
            K=K)
        kernel_fn(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(),
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return amount, index
......@@ -4,7 +4,8 @@ import torch
from numpy.testing import assert_equal, assert_almost_equal
if torch.cuda.is_available():
from .spline_linear_gpu import spline_linear_gpu
from .compute_spline_basis import compute_spline_basis
from .compute_spline_basis import get_basis_kernel
class SplineLinearGPUTest(unittest.TestCase):
......@@ -13,8 +14,11 @@ class SplineLinearGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([5])
is_open_spline = torch.cuda.LongTensor([1])
a1, i1 = spline_linear_gpu(input, kernel_size, is_open_spline, 5)
k_max = 2
K = 5
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 1)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 5, basis_kernel)
a2 = [[0, 1], [0.2, 0.8], [0, 1], [0, 1], [0, 1], [0.8, 0.2], [0, 1]]
i2 = [[1, 0], [1, 0], [2, 1], [3, 2], [4, 3], [4, 3], [0, 4]]
......@@ -27,8 +31,11 @@ class SplineLinearGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([4])
is_open_spline = torch.cuda.LongTensor([0])
a1, i1 = spline_linear_gpu(input, kernel_size, is_open_spline, 4)
k_max = 2
K = 4
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 1)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4, basis_kernel)
a2 = [[0, 1], [0.2, 0.8], [0, 1], [0, 1], [0, 1], [0.8, 0.2], [0, 1]]
i2 = [[1, 0], [1, 0], [2, 1], [3, 2], [0, 3], [0, 3], [1, 0]]
......
import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source for the quadratic (degree-2) B-spline basis kernel used by
# spline_quadratic_gpu.  One thread per (edge, k) pair: k is decomposed
# base-3 into a knot offset per dimension; the product of the quadratic basis
# weights is written to amount[idx] and the flattened kernel index to
# index[idx].  All ${...} placeholders — including num_threads — are baked in
# at compile time by load_kernel.
_spline_kernel = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
def spline_quadratic_gpu(input, kernel_size, is_open_spline, K):
    """Evaluate quadratic B-spline bases for all edges on the GPU.

    Compiles the kernel on every call (num_threads is a template value).

    Returns:
        ``(amount, index)`` of shape ``[num_edges, 3**dim]``: basis weights
        and flattened kernel indices.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda
    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    if len(input.size()) < 2:
        input = input.unsqueeze(1)
    num_edges, dim = input.size()
    # Three quadratic basis functions contribute per dimension.
    k_max = 3 ** dim
    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()
    with torch.cuda.device_of(input):
        kernel_fn = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=Dtype(input),
            num_threads=num_threads,
            k_max=k_max,
            dim=dim,
            K=K)
        kernel_fn(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(),
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return amount, index
......@@ -4,7 +4,8 @@ import torch
from numpy.testing import assert_equal, assert_almost_equal
if torch.cuda.is_available():
from .spline_quadratic_gpu import spline_quadratic_gpu
from .compute_spline_basis import compute_spline_basis
from .compute_spline_basis import get_basis_kernel
class SplineQuadraticGPUTest(unittest.TestCase):
......@@ -13,8 +14,12 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([6])
is_open_spline = torch.cuda.LongTensor([1])
k_max = 3
K = 6
dim=1
basis_kernel = get_basis_kernel(k_max,K,dim,2)
a1, i1 = spline_quadratic_gpu(input, kernel_size, is_open_spline, 6)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 6, basis_kernel)
a2 = [[0.5, 0.5, 0], [0.32, 0.66, 0.02], [0.5, 0.5, 0], [0.5, 0.5, 0],
[0.5, 0.5, 0], [0.02, 0.66, 0.32], [0.5, 0.5, 0]]
......@@ -29,8 +34,11 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([4])
is_open_spline = torch.cuda.LongTensor([0])
a1, i1 = spline_quadratic_gpu(input, kernel_size, is_open_spline, 4)
k_max = 3
K = 4
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 2)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4, basis_kernel)
a2 = [[0.5, 0.5, 0], [0.32, 0.66, 0.02], [0.5, 0.5, 0], [0.5, 0.5, 0],
[0.5, 0.5, 0], [0.02, 0.66, 0.32], [0.5, 0.5, 0]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment