Commit 764cd87d authored by Jan Eric Lenssen's avatar Jan Eric Lenssen
Browse files

splineconv forward now approx. 10 times faster

parent 4b92ba74
......@@ -3,13 +3,105 @@ import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source for the degree-1 (linear) B-spline basis kernel.  One thread per
# (edge, k) pair: k is decomposed bit by bit (k_idx % 2, k_idx >>= 1) into a
# lower/upper knot choice per dimension; the product of the linear basis
# weights is written to amount[idx] and the flattened kernel index to
# index[idx].  num_threads is a runtime kernel argument rather than a
# compile-time template value, so one compiled kernel serves any edge count.
# The ${...} placeholders are substituted by load_kernel.
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# CUDA source for the degree-2 (quadratic) B-spline basis kernel.  One thread
# per (edge, k) pair: k is decomposed base-3 (k_idx % 3, k_idx /= 3) into a
# knot offset per dimension; the product of the quadratic basis weights is
# written to amount[idx] and the flattened kernel index to index[idx].
# num_threads is a runtime kernel argument, so the compiled kernel does not
# depend on the number of edges.  ${...} placeholders come from load_kernel.
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
......@@ -52,35 +144,47 @@ const long* kernel_size, const long* is_open_spline) {
}
'''
def get_basis_kernel(k_max, K, dim, degree):
    """Compile and return the CUDA spline-basis kernel for ``degree``.

    Compiling once up front (instead of on every forward call) is the point
    of this refactor.

    Args:
        k_max: number of non-zero basis products per edge, ``(degree+1)**dim``.
        K: total kernel size (product of per-dimension kernel sizes).
        dim: dimensionality of the edge pseudo-coordinates.
        degree: B-spline degree; 3 -> cubic, 2 -> quadratic, anything else
            falls back to the linear kernel.

    Returns:
        The loaded, callable CUDA kernel function.
    """
    if degree == 3:
        kernel_src = _spline_kernel_cubic
    elif degree == 2:
        kernel_src = _spline_kernel_quadratic
    else:
        # Default: linear basis.
        kernel_src = _spline_kernel_linear
    # A throwaway CUDA tensor merely establishes the device context for
    # compilation.
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        # NOTE(review): Dtype is hard-coded to 'float'; double-precision
        # input would need a separate compilation — confirm callers only
        # pass FloatTensors.
        f = load_kernel(
            'spline_kernel',
            kernel_src,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
    return f
def compute_spline_basis(input, kernel_size, is_open_spline, K, basis_kernel,
                         k_max=None):
    """Compute spline basis weights and kernel indices for every edge.

    Launches the precompiled ``basis_kernel`` (see ``get_basis_kernel``) over
    one thread per (edge, k) pair.

    Args:
        input: CUDA tensor of pseudo-coordinates, ``[num_edges]`` or
            ``[num_edges, dim]``.
        kernel_size: CUDA LongTensor, kernel size per dimension.
        is_open_spline: CUDA LongTensor of 0/1 flags per dimension.
        K: total kernel size (kept for interface compatibility with callers).
        basis_kernel: compiled CUDA kernel returned by ``get_basis_kernel``.
        k_max: basis products per edge, i.e. ``(degree+1)**dim``.  Defaults
            to ``2**dim`` (linear splines) for backward compatibility.
            NOTE(review): the previous hard-coded ``2**dim`` under-allocates
            ``amount``/``index`` when ``basis_kernel`` was compiled for
            quadratic (``3**dim``) or cubic (``4**dim``) splines, causing
            out-of-bounds writes — pass ``k_max`` explicitly for degree > 1.

    Returns:
        Tuple ``(amount, index)``, each of shape ``[num_edges, k_max]``:
        basis weights and flattened kernel indices.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda
    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    input = input.unsqueeze(1) if len(input.size()) < 2 else input
    num_edges, dim = input.size()
    if k_max is None:
        k_max = 2 ** dim
    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()
    with torch.cuda.device_of(input):
        basis_kernel(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(),
                num_threads,
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return amount, index
......@@ -4,11 +4,10 @@ if torch.cuda.is_available():
from .edgewise_spline_weighting_gpu import EdgewiseSplineWeightingGPU
def edgewise_spline_weighting(input, weight, amount, index, k_fw, k_bw):
    """Apply edgewise spline weighting on the GPU.

    Args:
        input: ``[|E| x M_in]`` CUDA feature tensor (one row per edge).
        weight: ``[K x M_in x M_out]`` weight tensor.
        amount: ``[|E| x k_max]`` spline basis weights.
        index: ``[|E| x k_max]`` flattened kernel indices.
        k_fw: precompiled forward CUDA kernel (``get_forward_kernel``).
        k_bw: precompiled backward CUDA kernel (``get_backward_kernel``).

    Returns:
        ``[|E| x M_out]`` weighted edge features.

    Raises:
        NotImplementedError: if ``input`` is not a CUDA tensor.
    """
    if not input.is_cuda:
        raise NotImplementedError
    K, M_in, M_out = weight.size()
    # Note: the GPU op no longer needs k_max here; its kernels were already
    # compiled with it baked in.
    return EdgewiseSplineWeightingGPU(amount, index, K, M_in, M_out,
                                      k_fw, k_bw)(input, weight)
......@@ -95,9 +95,34 @@ const long* index, int num_threads) {
}
'''
def get_forward_kernel(M_in, M_out, k_max):
    """Compile the edgewise spline-weighting forward CUDA kernel once.

    Args:
        M_in: number of input feature channels.
        M_out: number of output feature channels.
        k_max: number of basis products per edge.

    Returns:
        The loaded, callable forward kernel.
    """
    # A dummy tensor selects the active CUDA device for compilation.
    dummy = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(dummy):
        kernel = load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            _edgewise_spline_weighting_forward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
    return kernel
def get_backward_kernel(M_in, M_out, k_max, K):
    """Compile the edgewise spline-weighting backward CUDA kernel once.

    Args:
        M_in: number of input feature channels.
        M_out: number of output feature channels.
        k_max: number of basis products per edge.
        K: total kernel size.

    Returns:
        The loaded, callable backward kernel.
    """
    # A dummy tensor selects the active CUDA device for compilation.
    dummy = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(dummy):
        kernel = load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            _edgewise_spline_weighting_backward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
    return kernel
class EdgewiseSplineWeightingGPU(Function):
def __init__(self, amount, index, K, M_in, M_out, k_max):
def __init__(self, amount, index, K, M_in, M_out, k_fw, k_bw):
super(EdgewiseSplineWeightingGPU, self).__init__()
assert amount.is_cuda and index.is_cuda
self.amount = amount
......@@ -105,23 +130,9 @@ class EdgewiseSplineWeightingGPU(Function):
self.M_in = M_in
self.M_out = M_out
self.K = K
with torch.cuda.device_of(amount):
self.f_fw = load_kernel(
'edgewise_spline_weighting_forward_kernel',
_edgewise_spline_weighting_forward_kernel,
Dtype=Dtype(amount),
M_in=M_in,
M_out=M_out,
k_max=k_max)
self.f_bw = load_kernel(
'edgewise_spline_weighting_backward_kernel',
_edgewise_spline_weighting_backward_kernel,
Dtype=Dtype(amount),
M_in=M_in,
M_out=M_out,
k_max=k_max,
K=K)
self.f_fw = k_fw
self.f_bw = k_bw
def forward(self, input, weight):
assert input.is_cuda and weight.is_cuda
......
import torch
if torch.cuda.is_available():
from .spline_linear_gpu import spline_linear_gpu
from .compute_spline_basis import compute_spline_basis
from .spline_quadratic_gpu import spline_quadratic_gpu
from .spline_cubic_gpu import spline_cubic_gpu
def spline(input, kernel_size, is_open_spline, K, degree, basis_kernel):
    """Dispatch spline basis computation to the GPU implementation.

    ``degree`` no longer drives dispatch — the degree is baked into
    ``basis_kernel`` (see ``get_basis_kernel``) — but the parameter is kept
    so existing call sites remain valid.

    Returns:
        ``(amount, index)`` from ``compute_spline_basis``.

    Raises:
        NotImplementedError: for CPU tensors.
    """
    if not input.is_cuda:
        raise NotImplementedError()
    return compute_spline_basis(input, kernel_size, is_open_spline, K,
                                basis_kernel)
......@@ -13,8 +13,11 @@ def spline_conv(
kernel_size,
is_open_spline,
K,
forward_kernel,
backward_kernel,
basis_kernel,
degree=1,
bias=None):
bias=None,):
if input.dim() == 1:
input = input.unsqueeze(1)
......@@ -25,8 +28,8 @@ def spline_conv(
# Get features for every end vertex with shape [|E| x M_in].
output = input[col]
# Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out].
amount, index = spline(values, kernel_size, is_open_spline, K, degree)
output = edgewise_spline_weighting(output, weight[:-1], amount, index)
amount, index = spline(values, kernel_size, is_open_spline, K, degree, basis_kernel)
output = edgewise_spline_weighting(output, weight[:-1], amount, index, forward_kernel, backward_kernel)
# Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix to
# [n x M_out] feature matrix.
......
......@@ -4,7 +4,8 @@ import torch
from numpy.testing import assert_equal, assert_almost_equal
if torch.cuda.is_available():
from .spline_cubic_gpu import spline_cubic_gpu
from .compute_spline_basis import compute_spline_basis
from .compute_spline_basis import get_basis_kernel
class SplineQuadraticGPUTest(unittest.TestCase):
......@@ -13,8 +14,11 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([7])
is_open_spline = torch.cuda.LongTensor([1])
a1, i1 = spline_cubic_gpu(input, kernel_size, is_open_spline, 7)
k_max = 4
K = 7
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 3)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 7, basis_kernel)
a2 = [
[0.1667, 0.6667, 0.1667, 0],
......@@ -36,8 +40,11 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([4])
is_open_spline = torch.cuda.LongTensor([0])
a1, i1 = spline_cubic_gpu(input, kernel_size, is_open_spline, 4)
k_max = 4
K = 4
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 3)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4, basis_kernel)
a2 = [
[0.1667, 0.6667, 0.1667, 0],
......
import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source for the linear (degree-1) B-spline basis kernel used by
# spline_linear_gpu.  One thread per (edge, k) pair: k is decomposed bit by
# bit into a lower/upper knot choice per dimension; the product of the linear
# basis weights is written to amount[idx] and the flattened kernel index to
# index[idx].  All ${...} placeholders — including num_threads — are baked in
# at compile time by load_kernel.
_spline_kernel = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
def spline_linear_gpu(input, kernel_size, is_open_spline, K):
    """Evaluate linear B-spline bases for all edges on the GPU.

    Compiles the kernel on every call (num_threads is a template value).

    Returns:
        ``(amount, index)`` of shape ``[num_edges, 2**dim]``: basis weights
        and flattened kernel indices.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda
    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    if len(input.size()) < 2:
        input = input.unsqueeze(1)
    num_edges, dim = input.size()
    # Two linear basis functions contribute per dimension.
    k_max = 2 ** dim
    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()
    with torch.cuda.device_of(input):
        kernel_fn = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=Dtype(input),
            num_threads=num_threads,
            k_max=k_max,
            dim=dim,
            K=K)
        kernel_fn(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(),
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return amount, index
......@@ -4,7 +4,8 @@ import torch
from numpy.testing import assert_equal, assert_almost_equal
if torch.cuda.is_available():
from .spline_linear_gpu import spline_linear_gpu
from .compute_spline_basis import compute_spline_basis
from .compute_spline_basis import get_basis_kernel
class SplineLinearGPUTest(unittest.TestCase):
......@@ -13,8 +14,11 @@ class SplineLinearGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([5])
is_open_spline = torch.cuda.LongTensor([1])
a1, i1 = spline_linear_gpu(input, kernel_size, is_open_spline, 5)
k_max = 2
K = 5
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 1)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 5, basis_kernel)
a2 = [[0, 1], [0.2, 0.8], [0, 1], [0, 1], [0, 1], [0.8, 0.2], [0, 1]]
i2 = [[1, 0], [1, 0], [2, 1], [3, 2], [4, 3], [4, 3], [0, 4]]
......@@ -27,8 +31,11 @@ class SplineLinearGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([4])
is_open_spline = torch.cuda.LongTensor([0])
a1, i1 = spline_linear_gpu(input, kernel_size, is_open_spline, 4)
k_max = 2
K = 4
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 1)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4, basis_kernel)
a2 = [[0, 1], [0.2, 0.8], [0, 1], [0, 1], [0, 1], [0.8, 0.2], [0, 1]]
i2 = [[1, 0], [1, 0], [2, 1], [3, 2], [0, 3], [0, 3], [1, 0]]
......
import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source for the quadratic (degree-2) B-spline basis kernel used by
# spline_quadratic_gpu.  One thread per (edge, k) pair: k is decomposed
# base-3 into a knot offset per dimension; the product of the quadratic basis
# weights is written to amount[idx] and the flattened kernel index to
# index[idx].  All ${...} placeholders — including num_threads — are baked in
# at compile time by load_kernel.
_spline_kernel = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
def spline_quadratic_gpu(input, kernel_size, is_open_spline, K):
    """Evaluate quadratic B-spline bases for all edges on the GPU.

    Compiles the kernel on every call (num_threads is a template value).

    Returns:
        ``(amount, index)`` of shape ``[num_edges, 3**dim]``: basis weights
        and flattened kernel indices.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda
    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    if len(input.size()) < 2:
        input = input.unsqueeze(1)
    num_edges, dim = input.size()
    # Three quadratic basis functions contribute per dimension.
    k_max = 3 ** dim
    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()
    with torch.cuda.device_of(input):
        kernel_fn = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=Dtype(input),
            num_threads=num_threads,
            k_max=k_max,
            dim=dim,
            K=K)
        kernel_fn(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(),
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return amount, index
......@@ -4,7 +4,8 @@ import torch
from numpy.testing import assert_equal, assert_almost_equal
if torch.cuda.is_available():
from .spline_quadratic_gpu import spline_quadratic_gpu
from .compute_spline_basis import compute_spline_basis
from .compute_spline_basis import get_basis_kernel
class SplineQuadraticGPUTest(unittest.TestCase):
......@@ -13,8 +14,12 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([6])
is_open_spline = torch.cuda.LongTensor([1])
k_max = 3
K = 6
dim=1
basis_kernel = get_basis_kernel(k_max,K,dim,2)
a1, i1 = spline_quadratic_gpu(input, kernel_size, is_open_spline, 6)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 6, basis_kernel)
a2 = [[0.5, 0.5, 0], [0.32, 0.66, 0.02], [0.5, 0.5, 0], [0.5, 0.5, 0],
[0.5, 0.5, 0], [0.02, 0.66, 0.32], [0.5, 0.5, 0]]
......@@ -29,8 +34,11 @@ class SplineQuadraticGPUTest(unittest.TestCase):
input = torch.cuda.FloatTensor([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
kernel_size = torch.cuda.LongTensor([4])
is_open_spline = torch.cuda.LongTensor([0])
a1, i1 = spline_quadratic_gpu(input, kernel_size, is_open_spline, 4)
k_max = 3
K = 4
dim = 1
basis_kernel = get_basis_kernel(k_max, K, dim, 2)
a1, i1 = compute_spline_basis(input, kernel_size, is_open_spline, 4, basis_kernel)
a2 = [[0.5, 0.5, 0], [0.32, 0.66, 0.02], [0.5, 0.5, 0], [0.5, 0.5, 0],
[0.5, 0.5, 0], [0.02, 0.66, 0.32], [0.5, 0.5, 0]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment