Commit e2e17501 authored by Jan Eric Lenssen's avatar Jan Eric Lenssen
Browse files

first version bp_to_adj, not working atm

parent 8383b476
import torch
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source template for degree-1 (linear) B-spline basis evaluation.
# Placeholders (${Dtype}, ${k_max}, ${dim}, ${K}) are substituted by
# `load_kernel`. One thread per (edge, basis-combination) pair: it writes the
# product of per-dimension basis weights into `amount` and the flattened
# kernel-table index into `index`.
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx];
value *= kernel_size[d_idx] - is_open_spline[d_idx];
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# CUDA source template for degree-2 (quadratic) B-spline basis evaluation.
# Same output contract as the linear variant; each dimension contributes one
# of three quadratic basis segments selected by `k_idx_mod`.
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# CUDA source template for degree-3 (cubic) B-spline basis evaluation.
# BUGFIX: the original read `CUDA_KERNEL_LOOP(idx, num_threads})` — the stray
# `}` is a CUDA syntax error that made the kernel fail to compile.
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 4;
k_idx /= 4;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (3 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= (1 - frac) * (1 - frac) * (1 - frac) / 6.0;
else if (k_idx_mod == 1)
a *= (3 * frac * frac * frac - 6 * frac * frac + 4) / 6.0;
else if (k_idx_mod == 2)
a *= (-3 * frac * frac * frac + 3 * frac * frac + 3 * frac + 1) / 6.0;
else a *= frac * frac * frac / 6.0;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
def get_basis_kernel(k_max, K, dim, degree):
    """Compile and return the CUDA basis kernel for the given spline degree.

    Degrees 2 and 3 select the quadratic/cubic templates; everything else
    falls back to the linear kernel.
    """
    sources = {3: _spline_kernel_cubic, 2: _spline_kernel_quadratic}
    source = sources.get(degree, _spline_kernel_linear)
    # Compile on the currently selected CUDA device.
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'spline_kernel',
            source,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
def compute_spline_basis(input, kernel_size, is_open_spline, K, basis_kernel):
    """Evaluate B-spline basis products for all edges via `basis_kernel`.

    Returns a pair `(amount, index)` of shape `[num_edges, 2**dim]` holding
    the basis weights and flattened kernel indices per edge.
    """
    assert input.is_cuda and kernel_size.is_cuda and is_open_spline.is_cuda

    # Promote 1-D pseudo-coordinates to a [num_edges, 1] matrix.
    if input.dim() < 2:
        input = input.unsqueeze(1)
    num_edges, dim = input.size()
    k_max = 2**dim

    amount = input.new(num_edges, k_max)
    index = input.new(num_edges, k_max).long()
    num_threads = amount.numel()

    with torch.cuda.device_of(input):
        basis_kernel(
            block=(cuda_num_threads, 1, 1),
            grid=(get_blocks(num_threads), 1, 1),
            args=[
                input.data_ptr(),
                amount.data_ptr(),
                index.data_ptr(),
                kernel_size.data_ptr(),
                is_open_spline.data_ptr(), num_threads
            ],
            stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    return amount, index
import torch
if torch.cuda.is_available():
from .edgewise_spline_weighting_gpu import EdgewiseSplineWeightingGPU
def edgewise_spline_weighting(input, weight, amount, index, k_fw, k_bw):
    """Dispatch edgewise spline weighting to the GPU implementation."""
    if not input.is_cuda:
        # No CPU fallback exists for this operation.
        raise NotImplementedError
    K, M_in, M_out = weight.size()
    op = EdgewiseSplineWeightingGPU(amount, index, K, M_in, M_out, k_fw, k_bw)
    return op(input, weight)
import torch
from torch.autograd import Function
from ....utils.cuda import (cuda_num_threads, Stream, Dtype, load_kernel,
kernel_loop, get_blocks)
# CUDA source template for the forward edgewise weighting pass. One thread
# per (edge, output-channel); it accumulates `amount * weight * feature` over
# all basis combinations and input channels into `output`.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
const ${Dtype}* amount, const long* index) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} result = 0.0;
${Dtype} w;
${Dtype} f;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
f = input[e_idx * ${M_in} + m_in_idx];
result += b * w * f;
}
}
output[idx] = result;
}
}
'''
# CUDA source template for the backward weighting pass. Accumulates (via
# atomicAdd) gradients w.r.t. the input features, the weights, and the basis
# products ("b") into `grad_input`, `grad_weight` and `grad_b`.
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
${Dtype}* grad_b, const ${Dtype}* input, const ${Dtype}* weight,
const ${Dtype}* amount, const long* index) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
${Dtype} b_grad;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
//Calculate gradient of B
b_grad = f * w * g;
atomicAdd(&(grad_b[k]), b_grad);
}
}
}
}
'''
# Linear B-spline basis kernel (template), same math as `_spline_kernel_linear`
# elsewhere in the project but with `${num_threads}` substituted at compile
# time instead of passed as an argument.
_spline_kernel = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# NOTE(review): this "backwards" kernel is byte-identical to `_spline_kernel`
# above — presumably a placeholder for a real basis-gradient kernel in this
# WIP commit; confirm before relying on it for backpropagation.
_spline_kernel_backwards = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline) {
CUDA_KERNEL_LOOP(idx, ${num_threads}) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - is_open_spline[d_idx]);
frac = value - floor(value);
a *= (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += (k_idx_mod * bot + (1 - k_idx_mod) * top) * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
class EdgewiseSplineWeightingGPU(Function):
    """Legacy autograd op: spline basis computation + edgewise weighting.

    The forward pass first evaluates the linear B-spline basis for the edge
    values, then applies the edgewise weighting kernel. The backward pass
    accumulates gradients w.r.t. the input features, the weights and the
    basis products.

    BUGFIX (backward): the original re-created `grad_b` as zeros *after* the
    kernel had filled it (discarding the computed gradients) and then
    returned an undefined name `grad_u`, raising a NameError.
    """

    def __init__(self, kernel_size, is_open_spline, K):
        super(EdgewiseSplineWeightingGPU, self).__init__()
        assert kernel_size.is_cuda and is_open_spline.is_cuda
        self.kernel_size = kernel_size
        self.is_open_spline = is_open_spline
        # Total number of kernel weight rows (product of kernel sizes).
        self.K = K

    def forward(self, input, values, weight):
        assert input.is_cuda and weight.is_cuda
        self.save_for_backward(input, weight)

        # --- Basis computation -------------------------------------------
        values = values.unsqueeze(1) if len(values.size()) < 2 else values
        num_edges, dim = values.size()
        k_max = 2 ** dim  # Number of linear basis combinations per edge.

        amount = values.new(num_edges, k_max)
        index = values.new(num_edges, k_max).long()
        num_threads = amount.numel()

        with torch.cuda.device_of(input):
            f = load_kernel(
                'spline_kernel',
                _spline_kernel,
                Dtype=Dtype(input),
                num_threads=num_threads,
                k_max=k_max,
                dim=dim,
                K=self.K)
            f(block=(cuda_num_threads, 1, 1),
              grid=(get_blocks(num_threads), 1, 1),
              args=[
                  values.data_ptr(),
                  amount.data_ptr(),
                  index.data_ptr(),
                  self.kernel_size.data_ptr(),
                  self.is_open_spline.data_ptr()
              ],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # Stash intermediates for backward (not inputs/outputs, so they
        # cannot go through save_for_backward).
        self.amount = amount
        self.index = index

        # --- Edgewise weighting ------------------------------------------
        _, M_in, M_out = weight.size()
        k_max = self.amount.size(1)
        output = input.new(input.size(0), M_out)
        num_threads = output.numel()

        with torch.cuda.device_of(input):
            f = load_kernel(
                'edgewise_spline_weighting_forward_kernel',
                _edgewise_spline_weighting_forward_kernel,
                Dtype=Dtype(input),
                num_threads=num_threads,
                M_in=M_in,
                M_out=M_out,
                k_max=k_max)
            f(block=(cuda_num_threads, 1, 1),
              grid=(get_blocks(num_threads), 1, 1),
              args=[
                  input.data_ptr(),
                  weight.data_ptr(),
                  output.data_ptr(),
                  self.amount.data_ptr(),
                  self.index.data_ptr()
              ],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        return output

    def backward(self, grad_output):
        input, weight = self.saved_tensors
        K, M_in, M_out = weight.size()
        E, k_max = self.amount.size()

        grad_input = grad_output.new(input.size(0), M_in).fill_(0)
        grad_weight = grad_output.new(K, M_in, M_out).fill_(0)
        grad_b = grad_output.new(E, k_max).fill_(0)
        num_threads = grad_output.numel()

        with torch.cuda.device_of(grad_output):
            f = load_kernel(
                'edgewise_spline_weighting_backward_kernel',
                _edgewise_spline_weighting_backward_kernel,
                Dtype=Dtype(input),
                num_threads=num_threads,
                M_in=M_in,
                M_out=M_out,
                k_max=k_max,
                K=K)
            f(block=(cuda_num_threads, 1, 1),
              grid=(get_blocks(num_threads), 1, 1),
              args=[
                  grad_output.data_ptr(),
                  grad_input.data_ptr(),
                  grad_weight.data_ptr(),
                  grad_b.data_ptr(),
                  input.data_ptr(),
                  weight.data_ptr(),
                  self.amount.data_ptr(),
                  self.index.data_ptr()
              ],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # NOTE(review): forward takes (input, values, weight), so autograd
        # expects gradients in that order; `grad_b` is the gradient w.r.t.
        # the basis products `amount` (shape [E, k_max]), not w.r.t.
        # `values` — chaining it through the basis functions is still TODO.
        return grad_input, grad_weight, grad_b
import torch
from torch.autograd import Function
from ....utils.cuda import (cuda_num_threads, Stream, load_kernel, kernel_loop,
get_blocks)
# Forward weighting kernel template; same computation as the other forward
# variant in this commit, but `num_threads` is a runtime argument rather than
# a compile-time substitution, so the compiled kernel is reusable across
# differently sized launches.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
const ${Dtype}* amount, const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} result = 0.0;
${Dtype} w;
${Dtype} f;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
f = input[e_idx * ${M_in} + m_in_idx];
result += b * w * f;
}
}
output[idx] = result;
}
}
'''
# Backward weighting kernel template (runtime `num_threads`): accumulates
# input and weight gradients via atomicAdd; does NOT produce basis gradients
# (compare the `grad_b` variant elsewhere in this commit).
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
const ${Dtype}* input, const ${Dtype}* weight, const ${Dtype}* amount,
const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * g;
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
}
}
}
}
'''
def get_forward_kernel(M_in, M_out, k_max):
    """Compile the forward edgewise-weighting CUDA kernel for fixed sizes."""
    # Anchor tensor selects the CUDA device the kernel is compiled on.
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            _edgewise_spline_weighting_forward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
def get_backward_kernel(M_in, M_out, k_max, K):
    """Compile the backward edgewise-weighting CUDA kernel for fixed sizes."""
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            _edgewise_spline_weighting_backward_kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
class EdgewiseSplineWeightingGPU(Function):
    """Edgewise spline weighting with precompiled CUDA kernels.

    Unlike the sibling implementation that compiles kernels inside
    `forward`, this variant receives already-compiled forward/backward
    kernels (`k_fw`, `k_bw`) plus precomputed basis products (`amount`)
    and kernel indices (`index`).
    """

    def __init__(self, amount, index, K, M_in, M_out, k_fw, k_bw):
        super(EdgewiseSplineWeightingGPU, self).__init__()
        assert amount.is_cuda and index.is_cuda
        # Precomputed per-edge basis products and flattened kernel indices.
        self.amount = amount
        self.index = index
        self.M_in = M_in
        self.M_out = M_out
        self.K = K
        # Precompiled CUDA kernels for forward/backward.
        self.f_fw = k_fw
        self.f_bw = k_bw

    def forward(self, input, weight):
        """Weight per-edge features: returns an [|E| x M_out] tensor."""
        assert input.is_cuda and weight.is_cuda
        self.save_for_backward(input, weight)
        output = input.new(input.size(0), self.M_out)
        # One thread per output element.
        num_threads = output.numel()
        with torch.cuda.device_of(input):
            self.f_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr(),
                    self.amount.data_ptr(),
                    self.index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return output

    def backward(self, grad_output):
        """Return gradients w.r.t. `input` and `weight` (accumulated on GPU)."""
        input, weight = self.saved_tensors
        grad_input = grad_output.new(input.size(0), self.M_in).fill_(0)
        grad_weight = grad_output.new(self.K, self.M_in, self.M_out).fill_(0)
        num_threads = grad_output.numel()
        with torch.cuda.device_of(grad_output):
            self.f_bw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    grad_output.data_ptr(),
                    grad_input.data_ptr(),
                    grad_weight.data_ptr(),
                    input.data_ptr(),
                    weight.data_ptr(),
                    self.amount.data_ptr(),
                    self.index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return grad_input, grad_weight
import torch
if torch.cuda.is_available():
from .compute_spline_basis import compute_spline_basis
def spline(input, kernel_size, is_open_spline, K, degree, basis_kernel):
    """Compute the B-spline basis for `input` on the GPU."""
    if not input.is_cuda:
        # There is no CPU implementation of the basis computation.
        raise NotImplementedError()
    return compute_spline_basis(input, kernel_size, is_open_spline, K,
                                basis_kernel)
...@@ -2,23 +2,22 @@ import torch ...@@ -2,23 +2,22 @@ import torch
from torch.autograd import Variable from torch.autograd import Variable
import time import time
from .spline import spline from .spline_conv_gpu import SplineConvGPU
from .edgewise_spline_weighting import edgewise_spline_weighting
def spline_conv( def spline_conv(
adj, # Tensor adj, # Pytorch Tensor (!bp_to_adj) or Pytorch Variable (bp_to_adj)
input, # Variable input, # Pytorch Variable
weight, # Variable weight, # Pytorch Variable
kernel_size, kernel_size, # Rest tensors or python variables
is_open_spline, is_open_spline,
K, K,
forward_kernel, weighting_kernel,
backward_kernel, weighting_backward_kernel,
basis_kernel, basis_kernel,
basis_backward_kernel=None,
degree=1, degree=1,
bias=None): bias=None,
bp_to_adj=False):
if input.dim() == 1: if input.dim() == 1:
input = input.unsqueeze(1) input = input.unsqueeze(1)
...@@ -29,12 +28,14 @@ def spline_conv( ...@@ -29,12 +28,14 @@ def spline_conv(
# Get features for every end vertex with shape [|E| x M_in]. # Get features for every end vertex with shape [|E| x M_in].
output = input[col] output = input[col]
# Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out]. # Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out].
if output.is_cuda:
amount, index = spline(values, kernel_size, is_open_spline, K, degree, output = SplineConvGPU(kernel_size, is_open_spline, K, degree,
basis_kernel) basis_kernel, basis_backward_kernel,
weighting_kernel, weighting_backward_kernel,
output = edgewise_spline_weighting(output, weight[:-1], amount, index, bp_to_adj)(output, weight[:-1], values)
forward_kernel, backward_kernel) else:
# CPU Implementation not available
raise NotImplementedError()
# Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix to # Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix to
# [n x M_out] feature matrix. # [n x M_out] feature matrix.
......
import torch
from torch.autograd import Variable
from .spline import spline
from .edgewise_spline_weighting_bp2adj_gpu import EdgewiseSplineWeightingGPU
def spline_conv_bp2adj(
        adj,  # Tensor (sparse)
        input,  # Variable
        weight,  # Variable
        kernel_size,
        is_open_spline,
        K,
        degree=1,
        bias=None):
    """Spline convolution that also backpropagates to the adjacency values.

    BUGFIX: `EdgewiseSplineWeightingGPU.forward` has the signature
    `(input, values, weight)`, but the original call passed
    `(output, weight[:-1], values)`, swapping the last two arguments.
    """
    values = adj._values()
    row, col = adj._indices()

    # Get features for every end vertex with shape [|E| x M_in].
    output = input[col]

    # Convert to [|E| x M_in] feature matrix and calculate [|E| x M_out].
    output = EdgewiseSplineWeightingGPU(kernel_size, is_open_spline, K)(
        output, values, weight[:-1])

    # Convolution via `scatter_add`. Converts [|E| x M_out] feature matrix
    # to [n x M_out] feature matrix.
    zero = output.data.new(adj.size(1), output.size(1)).fill_(0.0)
    zero = Variable(zero) if not torch.is_tensor(output) else zero
    r = row.view(-1, 1).expand(row.size(0), output.size(1))
    output = zero.scatter_add_(0, Variable(r), output)

    # Weighten root node features by multiplying with root weight.
    output += torch.mm(input, weight[-1])

    # Normalize output by (clamped) vertex degree.
    ones = values.new(values.size(0)).fill_(1)
    zero = values.new(output.size(0)).fill_(0)
    degree = zero.scatter_add_(0, row, ones)
    degree = torch.clamp(degree, min=1)
    output = output / Variable(degree.view(-1, 1))

    if bias is not None:
        output += bias

    return output
import torch
from torch.autograd import Function
from ....utils.cuda import (cuda_num_threads, Stream, load_kernel, kernel_loop,
get_blocks)
# Forward edgewise weighting kernel template (runtime `num_threads`); one
# thread per (edge, output-channel), summing `amount * weight * feature`
# over basis combinations and input channels.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
const ${Dtype}* amount, const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} result = 0.0;
${Dtype} w;
${Dtype} f;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
f = input[e_idx * ${M_in} + m_in_idx];
result += b * w * f;
}
}
output[idx] = result;
}
}
'''
# Backward weighting kernel template (no adjacency gradients); accumulates
# input and weight gradients with atomicAdd.
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
const ${Dtype}* input, const ${Dtype}* weight, const ${Dtype}* amount,
const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * g;
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
}
}
}
}
'''
# Backward weighting kernel for the bp-to-adjacency path: additionally
# accumulates d(output)/d(amount) into `grad_amount`.
# BUGFIXES vs. the original:
#  * `adj_g` was never declared (CUDA compile error) and never reset per
#    basis combination — it is now declared and zeroed at the top of each
#    `k_idx` iteration.
#  * `grad_amount[e_idx,k_idx]` used the C comma operator (indexing only by
#    `k_idx`); the flattened index `k = e_idx * k_max + k_idx` is used now.
_edgewise_spline_weighting_backward_kernel_bp2adj = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
${Dtype}* grad_amount, const ${Dtype}* input, const ${Dtype}* weight,
const ${Dtype}* amount, const long* index, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${M_out};
const int m_out_idx = idx % ${M_out};
${Dtype} w;
${Dtype} g;
${Dtype} f;
${Dtype} w_grad;
${Dtype} adj_g;
int k;
${Dtype} b;
long c;
long w_idx;
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k = e_idx * ${k_max} + k_idx;
b = amount[k];
c = index[k];
adj_g = 0.0;
for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
w_idx = c * ${M_out} * ${M_in} +
m_in_idx * ${M_out} +
m_out_idx;
w = weight[w_idx];
// Calculate input gradient.
g = grad_output[e_idx * ${M_out} + m_out_idx];
atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
// This is inefficient: `reduce_sum` shouldn't be done like this.
// Looping over `M_out` would be better to avoid the `atomicAdd`.
// Calculate weight gradient.
f = input[e_idx * ${M_in} + m_in_idx];
w_grad = f * b * g;
atomicAdd(&(grad_weight[w_idx]), w_grad);
// Not so efficient either, but not avoidable.
// Calculate B-spline basis tensor product gradient
adj_g += g * f * w;
}
atomicAdd(&(grad_amount[k]), adj_g);
}
}
}
'''
def get_weighting_forward_kernel(M_in, M_out, k_max, bt_to_adj=False):
    """Compile the forward weighting kernel for fixed `M_in`/`M_out`/`k_max`.

    `bt_to_adj` is kept for interface symmetry with
    `get_weighting_backward_kernel`; the forward pass is the same whether or
    not adjacency gradients are required, so the original's if/else (both
    branches selecting the identical source) has been collapsed.
    """
    kernel = _edgewise_spline_weighting_forward_kernel
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f_fw = load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            kernel,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
    return f_fw
def get_weighting_backward_kernel(M_in, M_out, k_max, K, bt_to_adj=False):
    """Compile the backward weighting kernel.

    When `bt_to_adj` is set, the variant that also produces gradients
    w.r.t. the basis products is used.
    """
    source = (_edgewise_spline_weighting_backward_kernel_bp2adj
              if bt_to_adj else _edgewise_spline_weighting_backward_kernel)
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            source,
            Dtype='float',
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
# Linear basis kernel (bp2adj variant): clamps value == 1.0 to stay inside
# the last interval, and accumulates the flattened index with `K` built up
# per dimension (K *= kernel_size at the end of each iteration) instead of
# dividing down from ${K}.
# BUGFIX: `int K = 1.0;` initialized an int with a float literal; now `1`.
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = 1;
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
k_idx_mod = k_idx % 2;
k_idx >>= 1;
value = input[e_idx * ${dim} + d_idx];
if(value==1.0)
value -= 0.000001;
value *= kernel_size[d_idx] - is_open_spline[d_idx];
frac = value - floor(value);
a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
bot = int(floor(value));
top = (bot + 1) % kernel_size[d_idx];
bot %= kernel_size[d_idx];
i += ((1 - k_idx_mod) * bot + k_idx_mod * top) * K;
K *= kernel_size[d_idx];
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# Quadratic basis kernel template (duplicate of the variant earlier in this
# commit); per dimension one of three quadratic segments is selected by
# `k_idx_mod`.
_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 3;
k_idx /= 3;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= 0.5 * (1- frac) * (1-frac);
else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
else a *= 0.5 * frac * frac;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# Cubic basis kernel template.
# BUGFIX: `CUDA_KERNEL_LOOP(idx, num_threads})` — the stray `}` is a CUDA
# syntax error that made the kernel fail to compile.
_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, ${Dtype}* amount, long* index,
const long* kernel_size, const long* is_open_spline, int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${k_max};
int k_idx = idx % ${k_max};
int K = ${K};
int k_idx_mod;
int pos;
${Dtype} value;
${Dtype} frac;
${Dtype} a = 1.0;
long i = 0;
for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
K /= kernel_size[d_idx];
k_idx_mod = k_idx % 4;
k_idx /= 4;
value = input[e_idx * ${dim} + d_idx] *
(kernel_size[d_idx] - (3 * is_open_spline[d_idx]));
frac = value - floor(value);
if (k_idx_mod == 0) a *= (1 - frac) * (1 - frac) * (1 - frac) / 6.0;
else if (k_idx_mod == 1)
a *= (3 * frac * frac * frac - 6 * frac * frac + 4) / 6.0;
else if (k_idx_mod == 2)
a *= (-3 * frac * frac * frac + 3 * frac * frac + 3 * frac + 1) / 6.0;
else a *= frac * frac * frac / 6.0;
pos = int(floor(value)) + k_idx_mod;
pos %= kernel_size[d_idx];
i += pos * K;
}
amount[idx] = a;
index[idx] = i;
}
}
'''
# Backward kernel for the linear basis: one thread per (edge, dimension);
# recovers each per-dimension factor by dividing the stored product `amount`
# by this dimension's factor (`residual`) and chains it with `grad_amount`.
# BUGFIXES vs. the original (all CUDA compile errors):
#  * missing `;` after the `quotient` and `a_idx` declarations,
#  * `residual` was never declared.
# NOTE(review): `residual` can be exactly 0, making the division produce
# inf/nan — an epsilon or branch may be needed; confirm against the math.
_spline_kernel_linear_backward = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
const ${Dtype}* input, const ${Dtype}* grad_amount, ${Dtype}* amount,
${Dtype}* grad_adj, const long* kernel_size, const long* is_open_spline,
int num_threads) {
CUDA_KERNEL_LOOP(idx, num_threads) {
const int e_idx = idx / ${dim};
int d_idx = idx % ${dim};
int K = ${K};
int k_idx_mod;
int bot;
int top;
${Dtype} value;
${Dtype} frac;
${Dtype} residual;
${Dtype} grad_out = 0.0;
long i = 0;
int quotient = (int)pow(2.0,(float)d_idx);
for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
k_idx_mod = (k_idx/quotient) % 2;
value = input[e_idx * ${dim} + d_idx];
value *= kernel_size[d_idx] - is_open_spline[d_idx];
frac = value - floor(value);
residual = (1 - k_idx_mod) * frac + k_idx_mod * (1 - frac);
int a_idx = e_idx*${k_max} + k_idx;
grad_out += grad_amount[a_idx]*amount[a_idx]/residual;
}
grad_adj[e_idx* ${dim} + d_idx] = grad_out;
}
}
'''
def get_basis_kernel(k_max, K, dim, degree, bt_to_adj=False):
    """Compile the forward B-spline basis kernel for `degree`.

    NOTE(review): `bt_to_adj` is currently unused here; it is kept for
    interface symmetry with the weighting-kernel factories.
    """
    by_degree = {3: _spline_kernel_cubic, 2: _spline_kernel_quadratic}
    source = by_degree.get(degree, _spline_kernel_linear)
    anchor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(anchor):
        return load_kernel(
            'spline_kernel',
            source,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
def get_basis_backward_kernel(k_max, K, dim, degree):
    """Compile the backward (gradient) B-spline basis kernel.

    Only the linear (degree-1) backward kernel exists. The original silently
    fell back to the *forward* quadratic/cubic kernels for degrees 2/3,
    whose parameter lists do not match the backward launch arguments —
    launching them would read/write the wrong buffers. Fail loudly instead.
    """
    if degree == 2 or degree == 3:
        raise NotImplementedError(
            'Basis backward kernel is only implemented for degree=1.')
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f = load_kernel(
            'spline_kernel',
            _spline_kernel_linear_backward,
            Dtype='float',
            k_max=k_max,
            dim=dim,
            K=K)
    return f
class SplineConvGPU(Function):
    """Fused basis computation + edgewise weighting with precompiled kernels.

    When `bp_to_adj` is set, the backward pass additionally produces
    gradients w.r.t. the adjacency (pseudo-coordinate) values by chaining
    the weighting gradient (`grad_amount`) through the basis backward
    kernel.

    BUGFIXES vs. the original:
      * non-bp backward returned only two gradients for three forward
        inputs (input, weight, adj_values); it now returns `None` for
        `adj_values`.
      * the basis backward launch sized `num_threads` from
        `grad_amount.numel()` (E * k_max), but the kernel iterates one
        thread per (edge, dim) element of `grad_adj` — the oversized launch
        wrote out of bounds. It now uses `grad_adj.numel()`.

    NOTE(review): `save_for_backward` is called with intermediates
    (`amount`, `index`) that are neither inputs nor outputs — legacy
    autograd may reject this; confirm on the targeted torch version.
    """

    def __init__(self, kernel_size, is_open_spline, K, degree,
                 basis_kernel, basis_backward_kernel,
                 weighting_kernel, weighting_backward_kernel,
                 bp_to_adj=False):
        super(SplineConvGPU, self).__init__()
        self.degree = degree
        self.f_weighting_fw = weighting_kernel
        self.f_weighting_bw = weighting_backward_kernel
        self.kernel_size = kernel_size
        self.is_open_spline = is_open_spline
        self.f_basis_fw = basis_kernel
        self.f_basis_bw = basis_backward_kernel
        self.bp_to_adj = bp_to_adj

    def forward(self, input, weight, adj_values):
        assert input.is_cuda and weight.is_cuda
        self.K, self.M_in, self.M_out = weight.size()

        # --- Compute B-spline basis tensor products ----------------------
        adj_values = adj_values.unsqueeze(1) if len(adj_values.size()) < 2 \
            else adj_values
        num_edges, dim = adj_values.size()
        k_max = 2 ** dim
        amount = adj_values.new(num_edges, k_max)
        index = adj_values.new(num_edges, k_max).long()
        num_threads = amount.numel()
        with torch.cuda.device_of(input):
            self.f_basis_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    adj_values.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(),
                    self.kernel_size.data_ptr(),
                    self.is_open_spline.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # --- Weight features ---------------------------------------------
        output = input.new(input.size(0), self.M_out)
        num_threads = output.numel()
        with torch.cuda.device_of(input):
            self.f_weighting_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # adj_values is only needed in backward when we chain into the basis.
        if self.bp_to_adj:
            self.save_for_backward(input, weight, adj_values, amount, index)
        else:
            self.save_for_backward(input, weight, amount, index)
        return output

    def backward(self, grad_output):
        grad_input = grad_output.new(grad_output.size(0), self.M_in).fill_(0)
        grad_weight = grad_output.new(self.K, self.M_in, self.M_out).fill_(0)
        num_threads = grad_output.numel()

        if self.bp_to_adj:
            input, weight, adj_values, amount, index = self.saved_tensors
            grad_amount = grad_output.new(amount.size(0),
                                          amount.size(1)).fill_(0)
            # Weighting backward: fills grad_input/grad_weight/grad_amount.
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        grad_amount.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))

            # Basis backward: one thread per (edge, dim) entry of grad_adj.
            grad_adj = grad_amount.new(grad_amount.size(0),
                                       self.kernel_size.size(0)).fill_(0)
            num_threads = grad_adj.numel()
            with torch.cuda.device_of(grad_amount):
                self.f_basis_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        adj_values.data_ptr(),
                        grad_amount.data_ptr(),
                        amount.data_ptr(),
                        grad_adj.data_ptr(),
                        self.kernel_size.data_ptr(),
                        self.is_open_spline.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))
            return grad_input, grad_weight, grad_adj
        else:
            input, weight, amount, index = self.saved_tensors
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))
            # Three forward inputs -> three gradients; adjacency gets None
            # when we do not backprop to it.
            return grad_input, grad_weight, None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment