import torch
from torch.autograd import Function, Variable

from ....utils.cuda import (cuda_num_threads, Stream, load_kernel,
                            kernel_loop, get_blocks)

# Forward weighting kernel: one thread per (edge, output channel) pair. Each
# thread accumulates `amount[e, k] * weight[index[e, k], m_in, m_out] *
# input[e, m_in]` over all k_max kernel corners and all M_in input channels.
_edgewise_spline_weighting_forward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_forward_kernel(
    const ${Dtype}* input, const ${Dtype}* weight, ${Dtype}* output,
    const ${Dtype}* amount, const long* index, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${M_out};
    const int m_out_idx = idx % ${M_out};

    ${Dtype} result = 0.0;
    ${Dtype} w;
    ${Dtype} f;
    int k;
    ${Dtype} b;
    long c;
    long w_idx;

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k = e_idx * ${k_max} + k_idx;
      b = amount[k];
      c = index[k];
      for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
        w_idx = c * ${M_out} * ${M_in} + m_in_idx * ${M_out} + m_out_idx;
        w = weight[w_idx];
        f = input[e_idx * ${M_in} + m_in_idx];
        result += b * w * f;
      }
    }
    output[idx] = result;
  }
}
'''

# Backward weighting kernel: computes gradients w.r.t. input and weight.
_edgewise_spline_weighting_backward_kernel = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
    const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
    const ${Dtype}* input, const ${Dtype}* weight, const ${Dtype}* amount,
    const long* index, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${M_out};
    const int m_out_idx = idx % ${M_out};

    ${Dtype} w;
    ${Dtype} g;
    ${Dtype} f;
    ${Dtype} w_grad;
    int k;
    ${Dtype} b;
    long c;
    long w_idx;

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k = e_idx * ${k_max} + k_idx;
      b = amount[k];
      c = index[k];

      for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
        w_idx = c * ${M_out} * ${M_in} + m_in_idx * ${M_out} + m_out_idx;
        w = weight[w_idx];

        // Calculate input gradient.
        g = grad_output[e_idx * ${M_out} + m_out_idx];
        atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
        // This is inefficient: `reduce_sum` shouldn't be done like this.
        // Looping over `M_out` would be better to avoid the `atomicAdd`.

        // Calculate weight gradient.
        f = input[e_idx * ${M_in} + m_in_idx];
        w_grad = f * b * g;
        atomicAdd(&(grad_weight[w_idx]), w_grad);
        // Not so efficient either, but not avoidable.
      }
    }
  }
}
'''

# Backward weighting kernel variant that additionally computes gradients
# w.r.t. the B-spline basis products (`grad_amount`), used when
# backpropagating to the adjacency values.
_edgewise_spline_weighting_backward_kernel_bp2adj = kernel_loop + '''
extern "C"
__global__ void edgewise_spline_weighting_backward_kernel(
    const ${Dtype}* grad_output, ${Dtype}* grad_input, ${Dtype}* grad_weight,
    ${Dtype}* grad_amount, const ${Dtype}* input, const ${Dtype}* weight,
    const ${Dtype}* amount, const long* index, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${M_out};
    const int m_out_idx = idx % ${M_out};

    ${Dtype} w;
    ${Dtype} g;
    ${Dtype} f;
    ${Dtype} w_grad;
    int k;
    ${Dtype} b;
    long c;
    long w_idx;

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k = e_idx * ${k_max} + k_idx;
      b = amount[k];
      c = index[k];
      ${Dtype} adj_g = 0.0;

      for (int m_in_idx = 0; m_in_idx < ${M_in}; m_in_idx++) {
        w_idx = c * ${M_out} * ${M_in} + m_in_idx * ${M_out} + m_out_idx;
        w = weight[w_idx];

        // Calculate input gradient.
        g = grad_output[e_idx * ${M_out} + m_out_idx];
        atomicAdd(&(grad_input[e_idx * ${M_in} + m_in_idx]), b * w * g);
        // This is inefficient: `reduce_sum` shouldn't be done like this.
        // Looping over `M_out` would be better to avoid the `atomicAdd`.

        // Calculate weight gradient.
        f = input[e_idx * ${M_in} + m_in_idx];
        w_grad = f * b * g;
        atomicAdd(&(grad_weight[w_idx]), w_grad);
        // Not so efficient either, but not avoidable.

        // Calculate B-spline basis tensor product gradient.
        adj_g += g * f * w;
      }
      atomicAdd(&(grad_amount[e_idx * ${k_max} + k_idx]), adj_g);
    }
  }
}
'''


def get_weighting_forward_kernel(M_in, M_out, k_max, dtype='float'):
    cuda_tensor = torch.FloatTensor([1]).cuda()
    kernel = _edgewise_spline_weighting_forward_kernel
    with torch.cuda.device_of(cuda_tensor):
        f_fw = load_kernel(
            'edgewise_spline_weighting_forward_kernel',
            kernel,
            Dtype=dtype,
            M_in=M_in,
            M_out=M_out,
            k_max=k_max)
    return f_fw


def get_weighting_backward_kernel(M_in, M_out, k_max, K, bp_to_adj=False,
                                  dtype='float'):
    cuda_tensor = torch.FloatTensor([1]).cuda()
    if bp_to_adj:
        kernel = _edgewise_spline_weighting_backward_kernel_bp2adj
    else:
        kernel = _edgewise_spline_weighting_backward_kernel
    with torch.cuda.device_of(cuda_tensor):
        f_bw = load_kernel(
            'edgewise_spline_weighting_backward_kernel',
            kernel,
            Dtype=dtype,
            M_in=M_in,
            M_out=M_out,
            k_max=k_max,
            K=K)
    return f_bw


# Basis kernels: one thread per (edge, kernel corner) pair. Each thread
# computes the B-spline tensor-product weight (`amount`) and the flattened
# index of the corresponding weight matrix (`index`).
_spline_kernel_linear = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, ${Dtype}* amount, long* index,
    const long* kernel_size, const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${k_max};
    int k_idx = idx % ${k_max};
    int K = ${K};
    int k_idx_mod;
    int bot;
    int top;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} a = 1.0;
    long i = 0;

    for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
      K /= kernel_size[d_idx];
      k_idx_mod = k_idx % 2;
      k_idx >>= 1;
      value = input[e_idx * ${dim} + d_idx];
      value *= kernel_size[d_idx] - is_open_spline[d_idx];
      frac = value - floor(value);
      a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
      bot = int(floor(value));
      top = (bot + 1) % kernel_size[d_idx];
      bot %= kernel_size[d_idx];
      i += ((1 - k_idx_mod) * bot + k_idx_mod * top) * K;
    }
    amount[idx] = a;
    index[idx] = i;
  }
}
'''

_spline_kernel_quadratic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, ${Dtype}* amount, long* index,
    const long* kernel_size, const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${k_max};
    int k_idx = idx % ${k_max};
    int K = ${K};
    int k_idx_mod;
    int pos;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} a = 1.0;
    long i = 0;

    for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
      K /= kernel_size[d_idx];
      k_idx_mod = k_idx % 3;
      k_idx /= 3;
      value = input[e_idx * ${dim} + d_idx] *
              (kernel_size[d_idx] - (2 * is_open_spline[d_idx]));
      frac = value - floor(value);

      if (k_idx_mod == 0) a *= 0.5 * (1 - frac) * (1 - frac);
      else if (k_idx_mod == 1) a *= -frac * frac + frac + 0.5;
      else a *= 0.5 * frac * frac;

      pos = int(floor(value)) + k_idx_mod;
      pos %= kernel_size[d_idx];
      i += pos * K;
    }
    amount[idx] = a;
    index[idx] = i;
  }
}
'''

_spline_kernel_cubic = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, ${Dtype}* amount, long* index,
    const long* kernel_size, const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${k_max};
    int k_idx = idx % ${k_max};
    int K = ${K};
    int k_idx_mod;
    int pos;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} a = 1.0;
    long i = 0;

    for (int d_idx = 0; d_idx < ${dim}; d_idx++) {
      K /= kernel_size[d_idx];
      k_idx_mod = k_idx % 4;
      k_idx /= 4;
      value = input[e_idx * ${dim} + d_idx] *
              (kernel_size[d_idx] - (3 * is_open_spline[d_idx]));
      frac = value - floor(value);
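

# For orientation, a pure-PyTorch sketch of what the forward kernel above
# computes (the helper name and its use are illustrative, not part of the
# original code). It mirrors the kernel's indexing scheme and can serve as
# a CPU reference when testing the CUDA path.
def _reference_weighting_forward(input, weight, amount, index):
    # input: [E, M_in], weight: [K, M_in, M_out],
    # amount: [E, k_max], index: [E, k_max] (long).
    E, k_max = index.size()
    # Gather the weight matrices of the k_max kernel corners per edge.
    w = weight[index.view(-1)].view(E, k_max, weight.size(1), weight.size(2))
    # Contract over M_in: f[e, k, m_out] = sum_m_in input[e, m_in] * w[...].
    f = (w * input.unsqueeze(1).unsqueeze(3)).sum(2)
    # Weight each corner by its B-spline amount and sum over the corners.
    return (amount.unsqueeze(2) * f).sum(1)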
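      if (k_idx_mod == 0) a *= (1 - frac) * (1 - frac) * (1 - frac) / 6.0;
      else if (k_idx_mod == 1)
        a *= (3 * frac * frac * frac - 6 * frac * frac + 4) / 6.0;
      else if (k_idx_mod == 2)
        a *= (-3 * frac * frac * frac + 3 * frac * frac + 3 * frac + 1) / 6.0;
      else a *= frac * frac * frac / 6.0;

      pos = int(floor(value)) + k_idx_mod;
      pos %= kernel_size[d_idx];
      i += pos * K;
    }
    amount[idx] = a;
    index[idx] = i;
  }
}
'''


# For reference and CPU-side testing, a minimal PyTorch sketch of the linear
# basis computation performed by `_spline_kernel_linear` above (the helper
# name is illustrative, not part of the original code). For each edge and
# each of the 2^dim kernel corners it yields the tensor-product basis weight
# (`amount`) and the flattened weight-matrix index (`index`).
def _reference_linear_basis(adj_values, kernel_size, is_open_spline):
    # adj_values: [E, dim] with entries in [0, 1];
    # kernel_size, is_open_spline: [dim] long tensors.
    E, dim = adj_values.size()
    k_max = 2**dim
    amount = adj_values.new(E, k_max).fill_(1)
    index = torch.LongTensor(E, k_max).zero_()
    for k in range(k_max):
        K = int(kernel_size.prod())
        bits = k
        for d in range(dim):
            ks = int(kernel_size[d])
            K //= ks
            bit = bits % 2  # 0: lower knot, 1: upper knot in dimension d.
            bits >>= 1
            value = adj_values[:, d] * (ks - int(is_open_spline[d]))
            frac = value - value.floor()
            amount[:, k] *= (1 - bit) * (1 - frac) + bit * frac
            bot = value.floor().long() % ks
            top = (value.floor().long() + 1) % ks
            index[:, k] += ((1 - bit) * bot + bit * top) * K
    return amount, index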
# Backward kernel for the linear basis, version 1: reuses the precomputed
# `amount` by dividing out the current dimension's factor. This is efficient
# but divides by zero whenever that factor vanishes, so it is numerically
# unstable. No solution for this yet; use the less efficient version 2.
_spline_kernel_linear_backward = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, const ${Dtype}* grad_amount, ${Dtype}* amount,
    ${Dtype}* grad_adj, const long* kernel_size,
    const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${dim};
    int d_idx = idx % ${dim};
    int k_idx_mod;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} grad_out = 0.0;
    int quotient = (int)pow(2.0, (double)d_idx);

    value = input[e_idx * ${dim} + d_idx];
    value *= kernel_size[d_idx] - is_open_spline[d_idx];
    frac = value - floor(value);

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k_idx_mod = (k_idx / quotient) % 2;
      int a_idx = e_idx * ${k_max} + k_idx;
      ${Dtype} residual = -(1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
      grad_out += grad_amount[a_idx] * amount[a_idx] / residual;
    }
    grad_adj[idx] = grad_out * (kernel_size[d_idx] - is_open_spline[d_idx]);
  }
}

/* Alternative per-corner computation kept for reference:
  ${Dtype} a = -(1 - k_idx_mod) + k_idx_mod;
  for (int d_it = 0; d_it < ${dim}; d_it++) {
    if (d_it != d_idx) {
      value = input[e_idx * ${dim} + d_it];
      value *= kernel_size[d_it] - is_open_spline[d_it];
      frac = value - floor(value);
      a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
    }
  }
  grad_out += a * grad_amount[a_idx];
*/
'''

# Backward kernel for the linear basis, version 2: recomputes the basis
# product from scratch for each dimension instead of reusing `amount`.
# Less efficient, but numerically stable.
_spline_kernel_linear_backward2 = kernel_loop + '''
extern "C"
__global__ void spline_kernel(
    const ${Dtype}* input, const ${Dtype}* grad_amount, ${Dtype}* amount,
    ${Dtype}* grad_adj, const long* kernel_size,
    const long* is_open_spline, int num_threads) {

  CUDA_KERNEL_LOOP(idx, num_threads) {
    const int e_idx = idx / ${dim};
    int d_idx = idx % ${dim};
    int k_idx_mod;
    ${Dtype} value;
    ${Dtype} frac;
    ${Dtype} grad_out = 0.0;
    int quotient = (int)pow(2.0, (double)d_idx);

    for (int k_idx = 0; k_idx < ${k_max}; k_idx++) {
      k_idx_mod = (k_idx / quotient) % 2;
      int a_idx = e_idx * ${k_max} + k_idx;
      ${Dtype} a = -(1 - k_idx_mod) + k_idx_mod;

      for (int d_it = 0; d_it < ${dim}; d_it++) {
        if (d_it != d_idx) {
          int quotient = (int)pow(2.0, (double)d_it);
          k_idx_mod = (k_idx / quotient) % 2;
          value = input[e_idx * ${dim} + d_it];
          value *= kernel_size[d_it] - is_open_spline[d_it];
          frac = value - floor(value);
          a *= (1 - k_idx_mod) * (1 - frac) + k_idx_mod * frac;
        }
      }
      grad_out += a * grad_amount[a_idx];
    }
    grad_adj[idx] = grad_out * (kernel_size[d_idx] - is_open_spline[d_idx]);
  }
}
'''


def get_basis_kernel(k_max, K, dim, degree, dtype='float'):
    if degree == 3:
        _spline_kernel = _spline_kernel_cubic
    elif degree == 2:
        _spline_kernel = _spline_kernel_quadratic
    else:
        _spline_kernel = _spline_kernel_linear
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=dtype,
            k_max=k_max,
            dim=dim,
            K=K)
    return f


def get_basis_backward_kernel(k_max, K, dim, degree, dtype='float'):
    # Backward kernels for higher degrees are not implemented yet; the
    # stable version 2 of the linear backward kernel is used for all degrees.
    _spline_kernel = _spline_kernel_linear_backward2
    cuda_tensor = torch.FloatTensor([1]).cuda()
    with torch.cuda.device_of(cuda_tensor):
        f = load_kernel(
            'spline_kernel',
            _spline_kernel,
            Dtype=dtype,
            k_max=k_max,
            dim=dim,
            K=K)
    return f
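
# A minimal finite-difference sanity check for the linear basis backward
# pass (hypothetical helper, for testing only; assumes the reference
# implementation `_reference_linear_basis` sketched above). The analytic
# `grad_adj` produced by the backward kernel should approximately match the
# numerical gradient computed here for a given upstream `grad_amount`.
def _fd_check_linear_basis(adj_values, kernel_size, is_open_spline,
                           grad_amount, eps=1e-4):
    amount0, _ = _reference_linear_basis(adj_values, kernel_size,
                                         is_open_spline)
    grad = adj_values.new(adj_values.size()).fill_(0)
    for d in range(adj_values.size(1)):
        shifted = adj_values.clone()
        shifted[:, d] += eps  # Perturb a single adjacency coordinate.
        amount1, _ = _reference_linear_basis(shifted, kernel_size,
                                             is_open_spline)
        # Contract the change in basis products with the upstream gradient.
        grad[:, d] = ((amount1 - amount0) * grad_amount).sum(1) / eps
    return grad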
class SplineConvGPU(Function):
    def __init__(self, kernel_size, is_open_spline, K, degree, basis_kernel,
                 basis_backward_kernel, weighting_kernel,
                 weighting_backward_kernel, bp_to_adj=False, adj_values=None):
        super(SplineConvGPU, self).__init__()
        self.degree = degree
        self.f_weighting_fw = weighting_kernel
        self.f_weighting_bw = weighting_backward_kernel
        self.kernel_size = kernel_size
        self.is_open_spline = is_open_spline
        self.f_basis_fw = basis_kernel
        self.f_basis_bw = basis_backward_kernel
        self.bp_to_adj = bp_to_adj
        self.adj_values = adj_values

    def forward(self, input, weight, adj_values=None):
        assert input.is_cuda and weight.is_cuda
        self.K, self.M_in, self.M_out = weight.size()

        # If we don't backpropagate to the adjacency values, fall back to
        # the ones passed at construction time.
        if adj_values is None:
            adj_values = self.adj_values

        adj_values = adj_values.unsqueeze(1) if len(adj_values.size()) < 2 \
            else adj_values

        if self.bp_to_adj:
            self.save_for_backward(input, weight, adj_values)
            # adj_values = torch.clamp(adj_values, min=0.0, max=1.0)
        else:
            self.save_for_backward(input, weight)

        # Compute B-spline basis tensor products.
        num_edges, dim = adj_values.size()
        k_max = (self.degree + 1)**dim
        amount = adj_values.new(num_edges, k_max)
        index = adj_values.new(num_edges, k_max).long()
        num_threads = amount.numel()
        with torch.cuda.device_of(input):
            self.f_basis_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    adj_values.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(),
                    self.kernel_size.data_ptr(),
                    self.is_open_spline.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # Weight features.
        output = input.new(input.size(0), self.M_out)
        num_threads = output.numel()
        with torch.cuda.device_of(input):
            self.f_weighting_fw(
                block=(cuda_num_threads, 1, 1),
                grid=(get_blocks(num_threads), 1, 1),
                args=[
                    input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr(),
                    amount.data_ptr(),
                    index.data_ptr(), num_threads
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        self.amount = amount
        self.index = index
        return output

    def backward(self, grad_output):
        grad_input = grad_output.new(grad_output.size(0), self.M_in).fill_(0)
        grad_weight = grad_output.new(self.K, self.M_in, self.M_out).fill_(0)
        num_threads = grad_output.numel()

        if self.bp_to_adj:
            if self.degree == 2 or self.degree == 3:
                print('Backward to u for degree > 1 not implemented!')
                raise NotImplementedError

            input, weight, adj_values = self.saved_tensors
            # adj_values = torch.clamp(adj_values, min=0.0, max=1.0)
            amount = self.amount
            index = self.index
            grad_amount = grad_output.new(amount.size(0),
                                          amount.size(1)).fill_(0)
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        grad_amount.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))

            grad_adj = grad_amount.new(grad_amount.size(0),
                                       self.kernel_size.size(0)).fill_(0)
            num_threads = grad_adj.numel()
            with torch.cuda.device_of(grad_amount):
                self.f_basis_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        adj_values.data_ptr(),
                        grad_amount.data_ptr(),
                        amount.data_ptr(),
                        grad_adj.data_ptr(),
                        self.kernel_size.data_ptr(),
                        self.is_open_spline.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))

            return grad_input, grad_weight, grad_adj
        else:
            input, weight = self.saved_tensors
            amount = self.amount
            index = self.index
            # The plain backward kernel takes no `grad_amount` argument.
            with torch.cuda.device_of(grad_output):
                self.f_weighting_bw(
                    block=(cuda_num_threads, 1, 1),
                    grid=(get_blocks(num_threads), 1, 1),
                    args=[
                        grad_output.data_ptr(),
                        grad_input.data_ptr(),
                        grad_weight.data_ptr(),
                        input.data_ptr(),
                        weight.data_ptr(),
                        amount.data_ptr(),
                        index.data_ptr(), num_threads
                    ],
                    stream=Stream(
                        ptr=torch.cuda.current_stream().cuda_stream))

            return grad_input, grad_weight, None
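

# A minimal usage sketch (shapes, sizes and the call pattern are
# illustrative assumptions, and the exact calling convention depends on the
# legacy autograd Function API used above, where one instance is created
# per forward pass).
if __name__ == '__main__':
    E, M_in, M_out, dim, degree = 16, 4, 8, 2, 1
    kernel_size = torch.cuda.LongTensor([5, 5])
    is_open_spline = torch.cuda.LongTensor([1, 1])
    K = 5 * 5
    k_max = (degree + 1)**dim

    op = SplineConvGPU(
        kernel_size, is_open_spline, K, degree,
        basis_kernel=get_basis_kernel(k_max, K, dim, degree),
        basis_backward_kernel=get_basis_backward_kernel(
            k_max, K, dim, degree),
        weighting_kernel=get_weighting_forward_kernel(M_in, M_out, k_max),
        weighting_backward_kernel=get_weighting_backward_kernel(
            M_in, M_out, k_max, K))

    input = Variable(
        torch.cuda.FloatTensor(E, M_in).uniform_(), requires_grad=True)
    weight = Variable(
        torch.cuda.FloatTensor(K, M_in, M_out).uniform_(), requires_grad=True)
    adj_values = Variable(torch.cuda.FloatTensor(E, dim).uniform_())

    output = op(input, weight, adj_values)  # [E, M_out]
    output.sum().backward()  # Populates input.grad and weight.grad.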