Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
#ifndef CA_CUDA_KERNEL_CUH
#define CA_CUDA_KERNEL_CUH
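// The "ca" kernels below implement criss-cross attention: for each spatial
// location (x, y), ca_forward_kernel accumulates the affinity between the
// query map t and the key map f over every position in the same row and the
// same column, giving height + width - 1 attention weights per location.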
template <typename T>
__global__ void ca_forward_kernel(const T *t, const T *f, T *weight, int num,
int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int z = blockIdx.z;
if (x < width && y < height && z < height + width - 1) {
for (int batch = 0; batch < num; ++batch) {
for (int plane = 0; plane < chn; ++plane) {
T _t = t[(batch * chn + plane) * sp + y * width + x];
if (z < width) {
int i = z;
T _f = f[(batch * chn + plane) * sp + y * width + i];
weight[(batch * len + i) * sp + y * width + x] += _t * _f;
} else {
int i = z - width;
int j = i < y ? i : i + 1;
T _f = f[(batch * chn + plane) * sp + j * width + x];
weight[(batch * len + width + i) * sp + y * width + x] += _t * _f;
}
}
}
}
}
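// Backward pass of the affinity computation: ca_backward_kernel_t accumulates
// dw * f into the gradient of t, and ca_backward_kernel_f (further below)
// accumulates dw * t into the gradient of f, walking the same row/column
// pattern as the forward kernel.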
template <typename T>
__global__ void ca_backward_kernel_t(const T *dw, const T *t, const T *f, T *dt,
int num, int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _dw = dw[(batch * len + i) * sp + y * width + x];
T _f = f[(batch * chn + plane) * sp + y * width + i];
dt[(batch * chn + plane) * sp + y * width + x] += _dw * _f;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i < y ? i : i - 1;
T _dw = dw[(batch * len + width + j) * sp + y * width + x];
T _f = f[(batch * chn + plane) * sp + i * width + x];
dt[(batch * chn + plane) * sp + y * width + x] += _dw * _f;
}
}
}
}
template <typename T>
__global__ void ca_backward_kernel_f(const T *dw, const T *t, const T *f, T *df,
int num, int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _dw = dw[(batch * len + x) * sp + y * width + i];
T _t = t[(batch * chn + plane) * sp + y * width + i];
df[(batch * chn + plane) * sp + y * width + x] += _dw * _t;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i > y ? y : y - 1;
T _dw = dw[(batch * len + width + j) * sp + i * width + x];
T _t = t[(batch * chn + plane) * sp + i * width + x];
df[(batch * chn + plane) * sp + y * width + x] += _dw * _t;
}
}
}
}
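// Aggregation step: ca_map_forward_kernel forms the attention output as the
// weight-weighted sum of g over the criss-cross neighbourhood; the two
// ca_map_backward kernels propagate gradients to weight and g respectively.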
template <typename T>
__global__ void ca_map_forward_kernel(const T *weight, const T *g, T *out,
int num, int chn, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _g = g[(batch * chn + plane) * sp + y * width + i];
T _w = weight[(batch * len + i) * sp + y * width + x];
out[(batch * chn + plane) * sp + y * width + x] += _g * _w;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i < y ? i : i - 1;
T _g = g[(batch * chn + plane) * sp + i * width + x];
T _w = weight[(batch * len + width + j) * sp + y * width + x];
out[(batch * chn + plane) * sp + y * width + x] += _g * _w;
}
}
}
}
template <typename T>
__global__ void ca_map_backward_kernel_w(const T *dout, const T *weight,
const T *g, T *dw, int num, int chn,
int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int z = blockIdx.z;
if (x < width && y < height && z < height + width - 1) {
for (int batch = 0; batch < num; ++batch) {
for (int plane = 0; plane < chn; ++plane) {
T _dout = dout[(batch * chn + plane) * sp + y * width + x];
if (z < width) {
int i = z;
T _g = g[(batch * chn + plane) * sp + y * width + i];
dw[(batch * len + i) * sp + y * width + x] += _dout * _g;
} else {
int i = z - width;
int j = i < y ? i : i + 1;
T _g = g[(batch * chn + plane) * sp + j * width + x];
dw[(batch * len + width + i) * sp + y * width + x] += _dout * _g;
}
}
}
}
}
template <typename T>
__global__ void ca_map_backward_kernel_g(const T *dout, const T *weight,
const T *g, T *dg, int num, int chn,
int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int sp = height * width;
int len = height + width - 1;
int plane = blockIdx.z;
if (x < width && y < height && plane < chn) {
for (int batch = 0; batch < num; ++batch) {
for (int i = 0; i < width; ++i) {
T _dout = dout[(batch * chn + plane) * sp + y * width + i];
T _w = weight[(batch * len + x) * sp + y * width + i];
dg[(batch * chn + plane) * sp + y * width + x] += _dout * _w;
}
for (int i = 0; i < height; ++i) {
if (i == y) continue;
int j = i > y ? y : y - 1;
T _dout = dout[(batch * chn + plane) * sp + i * width + x];
T _w = weight[(batch * len + width + j) * sp + i * width + x];
dg[(batch * chn + plane) * sp + y * width + x] += _dout * _w;
}
}
}
}
#endif
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER
#include <cuda.h>
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512
inline int GET_BLOCKS(const int N) {
int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
int max_block_num = 4096;
return min(optimal_block_num, max_block_num);
}
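// Usage sketch (illustrative only, not part of the original header): every
// kernel in this commit follows the same grid-stride pattern and is launched
// with GET_BLOCKS / THREADS_PER_BLOCK. The kernel and launcher names below
// are hypothetical.
template <typename T>
__global__ void example_scale_kernel(const int n, const T* in, T* out,
                                     const T alpha) {
  // Each thread strides over the flattened tensor, so any grid size is valid.
  CUDA_1D_KERNEL_LOOP(i, n) { out[i] = alpha * in[i]; }
}

template <typename T>
void example_scale_launcher(const int n, const T* in, T* out, const T alpha,
                            cudaStream_t stream) {
  example_scale_kernel<T>
      <<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(n, in, out, alpha);
}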
template <typename T>
__device__ T bilinear_interpolate(const T* input, const int height,
const int width, T y, T x,
const int index /* index for debug only*/) {
// deal with cases where the sampled locations are outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[y_low * width + x_low];
T v2 = input[y_low * width + x_high];
T v3 = input[y_high * width + x_low];
T v4 = input[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
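// bilinear_interpolate_gradient below mirrors the sampling above: instead of
// returning the interpolated value, it returns the four corner indices
// (x_low/x_high, y_low/y_high) and their weights
// w1 = (1-ly)(1-lx), w2 = (1-ly)lx, w3 = ly(1-lx), w4 = ly*lx,
// so a backward kernel can scatter the upstream gradient into the corners
// with atomicAdd.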
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
int& x_low, int& x_high, int& y_low, int& y_high,
const int index /* index for debug only*/) {
// deal with cases where the sampled locations are outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = input[y_low * width + x_low];
// T v2 = input[y_low * width + x_high];
// T v3 = input[y_high * width + x_low];
// T v4 = input[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
#endif // COMMON_CUDA_HELPER
/*!
******************* BEGIN Caffe Copyright Notice and Disclaimer
*****************
*
* COPYRIGHT
*
* All contributions by the University of California:
* Copyright (c) 2014-2017 The Regents of the University of California (Regents)
* All rights reserved.
*
* All other contributions:
* Copyright (c) 2014-2017, the respective contributors
* All rights reserved.
*
* Caffe uses a shared copyright model: each contributor holds copyright over
* their contributions to Caffe. The project versioning records all such
* contribution and copyright details. If a contributor wants to further mark
* their specific copyright on a particular contribution, they should indicate
* their copyright solely in the commit message of the change when it is
* committed.
*
* LICENSE
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
*this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
*AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
*IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
*FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
*DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
*CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
*OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
*OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* CONTRIBUTION AGREEMENT
*
* By contributing to the BVLC/caffe repository through pull-request, comment,
* or otherwise, the contributor releases their content to the
* license and copyright terms herein.
*
***************** END Caffe Copyright Notice and Disclaimer
*********************
*
* Copyright (c) 2018 Microsoft
* Licensed under The MIT License [see LICENSE for details]
* \file modulated_deformable_im2col.cuh
* \brief Function definitions of converting an image to
* column matrix based on kernel, padding, dilation, and offset.
* These functions are mainly used in deformable convolution operators.
* \ref: https://arxiv.org/abs/1703.06211
* \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
*/
// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
#ifndef DEFORM_CONV_KERNEL_CUH
#define DEFORM_CONV_KERNEL_CUH
template <typename T>
__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
const int height, const int width, T h,
T w) {
if (h <= -1 || height <= h || w <= -1 || width <= w) {
return 0;
}
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
const int w, const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
const int width, const T *im_data,
const int data_width, const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
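// deformable_im2col_gpu_kernel: one thread per element of the output column
// buffer. For kernel tap (i, j) the sampling point is the regular grid
// location (h_in + i * dilation_h, w_in + j * dilation_w) shifted by the
// learned offset read from data_offset, and the value is fetched with
// bilinear interpolation (zero outside the image).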
template <typename T>
__global__ void deformable_im2col_gpu_kernel(
const int n, const T *data_im, const T *data_offset, const int height,
const int width, const int kernel_h, const int kernel_w, const int pad_h,
const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
CUDA_1D_KERNEL_LOOP(index, n) {
// `index` is the linear index into the output (column) matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
h_im, w_im);
*data_col_ptr = val;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
__global__ void deformable_col2im_gpu_kernel(
const int n, const T *data_col, const T *data_offset, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index];
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename T>
__global__ void deformable_col2im_coord_gpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int offset_channels, const int deformable_group, const int height_col,
const int width_col, T *grad_offset) {
CUDA_1D_KERNEL_LOOP(index, n) {
T val = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos];
cnt += 1;
}
grad_offset[index] = val;
}
}
#endif
#ifndef DEFORM_POOL_KERNEL_CUH
#define DEFORM_POOL_KERNEL_CUH
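// Deformable RoI pooling: each output bin averages bilinear samples taken on
// a roi_bin_grid_h x roi_bin_grid_w grid. When an offset tensor is provided,
// the bin's start coordinates are shifted by gamma * (roi size) * offset
// before sampling.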
template <typename T>
__global__ void deform_roi_pool_forward_cuda_kernel(
const int nthreads, const T* input, const T* rois, const T* offset,
T* output, const int pooled_height, const int pooled_width,
const T spatial_scale, const int sampling_ratio, const T gamma,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_input =
input + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral pooling
int roi_bin_grid_h =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_height / pooled_height));
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_width / pooled_width));
// Compute roi offset
if (offset != NULL) {
const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
ph * pooled_width + pw;
T offset_roi_w = gamma * roi_width * offset_cur_w[0];
T offset_roi_h =
gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
roi_start_w += offset_roi_w;
roi_start_h += offset_roi_h;
}
// We do average pooling inside a bin
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(offset_input, height, width, y, x, index);
output_val += val;
}
}
output[index] = output_val / count;
}
}
template <typename T>
__global__ void deform_roi_pool_backward_cuda_kernel(
const int nthreads, const T* grad_output, const T* input, const T* rois,
const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
const int pooled_width, const T spatial_scale, const int sampling_ratio,
const T gamma, const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
const T* offset_input =
input + ((roi_batch_ind * channels + c) * height * width);
T* offset_grad_input =
grad_input + ((roi_batch_ind * channels + c) * height * width);
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral pooling
int roi_bin_grid_h =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_height / pooled_height));
int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceil(roi_width / pooled_width));
// Compute roi offset
if (offset != NULL) {
const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
ph * pooled_width + pw;
T offset_roi_w = gamma * roi_width * offset_cur_w[0];
T offset_roi_h =
gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
roi_start_w += offset_roi_w;
roi_start_h += offset_roi_h;
}
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
const T grad_output_this_bin = grad_output[index] / count;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_grad_input + y_low * width + x_low,
grad_output_this_bin * w1);
atomicAdd(offset_grad_input + y_low * width + x_high,
grad_output_this_bin * w2);
atomicAdd(offset_grad_input + y_high * width + x_low,
grad_output_this_bin * w3);
atomicAdd(offset_grad_input + y_high * width + x_high,
grad_output_this_bin * w4);
if (offset != NULL) {
T input_00 = offset_input[y_low * width + x_low];
T input_10 = offset_input[y_low * width + x_high];
T input_01 = offset_input[y_high * width + x_low];
T input_11 = offset_input[y_high * width + x_high];
T ogx = gamma * roi_width * grad_output_this_bin *
(input_11 * (y - y_low) + input_10 * (y_high - y) +
input_01 * (y_low - y) + input_00 * (y - y_high));
T ogy = gamma * roi_height * grad_output_this_bin *
(input_11 * (x - x_low) + input_01 * (x_high - x) +
input_10 * (x_low - x) + input_00 * (x - x_high));
atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
ph * pooled_width + pw,
ogx);
atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
pooled_width * pooled_height + ph * pooled_width + pw,
ogy);
}
}
}
}
}
}
#endif
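// Masked im2col / col2im: only the spatial positions listed in mask_h_idx /
// mask_w_idx are expanded into columns (MaskedIm2colForward) or written back
// to the image (MaskedCol2imForward), so a convolution can be evaluated at
// the masked locations only.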
template <typename scalar_t>
__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
const int height, const int width,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int64_t *mask_h_idx,
const int64_t *mask_w_idx,
const int mask_cnt, scalar_t *data_col) {
// mask_cnt * channels
CUDA_1D_KERNEL_LOOP(index, n) {
const int m_index = index % mask_cnt;
const int h_col = mask_h_idx[m_index];
const int w_col = mask_w_idx[m_index];
const int c_im = index / mask_cnt;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col - pad_h;
const int w_offset = w_col - pad_w;
scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;
for (int i = 0; i < kernel_h; ++i) {
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j) {
int w_im = w_offset + j;
if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
*data_col_ptr =
(scalar_t)data_im[(c_im * height + h_im) * width + w_im];
} else {
*data_col_ptr = 0.0;
}
data_col_ptr += mask_cnt;
}
}
}
}
template <typename scalar_t>
__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
const int height, const int width,
const int channels,
const int64_t *mask_h_idx,
const int64_t *mask_w_idx,
const int mask_cnt, scalar_t *data_im) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int m_index = index % mask_cnt;
const int h_im = mask_h_idx[m_index];
const int w_im = mask_w_idx[m_index];
const int c_im = index / mask_cnt;
// compute the start and end of the output
data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
}
}
#ifndef MODULATED_DEFORM_CONV_KERNEL_CUH
#define MODULATED_DEFORM_CONV_KERNEL_CUH
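// Modulated deformable convolution (DCNv2) kernels: same structure as the
// plain deformable kernels above, with a per-sample modulation scalar read
// from data_mask and multiplied into every im2col value and gradient, plus an
// extra grad_mask output in the coordinate-gradient kernel.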
template <typename T>
__device__ T dmcn_im2col_bilinear(const T *input, const int data_width,
const int height, const int width, T h, T w) {
int h_low = floor(h);
int w_low = floor(w);
int h_high = h_low + 1;
int w_high = w_low + 1;
T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh, hw = 1 - lw;
T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
v2 = input[h_low * data_width + w_high];
T v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
v3 = input[h_high * data_width + w_low];
T v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
v4 = input[h_high * data_width + w_high];
T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h,
const int w, const int height,
const int width) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (h == argmax_h_low && w == argmax_w_low)
weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
if (h == argmax_h_low && w == argmax_w_high)
weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
if (h == argmax_h_high && w == argmax_w_low)
weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
if (h == argmax_h_high && w == argmax_w_high)
weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
return weight;
}
template <typename T>
__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w,
const int height, const int width,
const T *im_data, const int data_width,
const int bp_dir) {
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
argmax_w >= width) {
// empty
return 0;
}
int argmax_h_low = floor(argmax_h);
int argmax_w_low = floor(argmax_w);
int argmax_h_high = argmax_h_low + 1;
int argmax_w_high = argmax_w_low + 1;
T weight = 0;
if (bp_dir == 0) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += -1 * (argmax_w - argmax_w_low) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += (argmax_w_low + 1 - argmax_w) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_w - argmax_w_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
} else if (bp_dir == 1) {
if (argmax_h_low >= 0 && argmax_w_low >= 0)
weight += -1 * (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_low];
if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
weight += (argmax_h_low + 1 - argmax_h) *
im_data[argmax_h_low * data_width + argmax_w_high];
if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
weight += -1 * (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_low];
if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
weight += (argmax_h - argmax_h_low) *
im_data[argmax_h_high * data_width + argmax_w_high];
}
return weight;
}
template <typename T>
__global__ void modulated_deformable_im2col_gpu_kernel(
const int n, const T *data_im, const T *data_offset, const T *data_mask,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int num_channels, const int deformable_group, const int height_col,
const int width_col, T *data_col) {
CUDA_1D_KERNEL_LOOP(index, n) {
// `index` is the linear index into the output (column) matrix
const int w_col = index % width_col;
const int h_col = (index / width_col) % height_col;
const int b_col = (index / width_col / height_col) % batch_size;
const int c_im = (index / width_col / height_col) / batch_size;
const int c_col = c_im * kernel_h * kernel_w;
// compute deformable group index
const int deformable_group_index = c_im / channel_per_deformable_group;
const int h_in = h_col * stride_h - pad_h;
const int w_in = w_col * stride_w - pad_w;
T *data_col_ptr =
data_col +
((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
const T *data_im_ptr =
data_im + (b_col * num_channels + c_im) * height * width;
const T *data_offset_ptr =
data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b_col * deformable_group + deformable_group_index) *
kernel_h * kernel_w * height_col * width_col;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
w_col;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T val = static_cast<T>(0);
const T h_im = h_in + i * dilation_h + offset_h;
const T w_im = w_in + j * dilation_w + offset_w;
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im,
w_im);
*data_col_ptr = val * mask;
data_col_ptr += batch_size * height_col * width_col;
}
}
}
}
template <typename T>
__global__ void modulated_deformable_col2im_gpu_kernel(
const int n, const T *data_col, const T *data_offset, const T *data_mask,
const int channels, const int height, const int width, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
const int stride_w, const int dilation_h, const int dilation_w,
const int channel_per_deformable_group, const int batch_size,
const int deformable_group, const int height_col, const int width_col,
T *grad_im) {
CUDA_1D_KERNEL_LOOP(index, n) {
const int j = (index / width_col / height_col / batch_size) % kernel_w;
const int i =
(index / width_col / height_col / batch_size / kernel_w) % kernel_h;
const int c =
index / width_col / height_col / batch_size / kernel_w / kernel_h;
// compute the start and end of the output
const int deformable_group_index = c / channel_per_deformable_group;
int w_out = index % width_col;
int h_out = (index / width_col) % height_col;
int b = (index / width_col / height_col) % batch_size;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int data_offset_h_ptr =
((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
const int data_offset_w_ptr =
((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
const int data_mask_hw_ptr =
((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
const T cur_top_grad = data_col[index] * mask;
const int cur_h = (int)cur_inv_h_data;
const int cur_w = (int)cur_inv_w_data;
for (int dy = -2; dy <= 2; dy++) {
for (int dx = -2; dx <= 2; dx++) {
if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
abs(cur_inv_w_data - (cur_w + dx)) < 1) {
int cur_bottom_grad_pos =
((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
T weight =
dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
cur_h + dy, cur_w + dx, height, width);
atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
}
}
}
}
}
template <typename T>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(
const int n, const T *data_col, const T *data_im, const T *data_offset,
const T *data_mask, const int channels, const int height, const int width,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int channel_per_deformable_group,
const int batch_size, const int offset_channels, const int deformable_group,
const int height_col, const int width_col, T *grad_offset, T *grad_mask) {
CUDA_1D_KERNEL_LOOP(index, n) {
T val = 0, mval = 0;
int w = index % width_col;
int h = (index / width_col) % height_col;
int c = (index / width_col / height_col) % offset_channels;
int b = (index / width_col / height_col) / offset_channels;
// compute the start and end of the output
const int deformable_group_index = c / (2 * kernel_h * kernel_w);
const int col_step = kernel_h * kernel_w;
int cnt = 0;
const T *data_col_ptr = data_col + deformable_group_index *
channel_per_deformable_group *
batch_size * width_col * height_col;
const T *data_im_ptr =
data_im + (b * deformable_group + deformable_group_index) *
channel_per_deformable_group / kernel_h / kernel_w *
height * width;
const T *data_offset_ptr =
data_offset + (b * deformable_group + deformable_group_index) * 2 *
kernel_h * kernel_w * height_col * width_col;
const T *data_mask_ptr =
data_mask + (b * deformable_group + deformable_group_index) * kernel_h *
kernel_w * height_col * width_col;
const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
col_c += col_step) {
const int col_pos =
(((col_c * batch_size + b) * height_col) + h) * width_col + w;
const int bp_dir = offset_c % 2;
int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
int i =
(col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
int w_out = col_pos % width_col;
int h_out = (col_pos / width_col) % height_col;
int w_in = w_out * stride_w - pad_w;
int h_in = h_out * stride_h - pad_h;
const int data_offset_h_ptr =
(((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
const int data_offset_w_ptr =
(((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
w_out);
const int data_mask_hw_ptr =
(((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
const T offset_h = data_offset_ptr[data_offset_h_ptr];
const T offset_w = data_offset_ptr[data_offset_w_ptr];
const T mask = data_mask_ptr[data_mask_hw_ptr];
T inv_h = h_in + i * dilation_h + offset_h;
T inv_w = w_in + j * dilation_w + offset_w;
if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
inv_h = inv_w = -2;
else
mval += data_col_ptr[col_pos] *
dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width,
height, width, inv_h, inv_w);
const T weight = dmcn_get_coordinate_weight(
inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
width, bp_dir);
val += weight * data_col_ptr[col_pos] * mask;
cnt += 1;
}
// KERNEL_ASSIGN(grad_offset[index], offset_req, val);
grad_offset[index] = val;
if (offset_c % 2 == 0)
// KERNEL_ASSIGN(grad_mask[(((b * deformable_group +
// deformable_group_index) * kernel_h * kernel_w + offset_c / 2) *
// height_col + h) * width_col + w], mask_req, mval);
grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
kernel_w +
offset_c / 2) *
height_col +
h) *
width_col +
w] = mval;
}
}
#endif
#ifndef NMS_KERNEL_CUH
#define NMS_KERNEL_CUH
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long int) * 8;
__device__ inline bool devIoU(float const *const a, float const *const b,
const int offset, const float threshold) {
float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]);
float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]);
float width = fmaxf(right - left + offset, 0.f),
height = fmaxf(bottom - top + offset, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset);
float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset);
return interS > threshold * (Sa + Sb - interS);
}
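// nms_cuda: boxes are tiled into blocks of threadsPerBlock (= 64) entries.
// Block (col_start, row_start) loads its "column" boxes into shared memory;
// each thread owns one "row" box and sets bit i of a 64-bit word whenever the
// IoU with column box i exceeds iou_threshold. The per-box masks written to
// dev_mask are reduced on the host to decide which boxes survive.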
__global__ void nms_cuda(const int n_boxes, const float iou_threshold,
const int offset, const float *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
const int tid = threadIdx.x;
if (row_start > col_start) return;
const int row_size =
fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 4];
if (tid < col_size) {
block_boxes[tid * 4 + 0] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0];
block_boxes[tid * 4 + 1] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1];
block_boxes[tid * 4 + 2] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2];
block_boxes[tid * 4 + 3] =
dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3];
}
__syncthreads();
if (tid < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + tid;
const float *cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long int t = 0;
int start = 0;
if (row_start == col_start) {
start = tid + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) {
t |= 1ULL << i;
}
}
dev_mask[cur_box_idx * gridDim.y + col_start] = t;
}
}
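// Host-side selection sketch (an assumption for illustration; the binding
// that consumes dev_mask is not shown in this file). Boxes are visited in
// score order and kept only if no previously kept box has marked them as
// overlapping.
#include <vector>
inline std::vector<int> nms_select_kept_boxes(
    const unsigned long long* mask_host, int n_boxes) {
  const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
  std::vector<unsigned long long> removed(col_blocks, 0ULL);
  std::vector<int> keep;
  for (int i = 0; i < n_boxes; ++i) {
    const int block = i / threadsPerBlock;
    const int bit = i % threadsPerBlock;
    if (!(removed[block] & (1ULL << bit))) {
      keep.push_back(i);
      // Merge the suppression mask of the kept box into the running mask.
      const unsigned long long* row = mask_host + i * col_blocks;
      for (int j = block; j < col_blocks; ++j) removed[j] |= row[j];
    }
  }
  return keep;
}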
#endif
#include "parrots_cpp_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream);
void bbox_overlaps_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int mode, offset;
bool aligned;
SSAttrs(attr)
.get<int>("mode", mode)
.get<bool>("aligned", aligned)
.get<int>("offset", offset)
.done();
const auto& bboxes1 = ins[0];
const auto& bboxes2 = ins[1];
auto& ious = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset,
stream);
}
PARROTS_EXTENSION_REGISTER(bbox_overlaps)
.attr("mode")
.attr("aligned")
.attr("offset")
.input(2)
.output(1)
.apply(bbox_overlaps_cuda)
.done();
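// Binding pattern used throughout the Parrots glue code in this commit: each
// extension declares its scalar attributes with .attr(), the number of input
// and output DArrayLite tensors with .input() / .output(), and binds the host
// entry point with .apply(). The host function unpacks the attributes via
// SSAttrs, fetches the CUDA stream from the context, and forwards everything
// to the corresponding *KernelLauncher.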
#include "bbox_overlaps_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void BBoxOverlapsCUDAKernelLauncher(const DArrayLite bboxes1,
const DArrayLite bboxes2, DArrayLite ious,
const int mode, const bool aligned,
const int offset, cudaStream_t stream) {
int output_size = ious.size();
int num_bbox1 = bboxes1.dim(0);
int num_bbox2 = bboxes2.dim(0);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bboxes1.elemType().prim(), ([&] {
bbox_overlaps_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
bboxes1.ptr<scalar_t>(), bboxes2.ptr<scalar_t>(),
ious.ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
offset);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream);
void carafe_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& rfeatures = outs[0];
auto& routput = outs[1];
auto& rmasks = outs[2];
auto& output = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks,
output, kernel_size, group_size, scale_factor,
stream);
}
void carafe_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& rfeatures = ins[1];
const auto& masks = ins[2];
auto& rtop_grad = outs[0];
auto& rbottom_grad_hs = outs[1];
auto& rbottom_grad = outs[2];
auto& rmask_grad = outs[3];
auto& bottom_grad = outs[4];
auto& mask_grad = outs[5];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad,
rbottom_grad_hs, rbottom_grad, rmask_grad,
bottom_grad, mask_grad, kernel_size,
group_size, scale_factor, stream);
}
PARROTS_EXTENSION_REGISTER(carafe_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(4)
.apply(carafe_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(carafe_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(6)
.apply(carafe_backward_cuda)
.done();
#include "carafe_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CARAFEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite rfeatures,
DArrayLite routput, DArrayLite rmasks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
const int batch_size = output.dim(0);
const int channels = output.dim(1);
const int output_height = output.dim(2);
const int output_width = output.dim(3);
const int input_height = features.dim(2);
const int input_width = features.dim(3);
const int mask_channels = masks.dim(1);
// one warp per pixel
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(input_height * input_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, input_height * input_width, dh, dw,
features.ptr<scalar_t>(), rfeatures.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int dh = divideUP(mask_channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, mask_channels, output_height * output_width, dh, dw,
masks.ptr<scalar_t>(), rmasks.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
CARAFEForward<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rfeatures.ptr<scalar_t>(), rmasks.ptr<scalar_t>(),
kernel_size, group_size, scale_factor, channels, input_height,
input_width, output_height, output_width, mask_channels,
routput.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, channels, dh, dw,
routput.ptr<scalar_t>(), output.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CARAFEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite rfeatures,
const DArrayLite masks, DArrayLite rtop_grad, DArrayLite rbottom_grad_hs,
DArrayLite rbottom_grad, DArrayLite rmask_grad, DArrayLite bottom_grad,
DArrayLite mask_grad, const int kernel_size, const int group_size,
const int scale_factor, cudaStream_t stream) {
const int batch_size = top_grad.dim(0);
const int channels = top_grad.dim(1);
const int output_height = top_grad.dim(2);
const int output_width = top_grad.dim(3);
const int input_height = bottom_grad.dim(2);
const int input_width = bottom_grad.dim(3);
const int mask_channels = masks.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int dh = divideUP(channels, kTileDim);
const int dw = divideUP(output_height * output_width, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, channels, output_height * output_width, dh, dw,
top_grad.ptr<scalar_t>(), rtop_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int num_kernels =
batch_size * output_height * output_width * THREADS_PER_PIXEL;
CARAFEBackward_Feature<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
masks.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rbottom_grad_hs.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int num_kernels =
batch_size * input_height * input_width * THREADS_PER_PIXEL;
FeatureSum<scalar_t><<<divideUP(num_kernels, THREADS_PER_BLOCK),
THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, rbottom_grad_hs.ptr<scalar_t>(), scale_factor,
channels, input_height, input_width, rbottom_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int dh = divideUP(input_height * input_width, kTileDim);
const int dw = divideUP(channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, input_height * input_width, channels, dh, dw,
rbottom_grad.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int num_kernels = batch_size * output_height * output_width *
mask_channels * WARP_SIZE;
CARAFEBackward_Mask<scalar_t>
<<<divideUP(num_kernels, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0,
stream>>>(num_kernels, rtop_grad.ptr<scalar_t>(),
rfeatures.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, input_height, input_width,
output_height, output_width, mask_channels,
rmask_grad.ptr<scalar_t>());
}));
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
top_grad.elemType().prim(), ([&] {
const int dh = divideUP(output_height * output_width, kTileDim);
const int dw = divideUP(mask_channels, kTileDim);
BatchTranspose2DCUDAKernel<scalar_t>
<<<batch_size * dh * dw, dim3(kTileDim, kBlockRows), 0, stream>>>(
batch_size, output_height * output_width, mask_channels, dh, dw,
rmask_grad.ptr<scalar_t>(), mask_grad.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream);
void carafe_naive_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& features = ins[0];
const auto& masks = ins[1];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size,
group_size, scale_factor, stream);
}
void carafe_naive_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_size, group_size, scale_factor;
SSAttrs(attr)
.get<int>("kernel_size", kernel_size)
.get<int>("group_size", group_size)
.get<int>("scale_factor", scale_factor)
.done();
const auto& top_grad = ins[0];
const auto& features = ins[1];
const auto& masks = ins[2];
auto& bottom_grad = outs[0];
auto& mask_grad = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad,
mask_grad, kernel_size, group_size,
scale_factor, stream);
}
PARROTS_EXTENSION_REGISTER(carafe_naive_forward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(2)
.output(1)
.apply(carafe_naive_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(carafe_naive_backward)
.attr("kernel_size")
.attr("group_size")
.attr("scale_factor")
.input(3)
.output(2)
.apply(carafe_naive_backward_cuda)
.done();
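// --- Illustrative sketch (not part of the build) ----------------------------
// The two bindings above pull scalar attributes out of the op with a chained
// SSAttrs(attr).get<T>(name, var)...done() call. The toy class below mimics
// that fluent pattern over a plain std::map so the control flow is easy to
// follow; it is a sketch of the pattern only, not the parrots SSAttrs
// implementation, and every name in it is made up for illustration.
#include <cassert>
#include <map>
#include <string>

class ToyAttrs {
 public:
  explicit ToyAttrs(const std::map<std::string, int> &attrs) : attrs_(attrs) {}
  ToyAttrs &get(const std::string &name, int &out) {
    auto it = attrs_.find(name);
    assert(it != attrs_.end() && "missing attribute");
    out = it->second;
    return *this;  // returning *this is what makes the calls chainable
  }
  void done() const {}  // terminator; a real implementation could validate here

 private:
  const std::map<std::string, int> &attrs_;
};

inline int toy_attrs_demo() {
  std::map<std::string, int> attr{
      {"kernel_size", 3}, {"group_size", 1}, {"scale_factor", 2}};
  int kernel_size, group_size, scale_factor;
  ToyAttrs(attr)
      .get("kernel_size", kernel_size)
      .get("group_size", group_size)
      .get("scale_factor", scale_factor)
      .done();
  return kernel_size + group_size + scale_factor;  // 3 + 1 + 2 = 6
}
// -----------------------------------------------------------------------------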
#include "carafe_naive_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CARAFENAIVEForwardCUDAKernelLauncher(
const DArrayLite features, const DArrayLite masks, DArrayLite output,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = output.size();
int channels = output.dim(1);
int height = output.dim(2);
int width = output.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
carafe_naive_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, features.ptr<scalar_t>(), masks.ptr<scalar_t>(),
output.ptr<scalar_t>(), kernel_size, group_size, scale_factor,
channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CARAFENAIVEBackwardCUDAKernelLauncher(
const DArrayLite top_grad, const DArrayLite features,
const DArrayLite masks, DArrayLite bottom_grad, DArrayLite mask_grad,
const int kernel_size, const int group_size, const int scale_factor,
cudaStream_t stream) {
int output_size = top_grad.size();
int channels = top_grad.dim(1);
int height = top_grad.dim(2);
int width = top_grad.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
features.elemType().prim(), ([&] {
carafe_naive_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, top_grad.ptr<scalar_t>(), features.ptr<scalar_t>(),
masks.ptr<scalar_t>(), bottom_grad.ptr<scalar_t>(),
mask_grad.ptr<scalar_t>(), kernel_size, group_size,
scale_factor, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
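// --- Illustrative sketch (not part of the build) ----------------------------
// Both launchers above follow the one-thread-per-output-element pattern:
// output_size threads are requested as GET_BLOCKS(output_size) blocks of
// THREADS_PER_BLOCK threads, and inside the kernel each thread typically
// decodes its flat index back into NCHW coordinates. The helpers below show
// that arithmetic on the host side; the block size of 1024 is only an assumed
// value for the worked numbers (the real constant is defined in the CUDA
// helper header).
#include <cstdio>

constexpr int kThreadsPerBlockGuess = 1024;  // assumption for illustration

inline int toy_get_blocks(int n, int threads = kThreadsPerBlockGuess) {
  return (n + threads - 1) / threads;  // ceil(n / threads)
}

inline void toy_decode_nchw(int index, int channels, int height, int width) {
  // Standard NCHW flattening: index = ((n * C + c) * H + h) * W + w
  const int w = index % width;
  const int h = (index / width) % height;
  const int c = (index / width / height) % channels;
  const int n = index / width / height / channels;
  std::printf("index %d -> n=%d c=%d h=%d w=%d\n", index, n, c, h, w);
}

inline int toy_grid_demo() {
  const int output_size = 2 * 16 * 56 * 56;  // e.g. a 2x16x56x56 tensor: 100352
  return toy_get_blocks(output_size);        // ceil(100352 / 1024) = 98 blocks
}
// -----------------------------------------------------------------------------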
#include "parrots_cpp_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream);
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream);
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream);
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream);
void ca_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &t = ins[0];
const auto &f = ins[1];
auto &weight = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAForwardCUDAKernelLauncher(t, f, weight, ctx, stream);
}
void ca_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dw = ins[0];
const auto &t = ins[1];
const auto &f = ins[2];
auto &dt = outs[0];
auto &df = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CABackwardCUDAKernelLauncher(dw, t, f, dt, df, ctx, stream);
}
void ca_map_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &weight = ins[0];
const auto &g = ins[1];
auto &out = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapForwardCUDAKernelLauncher(weight, g, out, ctx, stream);
}
void ca_map_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
const auto &dout = ins[0];
const auto &weight = ins[1];
const auto &g = ins[2];
auto &dw = outs[0];
auto &dg = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
CAMapBackwardCUDAKernelLauncher(dout, weight, g, dw, dg, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(ca_forward)
.input(2)
.output(1)
.apply(ca_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(ca_backward)
.input(3)
.output(2)
.apply(ca_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_forward)
.input(2)
.output(1)
.apply(ca_map_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(ca_map_backward)
.input(3)
.output(2)
.apply(ca_map_backward_cuda)
.done();
#include "cc_attention_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void CAForwardCUDAKernelLauncher(const DArrayLite t, const DArrayLite f,
DArrayLite weight, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(t.ptr<scalar_t>(), f.ptr<scalar_t>(),
weight.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
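// --- Illustrative sketch (not part of the build) ----------------------------
// The launcher above covers the (h, w) plane with 32x32 thread blocks and adds
// h + w layers along the grid's z dimension. Worked numbers for a 65x97
// feature map (sizes chosen arbitrarily for illustration):
//   d1 = ceil(97 / 32) = 4, d2 = ceil(65 / 32) = 3, d3 = 65 + 97 = 162
//   => 4 * 3 * 162 = 1944 blocks of 32 * 32 = 1024 threads each.
// The helper below reproduces that arithmetic on the host side.
#include <cuda_runtime.h>

inline dim3 ca_forward_grid_sketch(int h, int w, dim3 threads = dim3(32, 32)) {
  const int d1 = (w + threads.x - 1) / threads.x;
  const int d2 = (h + threads.y - 1) / threads.y;
  const int d3 = h + w;  // matches the launcher above
  return dim3(d1, d2, d3);
}
// -----------------------------------------------------------------------------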
void CABackwardCUDAKernelLauncher(const DArrayLite dw, const DArrayLite t,
const DArrayLite f, DArrayLite dt,
DArrayLite df, CudaContext &ctx,
cudaStream_t stream) {
auto n = t.dim(0);
auto c = t.dim(1);
auto h = t.dim(2);
auto w = t.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(t.elemType().prim(), [&] {
ca_backward_kernel_t<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
dt.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(f.elemType().prim(), [&] {
ca_backward_kernel_f<scalar_t><<<blocks, threads, 0, stream>>>(
dw.ptr<scalar_t>(), t.ptr<scalar_t>(), f.ptr<scalar_t>(),
df.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapForwardCUDAKernelLauncher(const DArrayLite weight, const DArrayLite g,
DArrayLite out, CudaContext &ctx,
cudaStream_t stream) {
auto n = g.dim(0);
auto c = g.dim(1);
auto h = g.dim(2);
auto w = g.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = c;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
weight.ptr<scalar_t>(), g.ptr<scalar_t>(), out.ptr<scalar_t>(), n, c, h,
w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void CAMapBackwardCUDAKernelLauncher(const DArrayLite dout,
const DArrayLite weight,
const DArrayLite g, DArrayLite dw,
DArrayLite dg, CudaContext &ctx,
cudaStream_t stream) {
auto n = dout.dim(0);
auto c = dout.dim(1);
auto h = dout.dim(2);
auto w = dout.dim(3);
// Run kernel
dim3 threads(32, 32);
int d1 = (w + threads.x - 1) / threads.x;
int d2 = (h + threads.y - 1) / threads.y;
int d3 = h + w;
dim3 blocks(d1, d2, d3);
PARROTS_DISPATCH_FLOATING_TYPES(weight.elemType().prim(), [&] {
ca_map_backward_kernel_w<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dw.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_DISPATCH_FLOATING_TYPES(g.elemType().prim(), [&] {
ca_map_backward_kernel_g<scalar_t><<<blocks, threads, 0, stream>>>(
dout.ptr<scalar_t>(), weight.ptr<scalar_t>(), g.ptr<scalar_t>(),
dg.ptr<scalar_t>(), n, c, h, w);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Modified from
// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src
#include "parrots_cpp_helper.hpp"
void bottom_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void bottom_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void top_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void top_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void left_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void left_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void right_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
void right_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {}
PARROTS_EXTENSION_REGISTER(bottom_pool_forward)
.input(1)
.output(1)
.apply(bottom_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(bottom_pool_backward)
.input(2)
.output(1)
.apply(bottom_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_forward)
.input(1)
.output(1)
.apply(top_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(top_pool_backward)
.input(2)
.output(1)
.apply(top_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_forward)
.input(1)
.output(1)
.apply(left_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(left_pool_backward)
.input(2)
.output(1)
.apply(left_pool_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_forward)
.input(1)
.output(1)
.apply(right_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(right_pool_backward)
.input(2)
.output(1)
.apply(right_pool_backward_cuda)
.done();
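// --- Illustrative sketch (not part of the build) ----------------------------
// The corner pooling handlers above are registered but currently have empty
// bodies. For orientation only: in the CornerNet reference linked above,
// bottom pooling is a running maximum down each column, so every output
// element is the max of all inputs at or above it. The minimal CPU sketch
// below works on a single H x W channel (the CUDA/parrots version would act on
// NCHW tensors); treat it as one reading of that reference, not as the missing
// implementation.
#include <algorithm>
#include <vector>

inline void bottom_pool_sketch(const std::vector<float> &in,
                               std::vector<float> &out, int height, int width) {
  out = in;  // start from a copy; row 0 is already final
  for (int y = 1; y < height; ++y)
    for (int x = 0; x < width; ++x)
      out[y * width + x] =
          std::max(out[y * width + x], out[(y - 1) * width + x]);
}
// top/left/right pooling follow the same pattern with the scan direction flipped.
// -----------------------------------------------------------------------------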
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void DeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite offset,
DArrayLite output, DArrayLite columns, DArrayLite ones, int kW, int kH,
int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step, CudaContext& ctx,
cudaStream_t stream);
void DeformConvBackwardInputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset,
const DArrayLite gradOutput, DArrayLite gradInput, DArrayLite gradOffset,
DArrayLite weight, DArrayLite columns, int kW, int kH, int dW, int dH,
int padW, int padH, int dilationW, int dilationH, int group,
int deformable_group, int im2col_step, CudaContext& ctx,
cudaStream_t stream);
void DeformConvBackwardParametersCUDAKernelLauncher(
const DArrayLite input, const DArrayLite offset,
const DArrayLite gradOutput, DArrayLite gradWeight, DArrayLite columns,
DArrayLite ones, int kW, int kH, int dW, int dH, int padW, int padH,
int dilationW, int dilationH, int group, int deformable_group, float scale,
int im2col_step, CudaContext& ctx, cudaStream_t stream);
void deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
const auto input = ins[0];
const auto weight = ins[1];
const auto offset = ins[2];
auto output = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvForwardCUDAKernelLauncher(
input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH,
dilationW, dilationH, group, deformable_group, im2col_step, ctx, stream);
}
void deform_conv_backward_input_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradInput = outs[0];
auto gradOffset = outs[1];
auto weight = outs[2];
auto columns = outs[3];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardInputCUDAKernelLauncher(
input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH,
dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step, ctx, stream);
}
void deform_conv_backward_parameters_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, deformable_group,
im2col_step;
float scale;
SSAttrs(attr)
.get<int>("kW", kW)
.get<int>("kH", kH)
.get<int>("dW", dW)
.get<int>("dH", dH)
.get<int>("padW", padW)
.get<int>("padH", padH)
.get<int>("dilationW", dilationW)
.get<int>("dilationH", dilationH)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<float>("scale", scale)
.get<int>("im2col_step", im2col_step)
.done();
auto input = ins[0];
auto offset = ins[1];
auto gradOutput = ins[2];
auto gradWeight = outs[0];
auto columns = outs[1];
auto ones = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformConvBackwardParametersCUDAKernelLauncher(
input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH,
padW, padH, dilationW, dilationH, group, deformable_group, scale,
im2col_step, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(deform_conv_forward)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_input)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("im2col_step")
.input(3)
.output(4)
.apply(deform_conv_backward_input_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_conv_backward_parameters)
.attr("kW")
.attr("kH")
.attr("dW")
.attr("dH")
.attr("padW")
.attr("padH")
.attr("dilationW")
.attr("dilationH")
.attr("group")
.attr("deformable_group")
.attr("scale")
.attr("im2col_step")
.input(3)
.output(3)
.apply(deform_conv_backward_parameters_cuda)
.done();
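// --- Illustrative sketch (not part of the build) ----------------------------
// The deform_conv bindings above pass the usual convolution geometry
// attributes (kW/kH, stride dW/dH, padW/padH, dilationW/dilationH) through to
// the launchers. The output spatial size implied by those attributes follows
// the standard convolution formula:
//   outW = (inW + 2*padW - dilationW*(kW - 1) - 1) / dW + 1   (likewise for H)
// The helper below is a worked example of that formula only.
#include <cassert>

inline int conv_out_size(int in, int k, int stride, int pad, int dilation) {
  return (in + 2 * pad - dilation * (k - 1) - 1) / stride + 1;
}

inline int conv_out_size_demo() {
  // A 3x3 kernel with stride 1, pad 1, dilation 1 keeps a 56-wide input at 56.
  int w = conv_out_size(56, 3, 1, 1, 1);
  assert(w == 56);
  return w;
}
// -----------------------------------------------------------------------------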
#include "parrots_cpp_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite rois, const DArrayLite offset,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream);
void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset,
int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream);
void deform_roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
const auto& offset = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolForwardCUDAKernelLauncher(
input, rois, offset, output, pooled_height, pooled_width, spatial_scale,
sampling_ratio, gamma, stream);
}
void deform_roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
int sampling_ratio;
float gamma;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<float>("gamma", gamma)
.done();
const auto& grad_output = ins[0];
const auto& input = ins[1];
const auto& rois = ins[2];
const auto& offset = ins[3];
auto& grad_input = outs[0];
auto& grad_offset = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
DeformRoIPoolBackwardCUDAKernelLauncher(
grad_output, input, rois, offset, grad_input, grad_offset, pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, stream);
}
PARROTS_EXTENSION_REGISTER(deform_roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(3)
.output(1)
.apply(deform_roi_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(deform_roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("gamma")
.input(4)
.output(2)
.apply(deform_roi_pool_backward_cuda)
.done();
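// --- Illustrative sketch (not part of the build) ----------------------------
// deform_roi_pool takes RoIs in input-image coordinates, scales them by
// spatial_scale onto the feature map, and divides each RoI into
// pooled_height x pooled_width bins (with sampling_ratio and gamma controlling
// the per-bin sampling and the offset magnitude). The bin-size arithmetic
// below follows the usual RoIAlign-style convention and is a sketch of that
// geometry only, not the kernel.
struct RoiBinSketch {
  float bin_h, bin_w;  // size of one pooling bin, in feature-map pixels
};

inline RoiBinSketch roi_bin_size(float x1, float y1, float x2, float y2,
                                 float spatial_scale, int pooled_height,
                                 int pooled_width) {
  const float roi_w = (x2 - x1) * spatial_scale;  // RoI width on the feature map
  const float roi_h = (y2 - y1) * spatial_scale;
  return {roi_h / pooled_height, roi_w / pooled_width};
}
// Example: a 224x224 RoI with spatial_scale 1/16 and a 7x7 output spans 14x14
// feature-map pixels, i.e. bins of 2x2 pixels each.
// -----------------------------------------------------------------------------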
#include "deform_roi_pool_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void DeformRoIPoolForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite rois, const DArrayLite offset,
DArrayLite output, int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
deform_roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
offset.ptr<scalar_t>(), output.ptr<scalar_t>(), pooled_height,
pooled_width, spatial_scale, sampling_ratio, gamma, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void DeformRoIPoolBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite input, const DArrayLite rois,
const DArrayLite offset, DArrayLite grad_input, DArrayLite grad_offset,
int pooled_height, int pooled_width, float spatial_scale,
int sampling_ratio, float gamma, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
deform_roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), input.ptr<scalar_t>(),
rois.ptr<scalar_t>(), offset.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), grad_offset.ptr<scalar_t>(),
pooled_height, pooled_width, spatial_scale, sampling_ratio,
gamma, channels, height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
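// --- Illustrative sketch (not part of the build) ----------------------------
// Every launcher in these files wraps its kernel launch in a
// PARROTS_DISPATCH_FLOATING_TYPES(_AND_HALF) macro. Conceptually, such a
// dispatch macro switches on the runtime element type, binds scalar_t to the
// matching C++ type, and runs the lambda once. The toy function below shows
// that idea with a plain switch; it is not the parrots macro, and all names in
// it are made up for illustration.
#include <cstddef>

enum class ToyDType { kFloat32, kFloat64 };

template <typename Fn>
void toy_dispatch_floating(ToyDType dtype, Fn &&fn) {
  switch (dtype) {
    case ToyDType::kFloat32: fn(float{}); break;   // lambda sees a float tag
    case ToyDType::kFloat64: fn(double{}); break;  // lambda sees a double tag
  }
}

inline std::size_t toy_dispatch_demo(ToyDType dtype) {
  std::size_t elem_size = 0;
  toy_dispatch_floating(dtype, [&](auto tag) {
    using scalar_t = decltype(tag);  // plays the role of scalar_t in the macro
    elem_size = sizeof(scalar_t);
  });
  return elem_size;  // 4 for kFloat32, 8 for kFloat64 on typical platforms
}
// -----------------------------------------------------------------------------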