Unverified Commit ee041cec authored by Wang Xinjiang's avatar Wang Xinjiang Committed by GitHub
Browse files

[Feature]: Add Rotated ROI align op for pytorch (cpu&cuda), parrots (cpu&cuda)...


[Feature]: Add Rotated ROI align op for pytorch (cpu&cuda), parrots (cpu&cuda) and onnxruntime (cpu) (#933)

* add roi_align_rotated

* code format

* Add align key to roi align rotated

* Add clockwise for rotated roi align

* fix bugs in onnx export

* Add docstring for RoIAlignRotated

* remove cuda unittest

* Reformat c++ code

* add onnx roi align rotated file

* fix unittest

* Add  cpu and float64 of cuda support for parrots

* code format

* Add unified header to roi align rotated
Co-authored-by: default avatarluopeichao <luopeichao@sensetime.com>
parent de4f14e9
......@@ -24,6 +24,7 @@ from .point_sample import (SimpleRoIAlign, point_sample,
rel_roi_point_to_rel_img_point)
from .psa_mask import PSAMask
from .roi_align import RoIAlign, roi_align
from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
from .roi_pool import RoIPool, roi_pool
from .saconv import SAConv2d
from .sync_bn import SyncBatchNorm
......@@ -44,5 +45,6 @@ __all__ = [
'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
'SAConv2d', 'TINShift', 'tin_shift', 'box_iou_rotated', 'nms_rotated',
'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu'
'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
'RoIAlignRotated', 'roi_align_rotated'
]
......@@ -16,8 +16,10 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
Arguments:
bboxes1 (Tensor): rotated bboxes 1. \
It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
bboxes2 (Tensor): rotated bboxes 2. \
It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
Note that theta is in radian.
mode (str): "iou" (intersection over union) or "iof" (intersection over
foreground).
......
......@@ -4,12 +4,14 @@
#include "nms.h"
#include "ort_mmcv_utils.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "soft_nms.h"
const char *c_MMCVOpDomain = "mmcv";
SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
GridSampleOp c_GridSampleOp;
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
......@@ -34,6 +36,11 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoIAlignRotatedCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
return status;
}
......
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "roi_align_rotated.h"
#include "../ort_mmcv_utils.h"
// Precomputed bilinear-interpolation data for one sampling point:
// pos1..pos4 are flattened (y * width + x) indices of the four neighbouring
// pixels within a single (height x width) channel plane, and w1..w4 are the
// matching interpolation weights. All-zero entries mark out-of-bounds samples.
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  float w1;
  float w2;
  float w3;
  float w4;
};
// Precompute, for every (ph, pw, iy, ix) sampling point of one rotated ROI,
// the four bilinear neighbours and weights on the feature map.
// Sample points are laid out on an axis-aligned grid relative to the ROI
// centre (roi_start_h/roi_start_w are centre-relative offsets), then rotated
// by theta (cos_theta/sin_theta) and translated to (roi_center_h,
// roi_center_w). Entries are written to pre_calc in (ph, pw, iy, ix) order so
// the per-channel pooling loop can replay them without redoing the math.
// iy_upper/ix_upper bound the sampling loops (callers pass the grid sizes).
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h,
    float roi_center_w, float cos_theta, float sin_theta,
    std::vector<PreCalc> &pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        // Sample at the centre of each sub-cell of the bin grid.
        const float yy =
            roi_start_h + ph * bin_size_h +
            static_cast<float>(iy + .5f) * bin_size_h /
                static_cast<float>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const float xx = roi_start_w + pw * bin_size_w +
                           static_cast<float>(ix + .5f) * bin_size_w /
                               static_cast<float>(roi_bin_grid_w);
          // Rotate by theta around the center and translate
          // In image space, (y, x) is the order for Right Handed System,
          // and this is essentially multiplying the point by a rotation matrix
          // to rotate it counterclockwise through angle theta.
          float y = yy * cos_theta - xx * sin_theta + roi_center_h;
          float x = yy * sin_theta + xx * cos_theta + roi_center_w;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty: zero weights make the sample contribute nothing
            PreCalc pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }
          // Clamp slightly-negative coordinates onto the map.
          if (y < 0) {
            y = 0;
          }
          if (x < 0) {
            x = 0;
          }
          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;
          // Snap to the last row/column at the border.
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (float)y_low;
          } else {
            y_high = y_low + 1;
          }
          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (float)x_low;
          } else {
            x_high = x_low + 1;
          }
          // Bilinear weights from the fractional offsets.
          float ly = y - y_low;
          float lx = x - x_low;
          float hy = 1. - ly, hx = 1. - lx;
          float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
          // save weights and indices
          PreCalc pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;
          pre_calc_index += 1;
        }
      }
    }
  }
}
// Single-threaded CPU forward pass used by the ONNX Runtime custom op.
// nthreads: total number of output elements
//           (= n_rois * channels * pooled_height * pooled_width).
// rois: 6 floats per row, (batch_idx, center_x, center_y, w, h, theta);
//       theta is in radians and is negated when `clockwise` is non-zero.
// input/output are assumed densely packed NCHW (the indexing below relies
// on it). Bilinear sample positions and weights are precomputed once per
// ROI and reused across all channels.
void ROIAlignRotatedForwardCPU(const int nthreads, const float *input,
                               const float *rois, float *output,
                               const float &spatial_scale, const int aligned,
                               const int clockwise, const int channels,
                               const int height, const int width,
                               const int pooled_height, const int pooled_width,
                               const int sampling_ratio) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;
    const float *current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];
    // Do not use rounding; this implementation detail is critical
    float offset = aligned ? (float)0.5 : (float)0.0;
    float roi_center_w = current_roi[1] * spatial_scale - offset;
    float roi_center_h = current_roi[2] * spatial_scale - offset;
    float roi_width = current_roi[3] * spatial_scale;
    float roi_height = current_roi[4] * spatial_scale;
    // float theta = current_roi[5] * M_PI / 180.0;
    float theta = current_roi[5];  // Radian angle by default
    if (clockwise) {
      theta = -theta;
    }
    float cos_theta = cos(theta);
    float sin_theta = sin(theta);
    if (!aligned) {  // for backward-compatibility only
      roi_width = std::max(roi_width, (float)1.);
      roi_height = std::max(roi_height, (float)1.);
    }
    float bin_size_h =
        static_cast<float>(roi_height) / static_cast<float>(pooled_height);
    float bin_size_w =
        static_cast<float>(roi_width) / static_cast<float>(pooled_width);
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin; clamp to >= 1 so the
    // division below never hits zero for degenerate ROIs.
    const float count =
        std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4
    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                  pooled_width * pooled_height);
    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    float roi_start_h = -roi_height / 2.0;
    float roi_start_w = -roi_width / 2.0;
    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
        sin_theta, pre_calc);
    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const float *offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;
      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;
          float output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                            pc.w2 * offset_input[pc.pos2] +
                            pc.w3 * offset_input[pc.pos3] +
                            pc.w4 * offset_input[pc.pos4];
              pre_calc_index += 1;
            }
          }
          output_val /= count;  // average over the sampling grid
          output[index] = output_val;
        }  // for pw
      }    // for ph
    }      // for c
  }        // for n
}
// Run the op: read X (NCHW float feature map) and rois (K x 6 float rows of
// (batch_idx, x, y, w, h, theta)), allocate the (K, C, output_h, output_w)
// output, and delegate to the single-threaded CPU kernel.
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
  // Setup inputs. GetTensorData<float> already yields `const float *`;
  // no reinterpret_cast needed.
  const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
  const float *X_data = ort_.GetTensorData<float>(input_X);
  const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
  const float *rois = ort_.GetTensorData<float>(input_rois);

  // Setup output: start from the input shape and patch N/H/W so the result
  // is (num_rois, input_channels, aligned_height_, aligned_width_).
  OrtTensorDimensions out_dimensions(ort_, input_X);
  OrtTensorDimensions roi_dimensions(ort_, input_rois);

  int input_channels = out_dimensions.data()[1];
  int input_height = out_dimensions.data()[2];
  int input_width = out_dimensions.data()[3];
  out_dimensions.data()[0] = roi_dimensions.data()[0];
  out_dimensions.data()[2] = aligned_height_;
  out_dimensions.data()[3] = aligned_width_;

  OrtValue *output = ort_.KernelContext_GetOutput(
      context, 0, out_dimensions.data(), out_dimensions.size());
  float *out = ort_.GetTensorMutableData<float>(output);

  // Total element count drives the kernel's outer loop.
  int output_size = out_dimensions.data()[0];
  for (size_t i = 1; i < out_dimensions.size(); ++i) {
    output_size *= out_dimensions.data()[i];
  }

  ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_,
                            aligned_, clockwise_, input_channels, input_height,
                            input_width, aligned_height_, aligned_width_,
                            sampling_ratio_);
}
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
// ONNX Runtime CPU kernel for the MMCVRoIAlignRotated custom op.
// All op attributes are read once at construction and cached as members;
// Compute() (defined in the .cpp) performs the actual pooling.
struct MMCVRoIAlignRotatedKernel {
public:
  MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
      : ort_(ort) {
    // Pooled output spatial size.
    aligned_height_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
    aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
    // Sampling grid per bin; <= 0 means adaptive (ceil of bin size).
    sampling_ratio_ =
        ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
    spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
    // int64 attrs used as booleans: pixel-centre alignment and whether theta
    // is given clockwise.
    aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
    clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
  }

  void Compute(OrtKernelContext* context);

private:
  Ort::CustomOpApi ort_;
  int aligned_height_;   // pooled output height
  int aligned_width_;    // pooled output width
  float spatial_scale_;  // input-to-feature-map coordinate scale
  int sampling_ratio_;   // samples per bin side (<= 0: adaptive)
  int aligned_;          // non-zero: shift coords by -0.5 (pixel centres)
  int clockwise_;        // non-zero: negate theta
};
// Registration shim describing MMCVRoIAlignRotated to ONNX Runtime:
// two float tensor inputs (features, rois), one float tensor output,
// pinned to the CPU execution provider.
struct MMCVRoIAlignRotatedCustomOp
    : Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp,
                        MMCVRoIAlignRotatedKernel> {
  void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) {
    return new MMCVRoIAlignRotatedKernel(api, info);
  }

  const char* GetName() const { return "MMCVRoIAlignRotated"; }

  size_t GetInputTypeCount() const { return 2; }
  ONNXTensorElementDataType GetInputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }

  // force cpu
  const char* GetExecutionProviderType() const {
    return "CPUExecutionProvider";
  }
};
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignRotatedForwardCUDAKernelLauncher(
const at::Tensor features, const at::Tensor rois, const float spatial_scale,
const int sample_num, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor output);
void ROIAlignRotatedBackwardCUDAKernelLauncher(
const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
const int sample_num, const bool aligned, const bool clockwise,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, at::Tensor bottom_grad);
// CUDA forward wrapper: validates the roi layout and unpacks tensor shapes
// for the kernel launcher. Rotated ROIs carry 6 values per row:
// (batch_idx, center_x, center_y, w, h, theta).
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
                                    int pooled_height, int pooled_width,
                                    float spatial_scale, int sample_num,
                                    bool aligned, bool clockwise) {
  if (rois.size(1) != 6) {
    AT_ERROR("wrong roi size");
  }
  const int num_rois = rois.size(0);
  const int num_channels = features.size(1);
  const int data_height = features.size(2);
  const int data_width = features.size(3);
  ROIAlignRotatedForwardCUDAKernelLauncher(
      features, rois, spatial_scale, sample_num, aligned, clockwise,
      num_channels, data_height, data_width, num_rois, pooled_height,
      pooled_width, output);
}
// CUDA backward wrapper: validates the roi layout and unpacks the gradient
// tensor shapes for the kernel launcher. bottom_grad receives the gradient
// w.r.t. the input feature map.
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int pooled_height,
                                     int pooled_width, float spatial_scale,
                                     int sample_num, bool aligned,
                                     bool clockwise) {
  if (rois.size(1) != 6) {
    AT_ERROR("wrong roi size");
  }
  const int num_rois = rois.size(0);
  const int num_channels = bottom_grad.size(1);
  const int data_height = bottom_grad.size(2);
  const int data_width = bottom_grad.size(3);
  ROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, sample_num, aligned, clockwise,
      num_channels, data_height, data_width, num_rois, pooled_height,
      pooled_width, bottom_grad);
}
#endif
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
// CPU forward entry point (signature matches the CUDA variant).
void roi_align_rotated_forward_cpu(Tensor features, Tensor rois, Tensor output,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sample_num,
                                   bool aligned, bool clockwise) {
  // Rotated ROIs must be (n, 6): (batch_idx, x, y, w, h, theta). The CUDA
  // wrapper enforces this; mirror the check here so malformed rois fail
  // loudly instead of reading out of bounds in the kernel.
  if (rois.size(1) != 6) {
    AT_ERROR("wrong roi size");
  }
  ROIAlignRotatedForwardCPULauncher(features, rois, output, pooled_height,
                                    pooled_width, spatial_scale, sample_num,
                                    aligned, clockwise);
}
// CPU backward entry point. Parameters renamed from the misleading
// features/output: positionally they are the pooled-output gradient and the
// input-feature gradient, exactly as passed to the launcher.
void roi_align_rotated_backward_cpu(Tensor grad_output, Tensor rois,
                                    Tensor grad_input, int pooled_height,
                                    int pooled_width, float spatial_scale,
                                    int sample_num, bool aligned,
                                    bool clockwise) {
  // Same (n, 6) roi-layout check as the CUDA wrapper.
  if (rois.size(1) != 6) {
    AT_ERROR("wrong roi size");
  }
  ROIAlignRotatedBackwardCPULauncher(grad_output, rois, grad_input,
                                     pooled_height, pooled_width, spatial_scale,
                                     sample_num, aligned, clockwise);
}
// Device dispatcher for the RoIAlign-Rotated forward pass: routes to the
// CUDA launcher when `input` is a GPU tensor (and CUDA support is compiled
// in), otherwise to the CPU implementation. Tensors are validated with the
// CHECK_CUDA_INPUT / CHECK_CPU_INPUT macros from pytorch_cpp_helper.
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int pooled_height, int pooled_width,
                               float spatial_scale, int sample_num,
                               bool aligned, bool clockwise) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);

    roi_align_rotated_forward_cuda(input, rois, output, pooled_height,
                                   pooled_width, spatial_scale, sample_num,
                                   aligned, clockwise);
#else
    AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(output);
    roi_align_rotated_forward_cpu(input, rois, output, pooled_height,
                                  pooled_width, spatial_scale, sample_num,
                                  aligned, clockwise);
  }
}
// Device dispatcher for the RoIAlign-Rotated backward pass: gradient of the
// pooled output (grad_output) is scattered back into grad_input. Routing and
// validation mirror roi_align_rotated_forward.
void roi_align_rotated_backward(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int pooled_height,
                                int pooled_width, float spatial_scale,
                                int sample_num, bool aligned, bool clockwise) {
  if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(grad_input);

    roi_align_rotated_backward_cuda(grad_output, rois, grad_input,
                                    pooled_height, pooled_width, spatial_scale,
                                    sample_num, aligned, clockwise);
#else
    AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(grad_output);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(grad_input);
    roi_align_rotated_backward_cpu(grad_output, rois, grad_input, pooled_height,
                                   pooled_width, spatial_scale, sample_num,
                                   aligned, clockwise);
  }
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
// Precomputed bilinear-interpolation data for one sampling point (templated
// scalar version of the ONNX-side struct): pos1..pos4 are flattened
// (y * width + x) neighbour indices within one channel plane, w1..w4 the
// matching weights. All-zero entries mark out-of-bounds samples.
template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};
// Precompute, for every (ph, pw, iy, ix) sampling point of one rotated ROI,
// the four bilinear neighbours and weights on the feature map. Points are
// generated on an axis-aligned grid relative to the ROI centre
// (roi_start_h/roi_start_w), rotated by theta (cos_theta/sin_theta), then
// translated to (roi_center_h, roi_center_w). Results land in pre_calc in
// (ph, pw, iy, ix) order for replay by the per-channel pooling loop.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w,
    T cos_theta, T sin_theta, std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        // Sample at the centre of each sub-cell of the bin grid.
        const T yy = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
                       static_cast<T>(ix + .5f) * bin_size_w /
                           static_cast<T>(roi_bin_grid_w);
          // Rotate by theta around the center and translate
          // In image space, (y, x) is the order for Right Handed System,
          // and this is essentially multiplying the point by a rotation matrix
          // to rotate it counterclockwise through angle theta.
          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty: zero weights make the sample contribute nothing
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }
          // Clamp slightly-negative coordinates onto the map.
          if (y < 0) {
            y = 0;
          }
          if (x < 0) {
            x = 0;
          }
          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;
          // Snap to the last row/column at the border.
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }
          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }
          // Bilinear weights from the fractional offsets.
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;
          pre_calc_index += 1;
        }
      }
    }
  }
}
// CPU forward pass for RoIAlign-Rotated, templated on the scalar type.
// input:  densely packed NCHW feature map (the indexing below relies on it).
// rois:   6 scalars per row, (batch_idx, center_x, center_y, w, h, theta);
//         theta is in radians and is negated when `clockwise` is set.
// output: (n_rois, channels, pooled_height, pooled_width), each bin averaged
//         over a roi_bin_grid_h x roi_bin_grid_w grid of bilinear samples.
// nthreads is the total number of output elements.
template <typename T>
void ROIAlignRotatedForward(const int nthreads, const T* input,
                            const T& spatial_scale, const bool aligned,
                            const bool clockwise, const int channels,
                            const int height, const int width,
                            const int pooled_height, const int pooled_width,
                            const int sampling_ratio, const T* rois,
                            T* output) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;
    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];
    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);
    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlignRotated do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin; clamp to >= 1 so the
    // division below never hits zero for degenerate ROIs.
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4
    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                     pooled_width * pooled_height);
    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;
    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
        sin_theta, pre_calc);
    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T* offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;
      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;
          T output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                            pc.w2 * offset_input[pc.pos2] +
                            pc.w3 * offset_input[pc.pos3] +
                            pc.w4 * offset_input[pc.pos4];
              pre_calc_index += 1;
            }
          }
          output_val /= count;  // average over the sampling grid
          output[index] = output_val;
        }  // for pw
      }    // for ph
    }      // for c
  }        // for n
}
// Compute the four bilinear-interpolation weights (w1..w4) and integer
// neighbour coordinates for the point (y, x), mirroring the forward
// interpolation so gradients can be scattered back onto the feature map.
// Points more than one pixel outside the map yield all-zero weights and
// coordinates of -1, which callers treat as "skip".
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T& w1, T& w2, T& w3, T& w4, int& x_low,
                                   int& x_high, int& y_low, int& y_high) {
  // Out of the valid border: no gradient flows back.
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  // Clamp slightly-negative coordinates onto the map.
  y = std::max(y, (T)0);
  x = std::max(x, (T)0);

  y_low = static_cast<int>(y);
  x_low = static_cast<int>(x);

  // Snap to the last row/column when the low neighbour hits the border.
  if (y_low >= height - 1) {
    y_low = height - 1;
    y_high = y_low;
    y = static_cast<T>(y_low);
  } else {
    y_high = y_low + 1;
  }
  if (x_low >= width - 1) {
    x_low = width - 1;
    x_high = x_low;
    x = static_cast<T>(x_low);
  } else {
    x_high = x_low + 1;
  }

  // Fractional offsets and the resulting bilinear weights:
  // w1..w4 correspond to (low,low), (low,high), (high,low), (high,high).
  const T ly = y - y_low;
  const T lx = x - x_low;
  const T hy = 1. - ly;
  const T hx = 1. - lx;
  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
  return;
}
// Accumulate `val` into `*address`. A plain (non-atomic) add suffices here:
// the CPU backward pass below is single threaded (see the note at its
// call site).
template <class T>
inline void add(T* address, const T& val) {
  *address += val;
}
// CPU backward pass for RoIAlign-Rotated, templated on the scalar type.
// grad_output may be non-contiguous, hence the explicit n/c/h/w strides;
// grad_input is assumed densely packed NCHW and is accumulated into.
// rois rows are (batch_idx, center_x, center_y, w, h, theta), theta in
// radians (negated when `clockwise`). nthreads is the number of pooled
// output elements.
template <typename T>
void ROIAlignRotatedBackward(
    const int nthreads,
    // may not be contiguous. should index using n_stride, etc
    const T* grad_output, const T& spatial_scale, const bool aligned,
    const bool clockwise, const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, const int sampling_ratio,
    T* grad_input, const T* rois, const int n_stride, const int c_stride,
    const int h_stride, const int w_stride) {
  for (int index = 0; index < nthreads; index++) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlignRotated do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    int output_offset = n * n_stride + c * c_stride;
    const T* offset_grad_output = grad_output + output_offset;
    const T grad_output_this_bin =
        offset_grad_output[ph * h_stride + pw * w_stride];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    // We do average (integral) pooling inside a bin.
    // Clamp to >= 1 to match the forward pass and avoid a division by zero
    // for degenerate (zero-sized) aligned ROIs.
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T yy = roi_start_h + ph * bin_size_h +
                   static_cast<T>(iy + .5f) * bin_size_h /
                       static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
                     static_cast<T>(ix + .5f) * bin_size_w /
                         static_cast<T>(roi_bin_grid_w);
        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high);

        // Each of the four neighbours receives its weighted share of the
        // averaged bin gradient.
        T g1 = grad_output_this_bin * w1 / count;
        T g2 = grad_output_this_bin * w2 / count;
        T g3 = grad_output_this_bin * w3 / count;
        T g4 = grad_output_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // atomic add is not needed for now since it is single threaded
          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
        }  // if
      }    // ix
    }      // iy
  }        // for
}  // ROIAlignRotatedBackward
// Dispatch the templated CPU forward over float/double/half according to the
// input dtype. aligned_height/aligned_width are the pooled output size.
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       bool aligned, bool clockwise) {
  int output_size = output.numel();
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlignRotated_forward", [&] {
        ROIAlignRotatedForward<scalar_t>(
            output_size, input.data_ptr<scalar_t>(),
            static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
            height, width, aligned_height, aligned_width, sampling_ratio,
            rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>());
      });
}
// Dispatch the templated CPU backward over float/double/half according to
// the grad_output dtype. grad_output strides are forwarded explicitly
// because it may be non-contiguous; grad_input is written densely.
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, bool aligned,
                                        bool clockwise) {
  int output_size = grad_output.numel();
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  // get stride values to ensure indexing into gradients is correct.
  int n_stride = grad_output.stride(0);
  int c_stride = grad_output.stride(1);
  int h_stride = grad_output.stride(2);
  int w_stride = grad_output.stride(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "ROIAlignRotated_backward", [&] {
        ROIAlignRotatedBackward<scalar_t>(
            grad_output.numel(), grad_output.data_ptr<scalar_t>(),
            static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
            height, width, aligned_height, aligned_width, sampling_ratio,
            grad_input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
            n_stride, c_stride, h_stride, w_stride);
      });
}
#include "pytorch_cuda_helper.hpp"
#include "roi_align_rotated_cuda_kernel.cuh"
// Launch the CUDA forward kernel, dispatching over float/double/half.
// Modernized to scalar_type()/data_ptr<scalar_t>() — the Tensor::type() and
// data<T>() accessors are deprecated, and the CPU launchers in this PR
// already use the new API; the dispatch label string is kept unchanged.
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output) {
  // One output element per (roi, channel, ph, pw).
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] {
        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();

        roi_align_rotated_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data, scalar_t(spatial_scale),
                sample_num, aligned, clockwise, channels, height, width,
                pooled_height, pooled_width, top_data);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
// Launch the CUDA backward kernel, dispatching over float/double/half.
// Modernized to scalar_type()/data_ptr<scalar_t>() (deprecated accessors),
// and spatial_scale is cast to scalar_t to match the forward launcher;
// the dispatch label string is kept unchanged.
void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
  // One gradient element per (roi, channel, ph, pw) of the pooled output.
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();

        roi_align_rotated_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data, scalar_t(spatial_scale),
                sample_num, aligned, clockwise, channels, height, width,
                pooled_height, pooled_width, bottom_diff);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "roi_align_rotated_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
// Parrots CUDA adapter: unpack the op attributes, wrap the parrots buffers
// as ATen tensors, and delegate to roi_align_rotated_forward_cuda.
// ins: [0] input feature map, [1] rois; outs: [0] pooled output.
void roi_align_rotated_forward_cuda_parrots(CudaContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  int pooled_height;
  int pooled_width;
  float spatial_scale;
  int sample_num;
  bool aligned;
  bool clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sample_num", sample_num)
      .get<bool>("aligned", aligned)
      .get<bool>("clockwise", clockwise)
      .done();

  const auto& input = buildATensor(ctx, ins[0]);
  const auto& rois = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);
  roi_align_rotated_forward_cuda(input, rois, output, pooled_height,
                                 pooled_width, spatial_scale, sample_num,
                                 aligned, clockwise);
}
// Parrots CUDA adapter for the backward pass: unpack attributes, wrap the
// parrots buffers as ATen tensors, and delegate to
// roi_align_rotated_backward_cuda.
// ins: [0] grad of pooled output, [1] rois; outs: [0] grad of input.
void roi_align_rotated_backward_cuda_parrots(CudaContext& ctx,
                                             const SSElement& attr,
                                             const OperatorBase::in_list_t& ins,
                                             OperatorBase::out_list_t& outs) {
  int pooled_height;
  int pooled_width;
  float spatial_scale;
  int sample_num;
  bool aligned;
  bool clockwise;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_height)
      .get<int>("pooled_width", pooled_width)
      .get<float>("spatial_scale", spatial_scale)
      .get<int>("sample_num", sample_num)
      .get<bool>("aligned", aligned)
      .get<bool>("clockwise", clockwise)
      .done();

  const auto& grad_output = buildATensor(ctx, ins[0]);
  const auto& rois = buildATensor(ctx, ins[1]);
  auto grad_input = buildATensor(ctx, outs[0]);
  roi_align_rotated_backward_cuda(grad_output, rois, grad_input, pooled_height,
                                  pooled_width, spatial_scale, sample_num,
                                  aligned, clockwise);
}
#endif
// Parrots CPU adapter: unpack op attributes, wrap parrots tensors as ATen
// tensors, and delegate to roi_align_rotated_forward_cpu.
void roi_align_rotated_forward_cpu_parrots(HostContext& ctx,
                                           const SSElement& attr,
                                           const OperatorBase::in_list_t& ins,
                                           OperatorBase::out_list_t& outs) {
  // ins = (features, rois), outs = (output).
  const auto& input = buildATensor(ctx, ins[0]);
  const auto& rois = buildATensor(ctx, ins[1]);
  auto output = buildATensor(ctx, outs[0]);

  // Pull the pooling attributes out of the SSElement.
  int pooled_h;
  int pooled_w;
  float scale;
  int samples;
  bool align_flag;
  bool cw;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sample_num", samples)
      .get<bool>("aligned", align_flag)
      .get<bool>("clockwise", cw)
      .done();

  roi_align_rotated_forward_cpu(input, rois, output, pooled_h, pooled_w, scale,
                                samples, align_flag, cw);
}
// Parrots CPU adapter for the backward pass: unpack attributes, wrap the
// tensors, and delegate to roi_align_rotated_backward_cpu.
void roi_align_rotated_backward_cpu_parrots(HostContext& ctx,
                                            const SSElement& attr,
                                            const OperatorBase::in_list_t& ins,
                                            OperatorBase::out_list_t& outs) {
  // ins = (grad_output, rois), outs = (grad_input).
  const auto& grad_output = buildATensor(ctx, ins[0]);
  const auto& rois = buildATensor(ctx, ins[1]);
  auto grad_input = buildATensor(ctx, outs[0]);

  // Pull the pooling attributes out of the SSElement.
  int pooled_h;
  int pooled_w;
  float scale;
  int samples;
  bool align_flag;
  bool cw;
  SSAttrs(attr)
      .get<int>("pooled_height", pooled_h)
      .get<int>("pooled_width", pooled_w)
      .get<float>("spatial_scale", scale)
      .get<int>("sample_num", samples)
      .get<bool>("aligned", align_flag)
      .get<bool>("clockwise", cw)
      .done();

  roi_align_rotated_backward_cpu(grad_output, rois, grad_input, pooled_h,
                                 pooled_w, scale, samples, align_flag, cw);
}
// Register the forward op with parrots: two inputs (features, rois), one
// output (pooled features), plus the pooling attributes read by the adapters
// above. The CPU applier is always registered; the CUDA applier only when
// compiled with CUDA support.
PARROTS_EXTENSION_REGISTER(roi_align_rotated_forward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sample_num")
    .attr("aligned")
    .attr("clockwise")
    .input(2)
    .output(1)
    .apply(roi_align_rotated_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(roi_align_rotated_forward_cuda_parrots)
#endif
    .done();
// Register the backward op with parrots: two inputs (grad_output, rois), one
// output (grad_input), with the same attribute set as the forward op.
PARROTS_EXTENSION_REGISTER(roi_align_rotated_backward)
    .attr("pooled_height")
    .attr("pooled_width")
    .attr("spatial_scale")
    .attr("sample_num")
    .attr("aligned")
    .attr("clockwise")
    .input(2)
    .output(1)
    .apply(roi_align_rotated_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
    .apply(roi_align_rotated_backward_cuda_parrots)
#endif
    .done();
#ifndef ROI_ALIGN_ROTATED_PYTORCH_H
#define ROI_ALIGN_ROTATED_PYTORCH_H
#include <torch/extension.h>
using namespace at;

// Declarations of the device-specific rotated-RoIAlign implementations.
// Each ROI row is expected to be (batch_ind, center_x, center_y, w, h, theta),
// i.e. 6 values per ROI; theta is in radian.

#ifdef MMCV_WITH_CUDA
// CUDA forward: pools each rotated ROI from `features` into a fixed
// pooled_height x pooled_width grid written to `output`.
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
                                    int pooled_height, int pooled_width,
                                    float spatial_scale, int sample_num,
                                    bool aligned, bool clockwise);

// CUDA backward: scatters `grad_output` back into `bottom_grad`.
void roi_align_rotated_backward_cuda(Tensor grad_output, Tensor rois,
                                     Tensor bottom_grad, int pooled_height,
                                     int pooled_width, float spatial_scale,
                                     int sample_num, bool aligned,
                                     bool clockwise);
#endif

// CPU counterparts of the CUDA ops above.
void roi_align_rotated_forward_cpu(Tensor features, Tensor rois, Tensor output,
                                   int pooled_height, int pooled_width,
                                   float spatial_scale, int sample_num,
                                   bool aligned, bool clockwise);

void roi_align_rotated_backward_cpu(Tensor grad_output, Tensor rois,
                                    Tensor bottom_grad, int pooled_height,
                                    int pooled_width, float spatial_scale,
                                    int sample_num, bool aligned,
                                    bool clockwise);

#endif  // ROI_ALIGN_ROTATED_PYTORCH_H
......@@ -190,6 +190,16 @@ Tensor fused_bias_leakyrelu(const Tensor& input, const Tensor& bias,
const Tensor& refer, int act, int grad, float alpha,
float scale);
// Device-dispatching entry points for rotated RoIAlign; they route to the CPU
// or CUDA implementation based on the tensors' device.
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int pooled_height, int pooled_width,
                               float spatial_scale, int sample_num,
                               bool aligned, bool clockwise);

void roi_align_rotated_backward(Tensor grad_output, Tensor rois,
                                Tensor grad_input, int pooled_height,
                                int pooled_width, float spatial_scale,
                                int sample_num, bool aligned, bool clockwise);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)");
m.def("fused_bias_leakyrelu", &fused_bias_leakyrelu,
......@@ -381,4 +391,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes", py::arg("dets"),
py::arg("scores"), py::arg("order"), py::arg("dets_sorted"),
py::arg("iou_threshold"), py::arg("multi_label"));
m.def("roi_align_rotated_forward", &roi_align_rotated_forward,
"roi_align_rotated forward", py::arg("input"), py::arg("rois"),
py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"),
py::arg("spatial_scale"), py::arg("sample_num"), py::arg("aligned"),
py::arg("clockwise"));
m.def("roi_align_rotated_backward", &roi_align_rotated_backward,
"roi_align_rotated backward", py::arg("grad_output"), py::arg("rois"),
py::arg("grad_input"), py::arg("pooled_height"),
py::arg("pooled_width"), py::arg("spatial_scale"),
py::arg("sample_num"), py::arg("aligned"), py::arg("clockwise"));
}
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
// Launchers defined alongside the CUDA kernels; they dispatch over dtype and
// configure the grid before launching the kernels.
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int sample_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int aligned_height, const int aligned_width, at::Tensor output);

void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sample_ratio, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int aligned_height, const int aligned_width, at::Tensor bottom_grad);
// CUDA forward wrapper: validate the ROI layout, read the tensor geometry and
// hand everything to the kernel launcher.
void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output,
                                    int aligned_height, int aligned_width,
                                    float spatial_scale, int sample_ratio,
                                    bool aligned, bool clockwise) {
  // Each ROI row must be (batch_ind, cx, cy, w, h, theta) -> 6 values.
  if (rois.size(1) != 6) {
    AT_ERROR("wrong roi size");
  }
  const int n_rois = rois.size(0);
  const int n_channels = features.size(1);
  const int feat_h = features.size(2);
  const int feat_w = features.size(3);
  ROIAlignRotatedForwardCUDAKernelLauncher(
      features, rois, spatial_scale, sample_ratio, aligned, clockwise,
      n_channels, feat_h, feat_w, n_rois, aligned_height, aligned_width,
      output);
}
// CUDA backward wrapper: validate the ROI layout, read the gradient geometry
// and hand everything to the kernel launcher.
void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois,
                                     Tensor bottom_grad, int aligned_height,
                                     int aligned_width, float spatial_scale,
                                     int sample_ratio, bool aligned,
                                     bool clockwise) {
  // Each ROI row must be (batch_ind, cx, cy, w, h, theta) -> 6 values.
  if (rois.size(1) != 6) {
    AT_ERROR("wrong roi size");
  }
  const int n_rois = rois.size(0);
  const int n_channels = bottom_grad.size(1);
  const int grad_h = bottom_grad.size(2);
  const int grad_w = bottom_grad.size(3);
  ROIAlignRotatedBackwardCUDAKernelLauncher(
      top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise,
      n_channels, grad_h, grad_w, n_rois, aligned_height, aligned_width,
      bottom_grad);
}
#endif
// Launchers defined in the CPU translation unit; they dispatch over dtype and
// run the templated CPU kernels.
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       bool aligned, bool clockwise);

void ROIAlignRotatedBackwardCPULauncher(Tensor top_grad, Tensor rois,
                                        Tensor bottom_grad, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, bool aligned,
                                        bool clockwise);
// CPU forward wrapper: validate the ROI layout and delegate to the launcher.
void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output,
                                   int aligned_height, int aligned_width,
                                   float spatial_scale, int sampling_ratio,
                                   bool aligned, bool clockwise) {
  // Validate the ROI layout up front for parity with the CUDA and backward
  // paths: each row must be (batch_ind, cx, cy, w, h, theta) -> 6 values.
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height,
                                    aligned_width, spatial_scale,
                                    sampling_ratio, aligned, clockwise);
}
// CPU backward wrapper: validate the ROI layout and delegate to the launcher.
void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois,
                                    Tensor bottom_grad, int aligned_height,
                                    int aligned_width, float spatial_scale,
                                    int sampling_ratio, bool aligned,
                                    bool clockwise) {
  // Each ROI row must be (batch_ind, cx, cy, w, h, theta) -> 6 values.
  // (The unused num_rois local from the original was removed.)
  int size_rois = rois.size(1);
  if (size_rois != 6) {
    AT_ERROR("wrong roi size");
  }
  ROIAlignRotatedBackwardCPULauncher(
      top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
      sampling_ratio, aligned, clockwise);
}
// Device dispatcher for the forward pass: route to the CPU or CUDA
// implementation based on the input tensor's device.
void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output,
                               int aligned_height, int aligned_width,
                               float spatial_scale, int sampling_ratio,
                               bool aligned, bool clockwise) {
  const bool on_gpu = input.device().is_cuda();
  if (!on_gpu) {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(output);
    roi_align_rotated_forward_cpu(input, rois, output, aligned_height,
                                  aligned_width, spatial_scale, sampling_ratio,
                                  aligned, clockwise);
    return;
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(input);
  CHECK_CUDA_INPUT(rois);
  CHECK_CUDA_INPUT(output);
  roi_align_rotated_forward_cuda(input, rois, output, aligned_height,
                                 aligned_width, spatial_scale, sampling_ratio,
                                 aligned, clockwise);
#else
  AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
}
// Device dispatcher for the backward pass: route to the CPU or CUDA
// implementation based on the gradient tensor's device.
void roi_align_rotated_backward(Tensor top_grad, Tensor rois,
                                Tensor bottom_grad, int aligned_height,
                                int aligned_width, float spatial_scale,
                                int sampling_ratio, bool aligned,
                                bool clockwise) {
  const bool on_gpu = top_grad.device().is_cuda();
  if (!on_gpu) {
    CHECK_CPU_INPUT(top_grad);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(bottom_grad);
    roi_align_rotated_backward_cpu(top_grad, rois, bottom_grad, aligned_height,
                                   aligned_width, spatial_scale, sampling_ratio,
                                   aligned, clockwise);
    return;
  }
#ifdef MMCV_WITH_CUDA
  CHECK_CUDA_INPUT(top_grad);
  CHECK_CUDA_INPUT(rois);
  CHECK_CUDA_INPUT(bottom_grad);
  roi_align_rotated_backward_cuda(top_grad, rois, bottom_grad, aligned_height,
                                  aligned_width, spatial_scale, sampling_ratio,
                                  aligned, clockwise);
#else
  AT_ERROR("RoIAlignRotated is not compiled with GPU support");
#endif
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
// Precomputed bilinear-interpolation data for one sampling point.
template <typename T>
struct PreCalc {
  // Flattened (y * width + x) offsets of the four neighbouring pixels.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Corresponding bilinear weights; all zero when the sample falls outside
  // the feature map.
  T w1;
  T w2;
  T w3;
  T w4;
};
// Precompute, for every (ph, pw, iy, ix) sampling point of one rotated ROI,
// the four bilinear neighbour offsets and weights. The result is reused for
// all channels of that ROI, which is the key CPU-forward optimization.
// pre_calc must already be sized to
// pooled_height * pooled_width * iy_upper * ix_upper.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w,
    T cos_theta, T sin_theta, std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        // Sample location inside the (unrotated) bin, offset by half a cell.
        const T yy = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
                       static_cast<T>(ix + .5f) * bin_size_w /
                           static_cast<T>(roi_bin_grid_w);
          // Rotate by theta around the center and translate
          // In image space, (y, x) is the order for Right Handed System,
          // and this is essentially multiplying the point by a rotation matrix
          // to rotate it counterclockwise through angle theta.
          T y = yy * cos_theta - xx * sin_theta + roi_center_h;
          T x = yy * sin_theta + xx * cos_theta + roi_center_w;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // Out-of-bounds sample: store zero weights so it contributes
            // nothing to the pooled average.
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }
          // Clamp slight undershoot onto the first row/column.
          if (y < 0) {
            y = 0;
          }
          if (x < 0) {
            x = 0;
          }
          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;
          // Clamp to the last row/column; a point exactly on the border uses
          // a degenerate cell (low == high).
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }
          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }
          // Standard bilinear weights from the fractional parts.
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;
          pre_calc_index += 1;
        }
      }
    }
  }
}
// CPU forward for rotated RoIAlign: for each ROI, average bilinear samples
// inside every pooled bin. Bilinear offsets/weights are precomputed once per
// ROI and reused across channels. Each ROI row is
// (batch_ind, center_x, center_y, w, h, theta); theta is in radian.
template <typename T>
void ROIAlignRotatedForward(const int nthreads, const T* input,
                            const T& spatial_scale, const bool aligned,
                            const bool clockwise, const int channels,
                            const int height, const int width,
                            const int pooled_height, const int pooled_width,
                            const int sampling_ratio, const T* rois,
                            T* output) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlignRotated do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin; clamped to >= 1 so a
    // degenerate grid never divides by zero.
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    // we want to precalculate indices and weights shared by all channels,
    // this is the key point of optimization
    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                     pooled_width * pooled_height);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
        sin_theta, pre_calc);

    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      const T* offset_input =
          input + (roi_batch_ind * channels + c) * height * width;
      int pre_calc_index = 0;
      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;
          T output_val = 0.;
          // Accumulate the precomputed bilinear samples of this bin.
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_input[pc.pos1] +
                            pc.w2 * offset_input[pc.pos2] +
                            pc.w3 * offset_input[pc.pos3] +
                            pc.w4 * offset_input[pc.pos4];
              pre_calc_index += 1;
            }
          }
          output_val /= count;
          output[index] = output_val;
        }  // for pw
      }  // for ph
    }  // for c
  }  // for n
}
// Compute the four bilinear weights (w1..w4) and integer neighbour indices
// for the point (y, x). Out-of-range points (beyond a 1-pixel tolerance) get
// zero weights and -1 indices so the caller can skip them.
template <typename T>
void bilinear_interpolate_gradient(const int height, const int width, T y, T x,
                                   T& w1, T& w2, T& w3, T& w4, int& x_low,
                                   int& x_high, int& y_low, int& y_high) {
  // deal with cases that inverse elements are out of feature map boundary
  const bool out_of_range =
      (y < -1.0 || y > height || x < -1.0 || x > width);
  if (out_of_range) {
    w1 = w2 = w3 = w4 = 0.;
    x_low = x_high = y_low = y_high = -1;
    return;
  }

  // Clamp slight undershoot onto the first row/column.
  y = (y < 0) ? static_cast<T>(0) : y;
  x = (x < 0) ? static_cast<T>(0) : x;

  y_low = static_cast<int>(y);
  x_low = static_cast<int>(x);

  // Clamp to the last row/column; a border point gets a degenerate cell.
  if (y_low < height - 1) {
    y_high = y_low + 1;
  } else {
    y_low = height - 1;
    y_high = y_low;
    y = static_cast<T>(y_low);
  }
  if (x_low < width - 1) {
    x_high = x_low + 1;
  } else {
    x_low = width - 1;
    x_high = x_low;
    x = static_cast<T>(x_low);
  }

  // Standard bilinear weights from the fractional parts:
  //   val = w1*v(y_low,x_low) + w2*v(y_low,x_high)
  //       + w3*v(y_high,x_low) + w4*v(y_high,x_high)
  const T ly = y - y_low;
  const T lx = x - x_low;
  const T hy = static_cast<T>(1.) - ly;
  const T hx = static_cast<T>(1.) - lx;
  w1 = hy * hx;
  w2 = hy * lx;
  w3 = ly * hx;
  w4 = ly * lx;
}
// Accumulate val into *address. Plain += is sufficient here because the CPU
// backward pass is single threaded (the CUDA path uses atomicAdd instead).
template <class T>
inline void add(T* address, const T& val) {
  *address += val;
}
// CPU backward for rotated RoIAlign: for each pooled-gradient element
// (n, c, ph, pw), distribute its gradient over the four bilinear neighbours
// of every sampling point inside the bin.
// grad_output may be non-contiguous and is indexed via n/c/h/w strides;
// grad_input must be a zero-initialized contiguous (N, C, height, width)
// buffer. Each ROI row is (batch_ind, center_x, center_y, w, h, theta).
template <typename T>
void ROIAlignRotatedBackward(
    const int nthreads,
    // may not be contiguous. should index using n_stride, etc
    const T* grad_output, const T& spatial_scale, const bool aligned,
    const bool clockwise, const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, const int sampling_ratio,
    T* grad_input, const T* rois, const int n_stride, const int c_stride,
    const int h_stride, const int w_stride) {
  for (int index = 0; index < nthreads; index++) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* current_roi = rois + n * 6;
    int roi_batch_ind = current_roi[0];

    // Do not use rounding; this implementation detail is critical
    T offset = aligned ? (T)0.5 : (T)0.0;
    T roi_center_w = current_roi[1] * spatial_scale - offset;
    T roi_center_h = current_roi[2] * spatial_scale - offset;
    T roi_width = current_roi[3] * spatial_scale;
    T roi_height = current_roi[4] * spatial_scale;
    T theta = current_roi[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    T cos_theta = cos(theta);
    T sin_theta = sin(theta);

    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlignRotated do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
    }

    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    int output_offset = n * n_stride + c * c_stride;
    const T* offset_grad_output = grad_output + output_offset;
    const T grad_output_this_bin =
        offset_grad_output[ph * h_stride + pw * w_stride];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    T roi_start_h = -roi_height / 2.0;
    T roi_start_w = -roi_width / 2.0;

    // We do average (integral) pooling inside a bin. Clamp to >= 1 to match
    // the forward pass and avoid a division by zero when a degenerate
    // (zero-sized, aligned) ROI yields an empty sampling grid.
    const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T yy = roi_start_h + ph * bin_size_h +
                   static_cast<T>(iy + .5f) * bin_size_h /
                       static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T xx = roi_start_w + pw * bin_size_w +
                     static_cast<T>(ix + .5f) * bin_size_w /
                         static_cast<T>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        T y = yy * cos_theta - xx * sin_theta + roi_center_h;
        T x = yy * sin_theta + xx * cos_theta + roi_center_w;

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high);

        T g1 = grad_output_this_bin * w1 / count;
        T g2 = grad_output_this_bin * w2 / count;
        T g3 = grad_output_this_bin * w3 / count;
        T g4 = grad_output_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          // atomic add is not needed for now since it is single threaded
          add(offset_grad_input + y_low * width + x_low, static_cast<T>(g1));
          add(offset_grad_input + y_low * width + x_high, static_cast<T>(g2));
          add(offset_grad_input + y_high * width + x_low, static_cast<T>(g3));
          add(offset_grad_input + y_high * width + x_high, static_cast<T>(g4));
        }  // if
      }    // ix
    }      // iy
  }        // for
}  // ROIAlignRotatedBackward
// Dispatch over the input dtype and run the templated CPU forward kernel.
void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       bool aligned, bool clockwise) {
  const int out_numel = output.numel();
  const int n_channels = input.size(1);
  const int in_h = input.size(2);
  const int in_w = input.size(3);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlignRotated_forward", [&] {
        ROIAlignRotatedForward<scalar_t>(
            out_numel, input.data_ptr<scalar_t>(),
            static_cast<scalar_t>(spatial_scale), aligned, clockwise,
            n_channels, in_h, in_w, aligned_height, aligned_width,
            sampling_ratio, rois.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
}
// Dispatch over the gradient dtype and run the templated CPU backward kernel.
void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, bool aligned,
                                        bool clockwise) {
  // Total number of pooled-gradient elements; previously computed but unused
  // (the call re-evaluated grad_output.numel()).
  int output_size = grad_output.numel();
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  // get stride values to ensure indexing into gradients is correct.
  int n_stride = grad_output.stride(0);
  int c_stride = grad_output.stride(1);
  int h_stride = grad_output.stride(2);
  int w_stride = grad_output.stride(3);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "ROIAlignRotated_backward", [&] {
        ROIAlignRotatedBackward<scalar_t>(
            output_size, grad_output.data_ptr<scalar_t>(),
            static_cast<scalar_t>(spatial_scale), aligned, clockwise, channels,
            height, width, aligned_height, aligned_width, sampling_ratio,
            grad_input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
            n_stride, c_stride, h_stride, w_stride);
      });
}
#include "pytorch_cuda_helper.hpp"
#include "roi_align_rotated_cuda_kernel.cuh"
// Launch the rotated-RoIAlign forward kernel with one CUDA thread per pooled
// output element (num_rois * channels * pooled_height * pooled_width).
void ROIAlignRotatedForwardCUDAKernelLauncher(
    const at::Tensor features, const at::Tensor rois, const float spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor output) {
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  // scalar_type()/data_ptr() replace the deprecated type()/data() accessors.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      features.scalar_type(), "ROIAlignRotatedLauncherForward", ([&] {
        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *top_data = output.data_ptr<scalar_t>();
        roi_align_rotated_forward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, bottom_data, rois_data,
                static_cast<scalar_t>(spatial_scale), sample_num, aligned,
                clockwise, channels, height, width, pooled_height,
                pooled_width, top_data);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
// Launch the rotated-RoIAlign backward kernel: scatters top_grad into
// bottom_grad with one CUDA thread per pooled-gradient element.
// bottom_grad must be a zero-initialized (N, C, height, width) tensor.
void ROIAlignRotatedBackwardCUDAKernelLauncher(
    const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width, const int num_rois,
    const int pooled_height, const int pooled_width, at::Tensor bottom_grad) {
  // One thread per (roi, channel, ph, pw) output element.
  const int output_size = num_rois * pooled_height * pooled_width * channels;
  // scalar_type()/data_ptr() replace the deprecated type()/data() accessors.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      top_grad.scalar_type(), "ROIAlignRotatedLauncherBackward", ([&] {
        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
        roi_align_rotated_backward_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
                output_size, top_diff, rois_data,
                static_cast<scalar_t>(spatial_scale), sample_num, aligned,
                clockwise, channels, height, width, pooled_height,
                pooled_width, bottom_diff);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
#include <float.h>
#ifdef MMCV_WITH_TRT
#include "common_cuda_helper.hpp"
#else // MMCV_WITH_TRT
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else // MMCV_USE_PARROTS
#include "pytorch_cuda_helper.hpp"
#endif // MMCV_USE_PARROTS
#endif // MMCV_WITH_TRT
/*** Forward ***/
// CUDA forward kernel: one thread per pooled output element (n, c, ph, pw).
// Each thread averages roi_bin_grid_h x roi_bin_grid_w bilinear samples taken
// inside its rotated bin. ROI rows are (batch_ind, cx, cy, w, h, theta);
// theta is in radian.
template <typename scalar_t>
__global__ void roi_align_rotated_forward_cuda_kernel(
    const int nthreads, const scalar_t *bottom_data,
    const scalar_t *bottom_rois, const scalar_t spatial_scale,
    const int sample_num, const bool aligned, const bool clockwise,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, scalar_t *top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not using rounding; this implementation detail is critical
    scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    if (!aligned) {  // for backward-compatibility only
      // Force malformed ROIs to be 1x1
      roi_width = max(roi_width, (scalar_t)1.);
      roi_height = max(roi_height, (scalar_t)1.);
    }
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    const scalar_t *offset_bottom_data =
        bottom_data + (roi_batch_ind * channels + c) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sample_num > 0)
                             ? sample_num
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosscalar_theta = cos(theta);
    scalar_t sinscalar_theta = sin(theta);

    // We do average (integral) pooling inside a bin; clamped to >= 1 so a
    // degenerate grid never divides by zero.
    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    scalar_t output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);
        // Rotate by theta (counterclockwise) around the center and translate
        scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h;
        scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w;

        scalar_t val = bilinear_interpolate<scalar_t>(
            offset_bottom_data, height, width, y, x, index);
        output_val += val;
      }
    }
    output_val /= count;
    top_data[index] = output_val;
  }
}
/*** Backward ***/
// CUDA backward kernel: one thread per pooled-gradient element (n, c, ph, pw).
// Each thread distributes its gradient over the four bilinear neighbours of
// every sampling point inside its rotated bin via atomicAdd (multiple bins
// may touch the same input pixel). bottom_diff must be zero-initialized.
template <typename scalar_t>
__global__ void roi_align_rotated_backward_cuda_kernel(
    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
    const scalar_t spatial_scale, const int sample_num, const bool aligned,
    const bool clockwise, const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, scalar_t *bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not round
    scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0;
    scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset;
    scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset;
    scalar_t roi_width = offset_bottom_rois[3] * spatial_scale;
    scalar_t roi_height = offset_bottom_rois[4] * spatial_scale;
    // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0;
    scalar_t theta = offset_bottom_rois[5];
    if (clockwise) {
      theta = -theta;  // If clockwise, the angle needs to be reversed.
    }
    if (!aligned) {  // for backward-compatibility only
      // Force malformed ROIs to be 1x1
      roi_width = max(roi_width, (scalar_t)1.);
      roi_height = max(roi_height, (scalar_t)1.);
    }
    scalar_t bin_size_h = static_cast<scalar_t>(roi_height) /
                          static_cast<scalar_t>(pooled_height);
    scalar_t bin_size_w =
        static_cast<scalar_t>(roi_width) / static_cast<scalar_t>(pooled_width);

    scalar_t *offset_bottom_diff =
        bottom_diff + (roi_batch_ind * channels + c) * height * width;

    int top_offset = (n * channels + c) * pooled_height * pooled_width;
    const scalar_t *offset_top_diff = top_diff + top_offset;
    const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sample_num > 0)
                             ? sample_num
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width);

    // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
    // Appropriate translation needs to be applied after.
    scalar_t roi_start_h = -roi_height / 2.0;
    scalar_t roi_start_w = -roi_width / 2.0;
    scalar_t cosTheta = cos(theta);
    scalar_t sinTheta = sin(theta);

    // We do average (integral) pooling inside a bin. Clamp to >= 1 to match
    // the forward kernel and avoid a division by zero when a degenerate
    // (zero-sized, aligned) ROI yields an empty sampling grid.
    const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1);  // e.g. = 4

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {  // e.g., iy = 0, 1
      const scalar_t yy =
          roi_start_h + ph * bin_size_h +
          static_cast<scalar_t>(iy + .5f) * bin_size_h /
              static_cast<scalar_t>(roi_bin_grid_h);  // e.g., 0.5, 1.5
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const scalar_t xx = roi_start_w + pw * bin_size_w +
                            static_cast<scalar_t>(ix + .5f) * bin_size_w /
                                static_cast<scalar_t>(roi_bin_grid_w);

        // Rotate by theta around the center and translate
        scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h;
        scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w;

        scalar_t w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient<scalar_t>(height, width, y, x, w1, w2, w3,
                                                w4, x_low, x_high, y_low,
                                                y_high, index);

        scalar_t g1 = top_diff_this_bin * w1 / count;
        scalar_t g2 = top_diff_this_bin * w2 / count;
        scalar_t g3 = top_diff_this_bin * w3 / count;
        scalar_t g4 = top_diff_this_bin * w4 / count;

        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
          atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
          atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
          atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
        }  // if
      }    // ix
    }      // iy
  }        // CUDA_1D_KERNEL_LOOP
}  // RoIAlignBackward
#endif // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
import torch.nn as nn
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['roi_align_rotated_forward', 'roi_align_rotated_backward'])
class RoIAlignRotatedFunction(Function):
    """Autograd Function wrapping the roi_align_rotated C++/CUDA kernels."""

    @staticmethod
    def _parse_out_size(out_size):
        """Normalize ``out_size`` into an ``(out_h, out_w)`` pair.

        Args:
            out_size (int | tuple[int, int]): requested output spatial size.

        Returns:
            tuple[int, int]: (out_h, out_w).

        Raises:
            TypeError: if ``out_size`` is neither an int nor a 2-tuple of
                ints.
        """
        if isinstance(out_size, int):
            return out_size, out_size
        if isinstance(out_size, tuple):
            assert len(out_size) == 2
            assert isinstance(out_size[0], int)
            assert isinstance(out_size[1], int)
            return out_size
        raise TypeError('"out_size" must be an integer or tuple of integers')

    @staticmethod
    def symbolic(g, features, rois, out_size, spatial_scale, sample_num,
                 aligned, clockwise):
        """ONNX symbolic: emit an ``mmcv::MMCVRoIAlignRotated`` node."""
        out_h, out_w = RoIAlignRotatedFunction._parse_out_size(out_size)
        return g.op(
            'mmcv::MMCVRoIAlignRotated',
            features,
            rois,
            output_height_i=out_h,
            # Fix: was ``out_h``, which exported the wrong width attribute
            # for non-square output sizes.
            output_width_i=out_w,
            spatial_scale_f=spatial_scale,
            sampling_ratio_i=sample_num,
            aligned_i=aligned,
            clockwise_i=clockwise)

    @staticmethod
    def forward(ctx,
                features,
                rois,
                out_size,
                spatial_scale,
                sample_num=0,
                aligned=True,
                clockwise=False):
        """Pool rotated RoIs from ``features``.

        Args:
            features (Tensor): input feature map of shape (N, C, H, W).
            rois (Tensor): shape (n, 6); each row is (batch_index, center_x,
                center_y, w, h, angle) with the angle in radian.
            out_size (int | tuple[int, int]): output spatial size.
            spatial_scale (float): scale mapping roi coords to feature coords.
            sample_num (int): sampling points per output bin; 0 means sample
                densely based on the roi size.
            aligned (bool): if True, use the exactly-aligned coordinate
                convention (detectron2 style). Default: True.
            clockwise (bool): if True, angles are clockwise in image space.
                Default: False.

        Returns:
            Tensor: pooled features of shape (n, C, out_h, out_w).
        """
        out_h, out_w = RoIAlignRotatedFunction._parse_out_size(out_size)
        # Stash everything backward() needs.
        ctx.spatial_scale = spatial_scale
        ctx.sample_num = sample_num
        ctx.aligned = aligned
        ctx.clockwise = clockwise
        ctx.save_for_backward(rois)
        ctx.feature_size = features.size()

        batch_size, num_channels, data_height, data_width = features.size()
        num_rois = rois.size(0)

        output = features.new_zeros(num_rois, num_channels, out_h, out_w)
        ext_module.roi_align_rotated_forward(
            features,
            rois,
            output,
            pooled_height=out_h,
            pooled_width=out_w,
            spatial_scale=spatial_scale,
            sample_num=sample_num,
            aligned=aligned,
            clockwise=clockwise)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Backpropagate ``grad_output`` to the input feature map.

        Returns a gradient only for ``features``; rois and all keyword-like
        positional arguments get ``None``.
        """
        feature_size = ctx.feature_size
        spatial_scale = ctx.spatial_scale
        aligned = ctx.aligned
        clockwise = ctx.clockwise
        sample_num = ctx.sample_num
        rois = ctx.saved_tensors[0]
        assert feature_size is not None

        batch_size, num_channels, data_height, data_width = feature_size
        out_w = grad_output.size(3)
        out_h = grad_output.size(2)

        grad_input = grad_rois = None
        if ctx.needs_input_grad[0]:
            grad_input = rois.new_zeros(batch_size, num_channels, data_height,
                                        data_width)
            ext_module.roi_align_rotated_backward(
                grad_output.contiguous(),
                rois,
                grad_input,
                pooled_height=out_h,
                pooled_width=out_w,
                spatial_scale=spatial_scale,
                sample_num=sample_num,
                aligned=aligned,
                clockwise=clockwise)
        return grad_input, grad_rois, None, None, None, None, None
roi_align_rotated = RoIAlignRotatedFunction.apply
class RoIAlignRotated(nn.Module):
    """RoI align pooling layer for rotated proposals.

    Accepts a feature map of shape (N, C, H, W) together with rois of shape
    (n, 6), where each roi is decoded as (batch_index, center_x, center_y,
    w, h, angle) and the angle is expressed in radian.

    Args:
        out_size (tuple): output size as (h, w).
        spatial_scale (float): factor used to rescale input boxes.
        sample_num (int): number of samples taken per output value; 0 means
            sample densely for current models.
        aligned (bool): if False, fall back to the legacy MMDetection
            behavior. If True, align results more perfectly. Default: True.
        clockwise (bool): If True, the angle in each proposal follows a
            clockwise fashion in image space, otherwise, the angle is
            counterclockwise. Default: False.

    Note:
        The aligned=True implementation is adapted from
        https://github.com/facebookresearch/detectron2/

        With aligned=True, a continuous coordinate c maps to pixel
        neighbors floor(c - 0.5) and ceil(c - 0.5); e.g. c=1.3 interpolates
        between discrete indices [0] and [1] (sampled from the signal at
        continuous coordinates 0.5 and 1.5). The legacy implementation
        (aligned=False) omits the 0.5 shift, so bilinear interpolation uses
        pixels at a slightly wrong alignment relative to this pixel model.
        Concretely, aligned=True scales the ROI appropriately and shifts it
        by -0.5 before pooling, which yields the correct neighbors. The
        difference does not affect model performance when ROIAlign is
        combined with conv layers.
    """

    def __init__(self,
                 out_size,
                 spatial_scale,
                 sample_num=0,
                 aligned=True,
                 clockwise=False):
        super(RoIAlignRotated, self).__init__()
        # Normalize numeric options eagerly; booleans are stored as given.
        self.spatial_scale = float(spatial_scale)
        self.sample_num = int(sample_num)
        self.out_size = out_size
        self.clockwise = clockwise
        self.aligned = aligned

    def forward(self, features, rois):
        # Delegate to the autograd Function wrapping the compiled kernels.
        return RoIAlignRotatedFunction.apply(features, rois, self.out_size,
                                             self.spatial_scale,
                                             self.sample_num, self.aligned,
                                             self.clockwise)
......@@ -280,6 +280,85 @@ def test_roialign():
assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
def test_roialign_rotated():
    """Verify the ONNX export of roi_align_rotated against eager PyTorch.

    Exports a wrapped ``roi_align_rotated`` call through the
    ``mmcv::MMCVRoIAlignRotated`` symbolic, runs the graph with the custom
    onnxruntime op library, and compares against the PyTorch result on
    several small hand-crafted cases.
    """
    if torch.__version__ == 'parrots':
        pytest.skip('onnx is not supported in parrots directly')
    try:
        from mmcv.ops import roi_align_rotated
        from mmcv.ops import get_onnxruntime_op_path
    except (ImportError, ModuleNotFoundError):
        # Fixed message: this test covers roi_align_rotated (the original
        # said "roi_align_aligned").
        pytest.skip('roi_align_rotated op is not successfully compiled')

    ort_custom_op_path = get_onnxruntime_op_path()
    if not os.path.exists(ort_custom_op_path):
        pytest.skip('custom ops for onnxruntime are not compiled.')

    # roi align config
    pool_h = 2
    pool_w = 2
    spatial_scale = 1.0
    sampling_ratio = 2

    # Each case is (features, rois); a roi is decoded as
    # (batch_index, center_x, center_y, w, h, angle-in-radian).
    inputs = [([[[[1., 2.], [3., 4.]]]], [[0., 0.5, 0.5, 1., 1., 0]]),
              ([[[[1., 2.], [3., 4.]]]], [[0., 0.5, 0.5, 1., 1., np.pi / 2]]),
              ([[[[1., 2.], [3., 4.]],
                 [[4., 3.], [2., 1.]]]], [[0., 0.5, 0.5, 1., 1., 0]]),
              ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
                  [11., 12., 15., 16.]]]], [[0., 1.5, 1.5, 3., 3., 0]]),
              ([[[[1., 2., 5., 6.], [3., 4., 7., 8.], [9., 10., 13., 14.],
                  [11., 12., 15., 16.]]]], [[0., 1.5, 1.5, 3., 3.,
                                             np.pi / 2]])]

    def wrapped_function(torch_input, torch_rois):
        # aligned=True, clockwise=False — must match the eager call below.
        return roi_align_rotated(torch_input, torch_rois, (pool_w, pool_h),
                                 spatial_scale, sampling_ratio, True, False)

    for case in inputs:
        np_input = np.array(case[0], dtype=np.float32)
        np_rois = np.array(case[1], dtype=np.float32)
        input = torch.from_numpy(np_input)
        rois = torch.from_numpy(np_rois)

        # compute pytorch_output
        with torch.no_grad():
            pytorch_output = roi_align_rotated(input, rois, (pool_w, pool_h),
                                               spatial_scale, sampling_ratio,
                                               True, False)

        # export and load onnx model
        wrapped_model = WrapFunction(wrapped_function)
        with torch.no_grad():
            torch.onnx.export(
                wrapped_model, (input, rois),
                onnx_file,
                export_params=True,
                keep_initializers_as_inputs=True,
                input_names=['features', 'rois'],
                opset_version=11)

        onnx_model = onnx.load(onnx_file)
        session_options = rt.SessionOptions()
        # The path was already validated by the skip guard above, so the
        # library can be registered unconditionally.
        session_options.register_custom_ops_library(ort_custom_op_path)

        # compute onnx_output; feed only the true graph inputs
        input_all = [node.name for node in onnx_model.graph.input]
        input_initializer = [
            node.name for node in onnx_model.graph.initializer
        ]
        net_feed_input = list(set(input_all) - set(input_initializer))
        assert (len(net_feed_input) == 2)
        sess = rt.InferenceSession(onnx_file, session_options)
        onnx_output = sess.run(None, {
            'features': input.detach().numpy(),
            'rois': rois.detach().numpy()
        })
        onnx_output = onnx_output[0]

        # allclose
        os.remove(onnx_file)
        assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
@pytest.mark.skipif(not torch.cuda.is_available(), reason='test requires GPU')
def test_roipool():
if torch.__version__ == 'parrots':
......
import numpy as np
import pytest
import torch
# Prefer parrots' gradcheck when the parrots framework is available;
# otherwise fall back to PyTorch's implementation.
_USING_PARROTS = True
try:
    from parrots.autograd import gradcheck
except ImportError:
    from torch.autograd import gradcheck
    _USING_PARROTS = False

# Test fixtures shared by the gradcheck and allclose helpers below.
# Each ``inputs`` entry is (feature map, rois), where a roi is decoded as
# (batch_index, center_x, center_y, w, h, angle-in-radian). Each ``outputs``
# entry is the matching (expected pooled output, expected input gradient).
# yapf:disable
inputs = [([[[[1., 2.], [3., 4.]]]],
           [[0., 0.5, 0.5, 1., 1., 0]]),
          ([[[[1., 2.], [3., 4.]]]],
           [[0., 0.5, 0.5, 1., 1., np.pi / 2]]),
          ([[[[1., 2.], [3., 4.]],
             [[4., 3.], [2., 1.]]]],
           [[0., 0.5, 0.5, 1., 1., 0]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],
              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],
           [[0., 1.5, 1.5, 3., 3., 0]]),
          ([[[[1., 2., 5., 6.], [3., 4., 7., 8.],
              [9., 10., 13., 14.], [11., 12., 15., 16.]]]],
           [[0., 1.5, 1.5, 3., 3., np.pi / 2]])]
outputs = [([[[[1.0, 1.25], [1.5, 1.75]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.5, 1], [1.75, 1.25]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.0, 1.25], [1.5, 1.75]],
              [[4.0, 3.75], [3.5, 3.25]]]],
            [[[[3.0625, 0.4375], [0.4375, 0.0625]],
              [[3.0625, 0.4375], [0.4375, 0.0625]]]]),
           ([[[[1.9375, 4.75], [7.5625, 10.375]]]],
            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]]),
           ([[[[7.5625, 1.9375], [10.375, 4.75]]]],
            [[[[0.47265625, 0.42968750, 0.42968750, 0.04296875],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.42968750, 0.39062500, 0.39062500, 0.03906250],
               [0.04296875, 0.03906250, 0.03906250, 0.00390625]]]])]
# yapf:enable

# Pooling configuration used by every test case in this file.
pool_h = 2
pool_w = 2
spatial_scale = 1.0
sampling_ratio = 2
def _test_roialign_rotated_gradcheck(device, dtype):
    """Run autograd gradcheck for RoIAlignRotated on the shared fixtures.

    Args:
        device (str): 'cpu' or 'cuda'; cuda cases are skipped when no GPU
            is available.
        dtype (torch.dtype): dtype to test; fp16 is skipped because
            gradcheck is numerically unreliable in half precision.
    """
    if not torch.cuda.is_available() and device == 'cuda':
        pytest.skip('unittest does not support GPU yet.')
    try:
        from mmcv.ops import RoIAlignRotated
    except (ImportError, ModuleNotFoundError):
        # Consistent with the other op tests: a plain ImportError from a
        # partially compiled extension should skip, not error out.
        pytest.skip('RoIAlignRotated op is not successfully compiled')
    if dtype is torch.half:
        pytest.skip('grad check does not support fp16')
    for case in inputs:
        np_input = np.array(case[0])
        np_rois = np.array(case[1])

        x = torch.tensor(
            np_input, dtype=dtype, device=device, requires_grad=True)
        rois = torch.tensor(np_rois, dtype=dtype, device=device)

        froipool = RoIAlignRotated((pool_h, pool_w), spatial_scale,
                                   sampling_ratio)

        if torch.__version__ == 'parrots':
            # parrots' gradcheck has a different signature from PyTorch's.
            gradcheck(
                froipool, (x, rois), no_grads=[rois], delta=1e-5, pt_atol=1e-5)
        else:
            gradcheck(froipool, (x, rois), eps=1e-5, atol=1e-5)
def _test_roialign_rotated_allclose(device, dtype):
    """Compare RoIAlignRotated forward/backward with precomputed values.

    Runs roi_align_rotated over the shared ``inputs``/``outputs`` fixtures
    and checks both the pooled output and the input gradient (via a
    backward pass with an all-ones upstream gradient) in float32 with
    atol=1e-3.

    Args:
        device (str): 'cpu' or 'cuda'; cuda cases are skipped without a GPU.
        dtype (torch.dtype): dtype to run the op in.
    """
    if not torch.cuda.is_available() and device == 'cuda':
        pytest.skip('unittest does not support GPU yet.')
    try:
        from mmcv.ops import roi_align_rotated
    except (ImportError, ModuleNotFoundError):
        # Consistent with the other op tests: skip on plain ImportError too.
        pytest.skip('test requires compilation')
    # Uses the module-level pooling config (pool_h, pool_w, spatial_scale,
    # sampling_ratio) shared with the gradcheck helper; the previous local
    # copies duplicated those values verbatim.
    for case, output in zip(inputs, outputs):
        np_input = np.array(case[0])
        np_rois = np.array(case[1])
        np_output = np.array(output[0])
        np_grad = np.array(output[1])

        x = torch.tensor(
            np_input, dtype=dtype, device=device, requires_grad=True)
        rois = torch.tensor(np_rois, dtype=dtype, device=device)

        output = roi_align_rotated(x, rois, (pool_h, pool_w), spatial_scale,
                                   sampling_ratio, True)
        output.backward(torch.ones_like(output))
        assert np.allclose(
            output.data.type(torch.float).cpu().numpy(), np_output, atol=1e-3)
        assert np.allclose(
            x.grad.data.type(torch.float).cpu().numpy(), np_grad, atol=1e-3)
@pytest.mark.parametrize('device', ['cuda', 'cpu'])
@pytest.mark.parametrize('dtype', [torch.float, torch.double, torch.half])
def test_roialign_rotated(device, dtype):
    """Entry point: run gradcheck (double only) and value checks."""
    # gradcheck is expensive and needs high precision, so it only runs for
    # torch.double; the allclose comparison runs for every dtype.
    if dtype is torch.double:
        _test_roialign_rotated_gradcheck(device=device, dtype=dtype)
    _test_roialign_rotated_allclose(device=device, dtype=dtype)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment