"vscode:/vscode.git/clone" did not exist on "661047f119c82324fcda3dbe11eaf5b7e233a359"
Commit 6f3c5f1c authored by limm's avatar limm
Browse files

support v1.4.0

parent 6f674c7e
// Copyright (c) OpenMMLab. All rights reserved
#include "onnxruntime_register.h"
#include "corner_pool.h"
#include "deform_conv.h"
#include "grid_sample.h"
#include "modulated_deform_conv.h"
#include "nms.h"
#include "ort_mmcv_utils.h"
#include "reduce_ops.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "soft_nms.h"
const char *c_MMCVOpDomain = "mmcv";
SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
GridSampleOp c_GridSampleOp;
MMCVCumMaxCustomOp c_MMCVCumMaxCustomOp;
MMCVCumMinCustomOp c_MMCVCumMinCustomOp;
MMCVCornerPoolCustomOp c_MMCVCornerPoolCustomOp;
MMCVModulatedDeformConvOp c_MMCVModulatedDeformConvOp;
MMCVDeformConvOp c_MMCVDeformConvOp;
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api) {
OrtCustomOpDomain *domain = nullptr;
const OrtApi *ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain(c_MMCVOpDomain, &domain)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_NmsOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoIAlignRotatedCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVCornerPoolCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMaxCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMinCustomOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVModulatedDeformConvOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVDeformConvOp)) {
return status;
}
return ortApi->AddCustomOpDomain(options, domain);
}
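// Usage sketch: a minimal, hedged example of how an application that links this
// library could register the "mmcv" custom-op domain before building a session.
// The function name and the model path "end2end.onnx" are placeholders.
#include <onnxruntime_cxx_api.h>
int RunWithMMCVCustomOps() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmcv_ort_demo");
  Ort::SessionOptions session_options;
  // Register NMS / SoftNMS / RoIAlign / grid_sampler / ... so that nodes in the
  // "mmcv" domain can be resolved when the graph is loaded.
  if (OrtStatus *status = RegisterCustomOps(session_options, OrtGetApiBase())) {
    Ort::GetApi().ReleaseStatus(status);
    return -1;
  }
  Ort::Session session(env, "end2end.onnx", session_options);
  return 0;
}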
// Copyright (c) OpenMMLab. All rights reserved
#include "reduce_ops.h"
#include <assert.h>
#include <vector>
#include "../ort_mmcv_utils.h"
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t ndims) {
int64_t min = -ndims;
int64_t max = ndims - 1;
assert(dim >= min && dim <= max);
if (dim < 0) dim += ndims;
return dim;
}
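// e.g. maybe_wrap_dim(-1, 4) == 3 and maybe_wrap_dim(2, 4) == 2; any dim
// outside [-4, 3] trips the assert above.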
static inline int64_t get_dim_stride(const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod) {
return dim == ndims - 1 ? 1 : reversed_dim_cumprod[dim + 1];
}
static inline int64_t get_dim_size(const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod) {
return dim == ndims - 1
? reversed_dim_cumprod[dim]
: reversed_dim_cumprod[dim] / reversed_dim_cumprod[dim + 1];
}
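// Worked example: for a contiguous tensor of shape (2, 3, 4),
// reversed_dim_cumprod = {24, 12, 4}, so get_dim_stride returns 12, 4, 1 and
// get_dim_size returns 2, 3, 4 for dim = 0, 1, 2 respectively.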
template <typename T1, typename T2, typename Operation>
void cummax_cummin_helper(const T1 *input, T1 *output, T2 *indices,
const int64_t input_dim_size, const int64_t stride) {
Operation op;
T1 out = input[0];
int64_t idx = 0;
for (int64_t i = 0; i < input_dim_size; i++) {
T1 curr_elem = input[i * stride];
if (op(curr_elem, out)) {
out = curr_elem;
idx = i;
}
output[i * stride] = out;
indices[i * stride] = idx;
}
}
// modified `tensor_dim_apply3` from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorDimApply.h.
// the difference is that: (1) use `reversed_dim_cumprod` for fast computing of
// tensor `size` and `stride`. (2) the same `stride` is used for input, output,
and indices, since separate values are unnecessary. Currently
`tensor_dim_apply3` is only used for `cummax` and `cummin`, following the
official pytorch project: https://github.com/pytorch/pytorch.
template <typename T1, typename T2, typename Function>
void tensor_dim_apply3(const T1 *input, T1 *output, T2 *indices,
const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod, Function func) {
int dim_apply_finished = 0;
int64_t input_dim_size = get_dim_size(dim, ndims, reversed_dim_cumprod);
// the same stride is used for input, output and indices
int64_t stride = get_dim_stride(dim, ndims, reversed_dim_cumprod);
std::vector<int64_t> counter(ndims, 0);
while (!dim_apply_finished) {
// call `func` once to update output and indices
func(input, output, indices, input_dim_size, stride);
if (ndims == 1) break;
for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
if (dim_i == dim) {
if (dim_i == (ndims - 1)) {
dim_apply_finished = 1;
break;
}
continue;
}
counter[dim_i]++;
// the same stride is used for input, output, and indices
int64_t stride_dim_i = get_dim_stride(dim_i, ndims, reversed_dim_cumprod);
input += stride_dim_i;
output += stride_dim_i;
indices += stride_dim_i;
if (counter[dim_i] == get_dim_size(dim_i, ndims, reversed_dim_cumprod)) {
if (dim_i == ndims - 1) {
dim_apply_finished = 1;
break;
} else {
input -= counter[dim_i] * stride_dim_i;
output -= counter[dim_i] * stride_dim_i;
indices -= counter[dim_i] * stride_dim_i;
counter[dim_i] = 0;
}
} else {
break;
} // if
} // for
} // while
}
template <typename T1, typename T2, typename Operation>
void CumMax_CumMin_CPU(const T1 *input, T1 *output, T2 *indices,
int64_t *reversed_dim_cumprod, const int64_t dim,
const OrtTensorDimensions &out_dimensions) {
// calculate numel
const int64_t ndims = out_dimensions.size();
int64_t numel = 1;
for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
numel *= out_dimensions.data()[dim_i];
}
// cummax/cummin are only applied when the input is non-empty and has non-zero dim
if (numel) {
// compute the reversed cumulative product of the dimension sizes,
// which is then used to compute the stride or size of a specific `dim`.
reversed_dim_cumprod[ndims - 1] = out_dimensions.data()[ndims - 1];
for (int64_t dim_i = ndims - 2; dim_i >= 0; dim_i--) {
reversed_dim_cumprod[dim_i] =
reversed_dim_cumprod[dim_i + 1] * out_dimensions.data()[dim_i];
}
// do cummax or cummin based on `Operation` type
tensor_dim_apply3<float, int64_t>(
input, output, indices, dim, ndims, reversed_dim_cumprod,
cummax_cummin_helper<float, int64_t, Operation>);
}
}
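// Worked example of the cummax path (Operation = std::greater_equal<float>):
// for the 2x3 row-major input [[1, 3, 2], [4, 0, 5]] and dim = 1, the kernel
// writes output  = [[1, 3, 3], [4, 4, 5]] and
//        indices = [[0, 1, 1], [0, 0, 2]], matching torch.cummax along dim 1.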
void MMCVCumMaxKernel::Compute(OrtKernelContext *context) {
// get input
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
// get output
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *output_data = ort_.GetTensorMutableData<float>(output);
OrtValue *indices = ort_.KernelContext_GetOutput(
context, 1, out_dimensions.data(), out_dimensions.size());
int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);
// allocate tmp memory for the reversed cumulative product of the dimension
// sizes
const int64_t ndims = out_dimensions.size();
assert(ndims > 0);
int64_t *reversed_dim_cumprod =
(int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);
// dim should be wrapped if it's negative (e.g. -1)
const int64_t dim = maybe_wrap_dim(dim_, ndims);
CumMax_CumMin_CPU<float, int64_t, std::greater_equal<float>>(
input_data, output_data, indices_data, reversed_dim_cumprod, dim,
out_dimensions);
}
void MMCVCumMinKernel::Compute(OrtKernelContext *context) {
// get input
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
// get output
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *output_data = ort_.GetTensorMutableData<float>(output);
OrtValue *indices = ort_.KernelContext_GetOutput(
context, 1, out_dimensions.data(), out_dimensions.size());
int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);
// allocate tmp memory for the reversed cumulative product of the dimension
// sizes
const int64_t ndims = out_dimensions.size();
assert(ndims > 0);
int64_t *reversed_dim_cumprod =
(int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);
// dim should be wrapped if it's negative (e.g. -1)
const int64_t dim = maybe_wrap_dim(dim_, ndims);
CumMax_CumMin_CPU<float, int64_t, std::less_equal<float>>(
input_data, output_data, indices_data, reversed_dim_cumprod, dim,
out_dimensions);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "roi_align.h"
#include "../ort_mmcv_utils.h"
// implementation taken from Caffe2
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy =
roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
float x = xx;
float y = yy;
// handle sampling points that fall outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
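// Worked example of the bilinear weights above: a sampling point
// (y, x) = (1.3, 2.6) gives y_low = 1, x_low = 2, ly = 0.3, lx = 0.6, hence
// w1 = 0.7 * 0.4 = 0.28, w2 = 0.7 * 0.6 = 0.42, w3 = 0.3 * 0.4 = 0.12,
// w4 = 0.3 * 0.6 = 0.18, and the four weights always sum to 1.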
void ROIAlignForwardCPU(const int nthreads, const float *input,
const float *rois, float *output, float *argmax_y,
float *argmax_x, const int pooled_height,
const int pooled_width, const float spatial_scale,
const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels,
const int height, const int width) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_start_w = offset_rois[1] * spatial_scale - offset;
float roi_start_h = offset_rois[2] * spatial_scale - offset;
float roi_end_w = offset_rois[3] * spatial_scale - offset;
float roi_end_h = offset_rois[4] * spatial_scale - offset;
float roi_width = roi_end_w - roi_start_w;
float roi_height = roi_end_h - roi_start_h;
if (aligned) {
/*AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign cannot have negative size!");*/
assert(roi_width >= 0 && roi_height >= 0);
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// When the grid is empty, count is clamped to 1 so the output is 0 instead of NaN.
const float count =
std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
float maxval = -10000;
float maxidx_y = -1.f, maxidx_x = -1.f;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const float y = roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const float x = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
PreCalc pc = pre_calc[pre_calc_index];
float val = pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
if (val > maxval) {
maxval = val;
maxidx_y = y;
maxidx_x = x;
}
output_val += val;
pre_calc_index += 1;
}
}
if (pool_mode == 0) {
// We do max pooling inside a bin
output[index] = maxval;
argmax_y[index] = maxidx_y;
argmax_x[index] = maxidx_x;
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
output[index] = output_val / count;
} // if
} // for pw
} // for ph
} // for c
} // for n
}
void MMCVRoiAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// TODO: forward here
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
int poolMod = 1;
if (pool_mode_ == "max") poolMod = 0;
float *argmax_x = nullptr, *argmax_y = nullptr;
if (poolMod == 0) {
argmax_y = new float[output_size];
argmax_x = new float[output_size];
}
ROIAlignForwardCPU(output_size, X_data, rois, out, argmax_y, argmax_x,
aligned_height_, aligned_width_, spatial_scale_,
sampling_ratio_, poolMod, aligned_, input_channels,
input_height, input_width);
if (argmax_x) delete[] argmax_x;
if (argmax_y) delete[] argmax_y;
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "roi_align_rotated.h"
#include "../ort_mmcv_utils.h"
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h,
float roi_center_w, float cos_theta, float sin_theta,
std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy =
roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
float y = yy * cos_theta - xx * sin_theta + roi_center_h;
float x = yy * sin_theta + xx * cos_theta + roi_center_w;
// handle sampling points that fall outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
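// Rotation sanity check for the mapping above: with theta = pi / 2
// (cos_theta = 0, sin_theta = 1), a bin offset (yy, xx) maps to
// y = -xx + roi_center_h and x = yy + roi_center_w, i.e. the sampling grid is
// rotated by 90 degrees about the RoI center before the bilinear lookup.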
void ROIAlignRotatedForwardCPU(const int nthreads, const float *input,
const float *rois, float *output,
const float &spatial_scale, const int aligned,
const int clockwise, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_center_w = current_roi[1] * spatial_scale - offset;
float roi_center_h = current_roi[2] * spatial_scale - offset;
float roi_width = current_roi[3] * spatial_scale;
float roi_height = current_roi[4] * spatial_scale;
// float theta = current_roi[5] * M_PI / 180.0;
float theta = current_roi[5]; // Radian angle by default
if (clockwise) {
theta = -theta;
}
float cos_theta = cos(theta);
float sin_theta = sin(theta);
if (!aligned) { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const float count =
std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
float roi_start_h = -roi_height / 2.0;
float roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// TODO: forward here
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_,
aligned_, clockwise_, input_channels, input_height,
input_width, aligned_height_, aligned_width_,
sampling_ratio_);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "soft_nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include "../ort_mmcv_utils.h"
SoftNmsKernel::SoftNmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
sigma_ = ort_.KernelInfoGetAttribute<float>(info, "sigma");
min_score_ = ort_.KernelInfoGetAttribute<float>(info, "min_score");
method_ = ort_.KernelInfoGetAttribute<int64_t>(info, "method");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void SoftNmsKernel::Compute(OrtKernelContext *context) {
typedef float T;
const T iou_threshold = T(iou_threshold_);
const T sigma = T(sigma_);
const T min_score = T(min_score_);
const int method = int(method_);
const T offset = T(offset_);
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const T *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const T *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
T *tmp_boxes = (T *)allocator_.Alloc(sizeof(T) * nboxes * 4);
T *x1 = tmp_boxes;
T *y1 = tmp_boxes + 1;
T *x2 = tmp_boxes + 2;
T *y2 = tmp_boxes + 3;
T *sc = (T *)allocator_.Alloc(sizeof(T) * nboxes);
T *areas = (T *)allocator_.Alloc(sizeof(T) * nboxes);
T *de = (T *)allocator_.Alloc(sizeof(T) * nboxes * 5);
int64_t *inds = (int64_t *)allocator_.Alloc(sizeof(int64_t) * nboxes);
memcpy(tmp_boxes, boxes_data, sizeof(T) * nboxes * 4);
memcpy(sc, scores_data, sizeof(T) * nboxes);
// init inds as arange(nboxes)
std::generate(inds, inds + nboxes, [n = 0]() mutable { return n++; });
// area = (x2-x1+offset)*(y2-y1+offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] =
(x2[i * 4] - x1[i * 4] + offset) * (y2[i * 4] - y1[i * 4] + offset);
}
int64_t pos = 0;
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos * 4];
auto iy1 = de[i * 5 + 1] = y1[max_pos * 4];
auto ix2 = de[i * 5 + 2] = x2[max_pos * 4];
auto iy2 = de[i * 5 + 3] = y2[max_pos * 4];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos * 4] = x1[i * 4];
y1[max_pos * 4] = y1[i * 4];
x2[max_pos * 4] = x2[i * 4];
y2[max_pos * 4] = y2[i * 4];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i * 4] = ix1;
y1[i * 4] = iy1;
x2[i * 4] = ix2;
y2[i * 4] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos * 4]);
auto yy1 = std::max(iy1, y1[pos * 4]);
auto xx2 = std::min(ix2, x2[pos * 4]);
auto yy2 = std::min(iy2, y2[pos * 4]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
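// The three `method` values map to the classic NMS variants:
//   method == 0: hard NMS      -> weight = 0       when IoU >= iou_threshold
//   method == 1: linear soft   -> weight = 1 - IoU when IoU >= iou_threshold
//   method == 2: gaussian soft -> weight = exp(-IoU^2 / sigma)
// e.g. with IoU = 0.6, iou_threshold = 0.5 and sigma = 0.5, the score below is
// scaled by 0, 0.4 and exp(-0.72) ~= 0.49, respectively.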
sc[pos] *= weight;
// if the box score falls below the threshold, discard it by swapping it
// with the last box and shrinking N
if (sc[pos] < min_score) {
x1[pos * 4] = x1[(nboxes - 1) * 4];
y1[pos * 4] = y1[(nboxes - 1) * 4];
x2[pos * 4] = x2[(nboxes - 1) * 4];
y2[pos * 4] = y2[(nboxes - 1) * 4];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
std::vector<int64_t> dets_dim({nboxes, 5});
OrtValue *dets = ort_.KernelContext_GetOutput(context, 0, dets_dim.data(),
dets_dim.size());
T *dets_data = ort_.GetTensorMutableData<T>(dets);
std::vector<int64_t> inds_dim({nboxes});
OrtValue *inds_ov = ort_.KernelContext_GetOutput(context, 1, inds_dim.data(),
inds_dim.size());
int64_t *inds_data = ort_.GetTensorMutableData<int64_t>(inds_ov);
memcpy(dets_data, de, sizeof(T) * nboxes * 5);
memcpy(inds_data, inds, sizeof(int64_t) * nboxes);
}
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_DEFORM_CONV_H
#define ONNXRUNTIME_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
struct MMCVDeformConvKernel {
MMCVDeformConvKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
int64_t im2col_step_;
};
struct MMCVDeformConvOp
: Ort::CustomOpBase<MMCVDeformConvOp, MMCVDeformConvKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new MMCVDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVDeformConv2d"; };
size_t GetInputTypeCount() const { return 3; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
size_t index) const {
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_GRIDSAMPLE_H
#define ONNXRUNTIME_GRIDSAMPLE_H
#include <onnxruntime_cxx_api.h>
struct GridSampleKernel {
GridSampleKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t align_corners_;
int64_t interpolation_mode_;
int64_t padding_mode_;
};
struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new GridSampleKernel(api, info);
};
const char *GetName() const { return "grid_sampler"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
struct MMCVModulatedDeformConvKernel {
MMCVModulatedDeformConvKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
};
struct MMCVModulatedDeformConvOp
: Ort::CustomOpBase<MMCVModulatedDeformConvOp,
MMCVModulatedDeformConvKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new MMCVModulatedDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVModulatedDeformConv2d"; };
size_t GetInputTypeCount() const { return 5; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
size_t index) const {
// The last input (index == 4) is optional, which is bias
if (index == 4)
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_NMS_H
#define ONNXRUNTIME_NMS_H
#include <onnxruntime_cxx_api.h>
struct NmsKernel {
NmsKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
int64_t offset_;
};
struct NmsOp : Ort::CustomOpBase<NmsOp, NmsKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new NmsKernel(api, info);
};
const char *GetName() const { return "NonMaxSuppression"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REGISTER_H
#define ONNXRUNTIME_REGISTER_H
#include <onnxruntime_c_api.h>
#ifdef __cplusplus
extern "C" {
#endif
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api);
#ifdef __cplusplus
}
#endif
#endif // ONNXRUNTIME_REGISTER_H
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
#define ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
/*
* This file defines SessionOptions Config Keys and format of the Config Values.
*
* The Naming Convention for a SessionOptions Config Key,
* "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
* Such as "ep.cuda.use_arena"
* The Config Key cannot be empty
* The maximum length of the Config Key is 128
*
* The string format of a SessionOptions Config Value is defined individually
* for each Config. The maximum length of the Config Value is 1024
*/
// Key for disable PrePacking,
// If the config value is set to "1" then the prepacking is disabled, otherwise
// prepacking is enabled (default value)
static const char* const kOrtSessionOptionsConfigDisablePrepacking =
"session.disable_prepacking";
// A value of "1" means allocators registered in the env will be used. "0" means
// the allocators created in the session will be used. Use this to override the
// usage of env allocators on a per session level.
static const char* const kOrtSessionOptionsConfigUseEnvAllocators =
"session.use_env_allocators";
// Set to 'ORT' (case sensitive) to load an ORT format model.
// If unset, model type will default to ONNX unless inferred from filename
// ('.ort' == ORT format) or bytes to be ORT
static const char* const kOrtSessionOptionsConfigLoadModelFormat =
"session.load_model_format";
// Set to 'ORT' (case sensitive) to save optimized model in ORT format when
// SessionOptions.optimized_model_path is set. If unset, format will default to
// ONNX unless optimized_model_filepath ends in '.ort'.
static const char* const kOrtSessionOptionsConfigSaveModelFormat =
"session.save_model_format";
#endif // ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ORT_MMCV_UTILS_H
#define ORT_MMCV_UTILS_H
#include <onnxruntime_cxx_api.h>
#include <vector>
struct OrtTensorDimensions : std::vector<int64_t> {
OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) {
OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
std::vector<int64_t>::operator=(ort.GetTensorShape(info));
ort.ReleaseTensorTypeAndShapeInfo(info);
}
};
#endif // ORT_MMCV_UTILS_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REDUCE_OPS_H
#define ONNXRUNTIME_REDUCE_OPS_H
#include <onnxruntime_cxx_api.h>
struct MMCVCumMaxKernel {
public:
MMCVCumMaxKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t dim_;
};
struct MMCVCumMinKernel {
public:
MMCVCumMinKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
dim_ = ort_.KernelInfoGetAttribute<int64_t>(info, "dim");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t dim_;
};
struct MMCVCumMaxCustomOp
: Ort::CustomOpBase<MMCVCumMaxCustomOp, MMCVCumMaxKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCumMaxKernel(api, info);
}
const char* GetName() const { return "cummax"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
struct MMCVCumMinCustomOp
: Ort::CustomOpBase<MMCVCumMinCustomOp, MMCVCumMinKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCumMinKernel(api, info);
}
const char* GetName() const { return "cummin"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif // ONNXRUNTIME_REDUCE_OPS_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_H
#define ONNXRUNTIME_ROI_ALIGN_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
struct MMCVRoiAlignKernel {
public:
MMCVRoiAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
aligned_height_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
pool_mode_ = ort_.KernelInfoGetAttribute<std::string>(info, "mode");
sampling_ratio_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
std::string pool_mode_;
int aligned_;
};
struct MMCVRoiAlignCustomOp
: Ort::CustomOpBase<MMCVRoiAlignCustomOp, MMCVRoiAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoiAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRoiAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROI_ALIGN_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
struct MMCVRoIAlignRotatedKernel {
public:
MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
aligned_height_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
sampling_ratio_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
int aligned_;
int clockwise_;
};
struct MMCVRoIAlignRotatedCustomOp
: Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp,
MMCVRoIAlignRotatedKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoIAlignRotatedKernel(api, info);
}
const char* GetName() const { return "MMCVRoIAlignRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_SOFT_NMS_H
#define ONNXRUNTIME_SOFT_NMS_H
#include <onnxruntime_cxx_api.h>
struct SoftNmsKernel {
SoftNmsKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
float sigma_;
float min_score_;
int64_t method_;
int64_t offset_;
};
struct SoftNmsOp : Ort::CustomOpBase<SoftNmsOp, SoftNmsKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new SoftNmsKernel(api, info);
};
const char *GetName() const { return "SoftNonMaxSuppression"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 2; };
ONNXTensorElementDataType GetOutputType(size_t index) const {
if (index == 1) {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif // ONNXRUNTIME_SOFT_NMS_H
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/ActiveRotatingFilter.h
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void active_rotated_filter_forward_impl(const Tensor input,
const Tensor indices, Tensor output) {
DISPATCH_DEVICE_IMPL(active_rotated_filter_forward_impl, input, indices,
output);
}
void active_rotated_filter_backward_impl(const Tensor grad_out,
const Tensor indices, Tensor grad_in) {
DISPATCH_DEVICE_IMPL(active_rotated_filter_backward_impl, grad_out, indices,
grad_in);
}
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
Tensor output) {
active_rotated_filter_forward_impl(input, indices, output);
}
void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
Tensor grad_in) {
active_rotated_filter_backward_impl(grad_out, indices, grad_in);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "active_rotated_filter_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void active_rotated_filter_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto input = buildATensor(ctx, ins[0]);
auto indices = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
active_rotated_filter_forward(input, indices, output);
}
void active_rotated_filter_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto grad_out = buildATensor(ctx, ins[0]);
auto indices = buildATensor(ctx, ins[1]);
auto grad_in = buildATensor(ctx, outs[0]);
active_rotated_filter_backward(grad_out, indices, grad_in);
}
#endif
void active_rotated_filter_forward_cpu_parrots(
HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto input = buildATensor(ctx, ins[0]);
auto indices = buildATensor(ctx, ins[1]);
auto output = buildATensor(ctx, outs[0]);
active_rotated_filter_forward(input, indices, output);
}
void active_rotated_filter_backward_cpu_parrots(
HostContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
auto grad_out = buildATensor(ctx, ins[0]);
auto indices = buildATensor(ctx, ins[1]);
auto grad_in = buildATensor(ctx, outs[0]);
active_rotated_filter_backward(grad_out, indices, grad_in);
}
PARROTS_EXTENSION_REGISTER(active_rotated_filter_forward)
.input(2)
.output(1)
.apply(active_rotated_filter_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(active_rotated_filter_forward_cuda_parrots)
#endif
.done();
PARROTS_EXTENSION_REGISTER(active_rotated_filter_backward)
.input(2)
.output(1)
.apply(active_rotated_filter_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(active_rotated_filter_backward_cuda_parrots)
#endif
.done();
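// Registration note: each PARROTS_EXTENSION_REGISTER block above declares an op
// taking two input tensors and producing one output tensor, binds the CPU apply
// function and, when built with MMCV_WITH_CUDA, the CUDA one as well; the
// framework selects between them via the HostContext / CudaContext it passes in.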
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ACTIVE_ROTATED_FILTER_PYTORCH_H
#define ACTIVE_ROTATED_FILTER_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void active_rotated_filter_forward(const Tensor input, const Tensor indices,
Tensor output);
void active_rotated_filter_backward(const Tensor grad_out, const Tensor indices,
Tensor grad_in);
#endif // ACTIVE_ROTATED_FILTER_PYTORCH_H
 // Modified from
 // https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu
 #include "pytorch_cpp_helper.hpp"
-#include "pytorch_device_registry.hpp"
-void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O,
+#ifdef MMCV_WITH_CUDA
+void AssignScoreWithKForwardCUDAKernelLauncher(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& points, const Tensor& centers, const Tensor& scores,
+    const Tensor& knn_idx, Tensor& output);
+void assign_score_withk_forward_cuda(int B, int N0, int N1, int M, int K, int O,
                                      int aggregate, const Tensor& points,
                                      const Tensor& centers,
                                      const Tensor& scores,
                                      const Tensor& knn_idx, Tensor& output) {
-  DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O,
-                       aggregate, points, centers, scores, knn_idx, output);
-}
+  AssignScoreWithKForwardCUDAKernelLauncher(
+      B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output);
+};
-void assign_score_withk_backward_impl(
+void AssignScoreWithKBackwardCUDAKernelLauncher(
+    int B, int N0, int N1, int M, int K, int O, int aggregate,
+    const Tensor& grad_out, const Tensor& points, const Tensor& centers,
+    const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
+    Tensor& grad_centers, Tensor& grad_scores);
+void assign_score_withk_backward_cuda(
     int B, int N0, int N1, int M, int K, int O, int aggregate,
     const Tensor& grad_out, const Tensor& points, const Tensor& centers,
     const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points,
     Tensor& grad_centers, Tensor& grad_scores) {
-  DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O,
-                       aggregate, grad_out, points, centers, scores, knn_idx,
+  AssignScoreWithKBackwardCUDAKernelLauncher(
+      B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx,
       grad_points, grad_centers, grad_scores);
-}
+};
+#endif
 void assign_score_withk_forward(const Tensor& points, const Tensor& centers,
                                 const Tensor& scores, const Tensor& knn_idx,
                                 Tensor& output, int B, int N0, int N1, int M,
                                 int K, int O, int aggregate) {
-  assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points,
-                                  centers, scores, knn_idx, output);
+  if (points.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CONTIGUOUS(points);
+    CHECK_CONTIGUOUS(centers);
+    CHECK_CONTIGUOUS(scores);
+    CHECK_CONTIGUOUS(knn_idx);
+    CHECK_CONTIGUOUS(output);
+    assign_score_withk_forward_cuda(B, N0, N1, M, K, O, aggregate, points,
+                                    centers, scores, knn_idx, output);
+#else
+    AT_ERROR("assign_score_withk is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("assign_score_withk is not implemented on CPU");
+  }
 }
 void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
@@ -36,7 +62,24 @@ void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points,
                                  Tensor& grad_centers, Tensor& grad_scores,
                                  int B, int N0, int N1, int M, int K, int O,
                                  int aggregate) {
-  assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out,
-                                   points, centers, scores, knn_idx,
-                                   grad_points, grad_centers, grad_scores);
+  if (grad_points.device().is_cuda()) {
+#ifdef MMCV_WITH_CUDA
+    CHECK_CONTIGUOUS(grad_out);
+    CHECK_CONTIGUOUS(scores);
+    CHECK_CONTIGUOUS(points);
+    CHECK_CONTIGUOUS(centers);
+    CHECK_CONTIGUOUS(knn_idx);
+    CHECK_CONTIGUOUS(grad_scores);
+    CHECK_CONTIGUOUS(grad_points);
+    CHECK_CONTIGUOUS(grad_centers);
+    assign_score_withk_backward_cuda(B, N0, N1, M, K, O, aggregate, grad_out,
+                                     points, centers, scores, knn_idx,
+                                     grad_points, grad_centers, grad_scores);
+#else
+    AT_ERROR("assign_score_withk is not compiled with GPU support");
+#endif
+  } else {
+    AT_ERROR("assign_score_withk is not implemented on CPU");
+  }
 }