Unverified Commit 2e5628b4 authored by q.yao, committed by GitHub

[Refactor]: Remove deployment for dev-2.x (#2225)

* remove deploy for 2.0

* update onnx ut
parent 961373ad
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor, nn
from torch.autograd import Function

_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
...@@ -35,58 +34,6 @@ def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
    return output
class TopPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 2, True)


class BottomPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 2, False)


class LeftPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 3, True)


class RightPoolFunction(Function):

    @staticmethod
    def symbolic(g, input: Tensor) -> Tensor:
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
        return output

    @staticmethod
    def forward(ctx, input: Tensor) -> Tensor:
        return _corner_pool(input, 3, False)
class CornerPool(nn.Module):
    """Corner Pooling.
...@@ -110,13 +57,6 @@ class CornerPool(nn.Module):
        Feature map after pooling.
    """
    pool_functions = {
        'bottom': BottomPoolFunction,
        'left': LeftPoolFunction,
        'right': RightPoolFunction,
        'top': TopPoolFunction,
    }
    cummax_dim_flip = {
        'bottom': (2, False),
        'left': (3, True),
...@@ -126,21 +66,11 @@ class CornerPool(nn.Module):
    def __init__(self, mode: str):
        super().__init__()
        assert mode in self.pool_functions  # replaced by: assert mode in self.cummax_dim_flip
        self.mode = mode
        self.corner_pool: Function = self.pool_functions[mode]

    def forward(self, x: Tensor) -> Tensor:
        if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
            if torch.onnx.is_in_onnx_export():
                assert torch.__version__ >= '1.7.0', \
                    'When `cummax` serves as an intermediate component whose '\
                    'outputs is used as inputs for another modules, it\'s '\
                    'expected that pytorch version must be >= 1.7.0, '\
                    'otherwise Error appears like: `RuntimeError: tuple '\
                    'appears in op that does not forward tuples, unsupported '\
                    'kind: prim::PythonOp`.'
            dim, flip = self.cummax_dim_flip[self.mode]
            if flip:
                x = x.flip(dim)
...@@ -149,8 +79,5 @@ class CornerPool(nn.Module):
                pool_tensor = pool_tensor.flip(dim)
            return pool_tensor
        else:
            if torch.onnx.is_in_onnx_export():
                return self.corner_pool.apply(x)
            else:
                dim, flip = self.cummax_dim_flip[self.mode]
                return _corner_pool(x, dim, flip)
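
For reference, a minimal usage sketch of the surviving pure-PyTorch path. The `CornerPool` constructor and its mode strings come from the code above; the input tensor and shapes are illustrative only.

```python
# Minimal usage sketch: CornerPool now always runs the cummax/_corner_pool path.
import torch
from mmcv.ops import CornerPool

pool = CornerPool('top')          # one of 'top', 'bottom', 'left', 'right'
x = torch.rand(2, 128, 32, 32)    # (N, C, H, W)
y = pool(x)                       # same shape as x; each pixel holds the running max
assert y.shape == x.shape
```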
...@@ -26,51 +26,28 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   │ └── ...
│   └── utils
│   │ └── ...
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
│   ├── ort_mmcv_utils.h
│   ├── ...
│   ├── onnx_ops.h
│   └── cpu
│      ├── onnxruntime_register.cpp
│      ├── ...
│      └── onnx_ops_impl.cpp
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
├── pytorch
│   ├── info.cpp
│   ├── pybind.cpp
│   ├── ...
│   ├── ops.cpp
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   ├── cpu
│   │   ├── ...
│   │   └── ops.cpp
│   ├── mps
│   │   ├── ...
│   │   └── op_mps.mm
│   └── mlu
│      ├── ...
│      └── op_mlu.cpp
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
    ├── trt_plugin.hpp
    ├── trt_serialize.hpp
    ├── ...
    ├── trt_ops.hpp
    └── plugins
       ├── trt_cuda_helper.cu
       ├── trt_plugin.cpp
       ├── ...
       ├── trt_ops.cpp
       └── trt_ops_kernel.cu
```
## Components
...@@ -80,16 +57,12 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
- `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
- `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding code are placed in this directory (see the sketch after this list).
  - `cuda`: This directory contains CUDA kernel launchers, which feed tensor memory pointers to the CUDA kernels in `common/cuda`. The launchers provide the C++ interface to the CUDA implementation of the corresponding custom ops.
  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains the launchers of the MLU kernels.
  - `mps`: MPS ops implementation and launchers.
- `tensorrt`: **TensorRT** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use the shared CUDA kernels in `common/cuda`.
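
For the `pytorch` entry above, a rough sketch of how Python code reaches the pybind11-bound ops. The `ext_loader.load_ext` call and the `_ext` module name follow mmcv's convention; the `nms` example and its argument list are simplified assumptions, not part of this diff.

```python
# Rough sketch of how Python code reaches a pybind11-bound custom op.
from mmcv.utils import ext_loader

# Imports the compiled mmcv._ext module and checks that the listed symbols exist.
ext_module = ext_loader.load_ext('_ext', ['nms'])
# The returned module exposes the C++ functions registered in pytorch/pybind.cpp,
# e.g. ext_module.nms(boxes, scores, iou_threshold=0.5, offset=0)
```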
## How to add new PyTorch ops?
...
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_CORNER_POOL_H
#define ONNXRUNTIME_CORNER_POOL_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
struct MMCVCornerPoolKernel {
public:
MMCVCornerPoolKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "mode");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int64_t mode_;
};
struct MMCVCornerPoolCustomOp
: Ort::CustomOpBase<MMCVCornerPoolCustomOp, MMCVCornerPoolKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCornerPoolKernel(api, info);
}
const char* GetName() const { return "MMCVCornerPool"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_CORNER_POOL_H
// Copyright (c) OpenMMLab. All rights reserved
#include "corner_pool.h"
#include "../ort_mmcv_utils.h"
void TopPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
// directly copy the bottommost value from input to output
output[index_n_c + (height - 1) * width + w] =
input[index_n_c + (height - 1) * width + w];
// do top_pool
for (int h = height - 2; h >= 0; h--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h + 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void BottomPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
// directly copy the topmost value from input to output
output[index_n_c + w] = input[index_n_c + w];
// do bottom_pool
for (int h = 1; h < height; h++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h - 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void LeftPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
// directly copy the rightmost value from input to output
output[index_n_c + h * width + width - 1] =
input[index_n_c + h * width + width - 1];
// do left_pool
for (int w = width - 2; w >= 0; w--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w + 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
void RightPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
// directly copy the leftmost value from input to output
output[index_n_c + h * width] = input[index_n_c + h * width];
// do right_pool
for (int w = 1; w < width; w++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w - 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
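
The four CPU kernels above are direction-flipped variants of the same running maximum. A NumPy reference sketch (illustrative only, not part of the removed file):

```python
# NumPy reference for the four corner-pool directions above.
# 'top' propagates maxima upward from the bottom row, 'bottom' downward,
# 'left' propagates from the right edge, 'right' from the left edge.
import numpy as np

def corner_pool_ref(x, mode):
    # x: (N, C, H, W) float array
    axis, flip = {'top': (2, True), 'bottom': (2, False),
                  'left': (3, True), 'right': (3, False)}[mode]
    if flip:
        x = np.flip(x, axis)
    out = np.maximum.accumulate(x, axis=axis)   # running max along the axis
    if flip:
        out = np.flip(out, axis)
    return out
```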
void MMCVCornerPoolKernel::Compute(OrtKernelContext *context) {
const int mode = int(mode_);
typedef float T;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const T *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(input));
// get output memory
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
T *output_data = ort_.GetTensorMutableData<T>(output);
// 'top': 0, 'bottom': 1, 'left': 2, 'right':3
assert(mode == 0 || mode == 1 || mode == 2 || mode == 3);
// do corner_pool
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
if (mode == 0)
TopPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 1)
BottomPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 2)
LeftPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else
RightPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
void gemm_ref_fp32_deform(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A,
const int32_t trans_B, const int32_t M,
const int32_t N, const int32_t K, const float alpha,
const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
float bilinear_interpolate(const float *src, const int64_t src_h,
const int64_t src_w, const float h, const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
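
A NumPy sketch of the same zero-padded bilinear lookup, mirroring the boundary handling above (illustrative only):

```python
# NumPy reference for bilinear_interpolate above: samples src at fractional (h, w),
# returning 0 outside (-1, src_h) x (-1, src_w) and zeroing out-of-range taps.
import numpy as np

def bilinear_interpolate_ref(src, h, w):
    src_h, src_w = src.shape
    if h <= -1 or h >= src_h or w <= -1 or w >= src_w:
        return 0.0
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low
    hh, hw = 1 - lh, 1 - lw

    def at(y, x):
        # out-of-range neighbours contribute zero, as in the C++ kernel
        return src[y, x] if 0 <= y < src_h and 0 <= x < src_w else 0.0

    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_high) +
            lh * hw * at(h_high, w_low) + lh * lw * at(h_high, w_high))
```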
void deformable_im2col(const float *input, const float *offset,
const int64_t src_h, const int64_t src_w,
const int64_t kernel_h, const int64_t kernel_w,
const int64_t pad_h, const int64_t pad_w,
const int64_t stride_h, const int64_t stride_w,
const int64_t dilation_h, const int64_t dilation_w,
const int64_t channels, const int64_t offset_groups,
const int64_t dst_h, const int64_t dst_w,
float *columns) {
const int64_t indices = channels * dst_h * dst_w;
for (int64_t index = 0; index != indices; ++index) {
const int64_t w_col = index % dst_w;
const int64_t h_col = (index / dst_w) % dst_h;
const int64_t c_im = index / (dst_w * dst_h);
const int64_t c_col = c_im * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = c_im / c_per_offset_grp;
auto columns_ptr =
columns + (c_col * (dst_h * dst_w) + h_col * dst_w + w_col);
auto input_ptr = input + c_im * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int data_offset_h_ptr =
((2 * (kh * kernel_w + kw)) * dst_h + h_col) * dst_w + w_col;
const int data_offset_w_ptr =
((2 * (kh * kernel_w + kw) + 1) * dst_h + h_col) * dst_w + w_col;
const float offset_h = offset_ptr[data_offset_h_ptr];
const float offset_w = offset_ptr[data_offset_w_ptr];
const float ih =
(h_col * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw =
(w_col * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = bilinear_interpolate(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
void deformable_conv_forward(
const float *src, const float *offset, const float *filter,
const int64_t batch, const int64_t src_c, const int64_t src_h,
const int64_t src_w, const int64_t dst_c, const int64_t dst_h,
const int64_t dst_w, const int64_t group, const int64_t offset_group,
const int64_t channels, const int64_t num_output, const int64_t kernel_h,
const int64_t kernel_w, const int64_t stride_h, const int64_t stride_w,
const int64_t pad_h, const int64_t pad_w, const int64_t dilation_h,
const int64_t dilation_w, float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
src_h, src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w);
gemm_ref_fp32_deform(
filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVDeformConvKernel::MMCVDeformConvKernel(OrtApi api,
const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 2);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch_size = input_dims[0];
int64_t in_channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t out_channels = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch_size, out_channels, out_height,
out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (in_channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv_forward(
input_data, offset_data, filter_data, batch_size, in_channels, in_height,
in_width, out_channels, out_height, out_width, group, deformable_group,
in_channels, out_channels, kernel_height, kernel_width, stride_height,
stride_width, padding_height, padding_width, dilation_height,
dilation_width, columns, out_ptr);
}
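
The output spatial size computed above follows the standard convolution arithmetic; a quick check in Python with illustrative numbers:

```python
# Output-size formula used above:
#   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1)
def conv_out_size(in_size, kernel, stride, pad, dilation):
    return (in_size + 2 * pad - dilation * (kernel - 1) - 1) // stride + 1

# e.g. a 3x3 kernel with stride 1 and padding 1 keeps the spatial size:
assert conv_out_size(64, kernel=3, stride=1, pad=1, dilation=1) == 64
```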
// Copyright (c) OpenMMLab. All rights reserved
#include <cmath>
#include "../ort_mmcv_utils.h"
#include "grid_sample.h"
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) < (b)) ? (b) : (a))
#define CLIP_COORDINATES(in, out, clip_limit) \
out = MIN((clip_limit - 1), MAX(in, 0))
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
interpolation_mode_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
allocator_ = Ort::AllocatorWithDefaultOptions();
}
enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
bool align_corners) {
if (align_corners) {
return ((coord + 1) / 2) * (size - 1);
} else {
return ((coord + 1) * size - 1) / 2;
}
}
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
return std::min(static_cast<scalar_t>(clip_limit - 1),
std::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
int64_t twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = std::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = std::fmod(in, span);
int flips = static_cast<int>(std::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
template <typename scalar_t>
static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
int64_t padding_mode,
bool align_corners) {
if (padding_mode == GridSamplerPadding::Border) {
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
coord = clip_coordinates(coord, size);
}
return coord;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static inline scalar_t grid_sampler_compute_source_index(scalar_t coord,
int64_t size,
int64_t padding_mode,
bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
coord = compute_coordinates(coord, size, padding_mode, align_corners);
return coord;
}
static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
int64_t W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x,
scalar_t y, int64_t W, int64_t H,
int64_t sW, int64_t sH,
int64_t padding_mode,
bool align_corners) {
x = compute_coordinates(x, W, padding_mode, align_corners);
y = compute_coordinates(y, H, padding_mode, align_corners);
int64_t ix = static_cast<int64_t>(x);
int64_t iy = static_cast<int64_t>(y);
if (within_bounds_2d(iy, ix, H, W)) {
return data[iy * sH + ix * sW];
}
return static_cast<scalar_t>(0);
}
template <typename scalar_t>
static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
}
template <typename scalar_t>
static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
template <typename scalar_t>
static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4],
scalar_t t) {
scalar_t A = -0.75;
scalar_t x1 = t;
coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
// opposite coefficients
scalar_t x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
template <typename scalar_t>
static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
scalar_t x3, scalar_t t) {
scalar_t coeffs[4];
get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
void GridSampleKernel::Compute(OrtKernelContext *context) {
const bool align_corners = align_corners_;
const int64_t padding_mode = padding_mode_;
const int64_t interpolation_mode = interpolation_mode_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
const float *grid_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions grid_dims(ort_, grid);
int64_t N = input_dims[0];
int64_t C = input_dims[1];
int64_t inp_H = input_dims[2];
int64_t inp_W = input_dims[3];
int64_t out_H = grid_dims[1];
int64_t out_W = grid_dims[2];
std::vector<int64_t> output_dims = {N, C, out_H, out_W};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
int64_t inp_sC = input_dims[2] * input_dims[3];
int64_t inp_sH = input_dims[3];
int64_t inp_sW = 1;
int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
int64_t grid_sH = grid_dims[2] * grid_dims[3];
int64_t grid_sW = grid_dims[3];
int64_t grid_sCoor = 1;
int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
int64_t out_sC = output_dims[2] * output_dims[3];
int64_t out_sH = output_dims[3];
int64_t out_sW = 1;
// loop over each output pixel
for (int64_t n = 0; n < N; ++n) {
const float *grid_ptr_N = grid_data + n * grid_sN;
const float *inp_ptr_N = input_data + n * inp_sN;
for (int64_t h = 0; h < out_H; ++h) {
for (int64_t w = 0; w < out_W; ++w) {
const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float x = *grid_ptr_NHW;
float y = grid_ptr_NHW[grid_sCoor];
float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode,
align_corners);
float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y)
// for 4d, we use north-east-south-west
int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
int64_t ix_ne = ix_nw + 1;
int64_t iy_ne = iy_nw;
int64_t ix_sw = ix_nw;
int64_t iy_sw = iy_nw + 1;
int64_t ix_se = ix_nw + 1;
int64_t iy_se = iy_nw + 1;
// get surfaces to each neighbor:
float nw = (ix_se - ix) * (iy_se - iy);
float ne = (ix - ix_sw) * (iy_sw - iy);
float sw = (ix_ne - ix) * (iy - iy_ne);
float se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
auto res = static_cast<float>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
*out_ptr_NCHW = res;
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
// assign nearest neighbor pixel value to output pixel
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
const float *inp_ptr_NC = inp_ptr_N;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW =
inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<float>(0);
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
// grid_sampler_compute_source_index would clip the index depending on the
// padding mode, which would make the bicubic calculation wrong:
// for example x = -0.1 -> ix = 0 for zero padding, but bicubic needs
// ix = floor(x) = -1.
// Reflection padding is even more problematic, since the -1 and +1
// direction is not fixed at the boundary.
ix = grid_sampler_unnormalize(x, inp_W, align_corners);
iy = grid_sampler_unnormalize(y, inp_H, align_corners);
float ix_nw = std::floor(ix);
float iy_nw = std::floor(iy);
const float tx = ix - ix_nw;
const float ty = iy - iy_nw;
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
float coefficients[4];
// Interpolate 4 values in the x direction
for (int64_t i = 0; i < 4; ++i) {
coefficients[i] = cubic_interp1d<float>(
get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
tx);
}
// Interpolate in the y direction
*out_ptr_NCHW =
cubic_interp1d<float>(coefficients[0], coefficients[1],
coefficients[2], coefficients[3], ty);
}
}
}
}
}
}
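
Since this kernel is adapted from PyTorch's CPU GridSampler, `torch.nn.functional.grid_sample` is the natural reference; a hedged comparison sketch (the string/integer attribute mapping is read off the enums above, the tensors are illustrative):

```python
# Hedged comparison sketch for the grid-sample kernel above.
# interpolation_mode 0/1/2 -> 'bilinear'/'nearest'/'bicubic',
# padding_mode 0/1/2 -> 'zeros'/'border'/'reflection'.
import torch
import torch.nn.functional as F

inp = torch.rand(1, 3, 10, 10)
grid = torch.rand(1, 6, 8, 2) * 2 - 1          # normalized coords in [-1, 1]
ref = F.grid_sample(inp, grid, mode='bilinear',
                    padding_mode='zeros', align_corners=False)
print(ref.shape)                               # torch.Size([1, 3, 6, 8])
```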
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
float bilinear_interpolate_2d(const float *src, const int64_t src_h,
const int64_t src_w, const float h,
const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
// output: (channels * kernel_h * kernel_w, dst_h * dst_w)
void deformable_im2col_2d(const float *input, const float *offset,
const float *mask, const int64_t src_h,
const int64_t src_w, const int64_t kernel_h,
const int64_t kernel_w, const int64_t pad_h,
const int64_t pad_w, const int64_t stride_h,
const int64_t stride_w, const int64_t dilation_h,
const int64_t dilation_w, const int64_t channels,
const int64_t offset_groups, const int64_t dst_h,
const int64_t dst_w, const bool use_mask,
float *columns) {
const int64_t workload = channels * dst_h * dst_w;
for (int64_t index = 0; index != workload; ++index) {
const int64_t ow = index % dst_w;
const int64_t oh = (index / dst_w) % dst_h;
const int64_t ic = index / (dst_w * dst_h);
const int64_t oc = ic * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = ic / c_per_offset_grp;
auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow);
auto input_ptr = input + ic * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
auto mask_ptr = mask;
if (use_mask) {
mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w;
}
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int64_t mask_idx = kh * kernel_w + kw;
const int64_t offset_idx = 2 * mask_idx;
float mask_value = 1;
if (use_mask) {
mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow];
}
const float offset_h =
offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow];
const float offset_w =
offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow];
const float ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = mask_value *
bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
void gemm_ref_fp32(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A, const int32_t trans_B,
const int32_t M, const int32_t N, const int32_t K,
const float alpha, const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
void deformable_conv2d_ref_fp32(
const float *src, const float *offset, const float *mask,
const float *filter, const float *bias, const int64_t batch,
const int64_t src_c, const int64_t src_h, const int64_t src_w,
const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
const int64_t group, const int64_t offset_group, const int64_t channels,
const int64_t num_output, const int64_t kernel_h, const int64_t kernel_w,
const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
const int64_t pad_w, const int64_t dilation_h, const int64_t dilation_w,
float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h,
src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
mask != nullptr, columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;
for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
}
}
} else {
memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w);
}
gemm_ref_fp32(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
columns, nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(
OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
const float *mask_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
const float *bias_data =
(bias != nullptr)
? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
: nullptr;
// const float *bias_data = nullptr;
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch = input_dims[0];
int64_t channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t num_output = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv2d_ref_fp32(
input_data, offset_data, mask_data, filter_data, bias_data, batch,
channels, in_height, in_width, num_output, out_height, out_width, group,
deformable_group, channels, num_output, kernel_height, kernel_width,
stride_height, stride_width, padding_height, padding_width,
dilation_height, dilation_width, columns, out_ptr);
}
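
The column buffer and the per-group GEMM above follow the usual im2col shapes; a short Python sketch of the bookkeeping (the concrete numbers are illustrative assumptions):

```python
# Shape bookkeeping for the per-group GEMM above.
# columns: ((channels // group) * kh * kw, out_h * out_w)
# filter (per group): (num_output // group, (channels // group) * kh * kw)
# each GEMM therefore produces one group's (oc_per_gp, out_h * out_w) output slice.
channels, num_output, group = 64, 128, 2
kh = kw = 3
out_h = out_w = 32

ic_per_gp, oc_per_gp = channels // group, num_output // group
columns_shape = (ic_per_gp * kh * kw, out_h * out_w)      # (288, 1024)
filter_shape = (oc_per_gp, ic_per_gp * kh * kw)           # (64, 288)
out_slice_shape = (oc_per_gp, out_h * out_w)              # (64, 1024)
print(columns_shape, filter_shape, out_slice_shape)
```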
// Copyright (c) OpenMMLab. All rights reserved
#include "nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "../ort_mmcv_utils.h"
NmsKernel::NmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NmsKernel::Compute(OrtKernelContext *context) {
const float iou_threshold = iou_threshold_;
const int64_t offset = offset_;
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const float *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const float *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
float *tmp_boxes = (float *)allocator_.Alloc(sizeof(float) * nboxes * 4);
float *sc = (float *)allocator_.Alloc(sizeof(float) * nboxes);
float *areas = (float *)allocator_.Alloc(sizeof(float) * nboxes);
bool *select = (bool *)allocator_.Alloc(sizeof(bool) * nboxes);
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
memcpy(tmp_boxes, boxes_data, sizeof(float) * nboxes * 4);
memcpy(sc, scores_data, sizeof(float) * nboxes);
// sort scores
std::vector<float> tmp_sc;
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(sc[i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2) {
return tmp_sc[id1] > tmp_sc[id2];
});
// area = (x2 - x1 + offset) * (y2 - y1 + offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] = (tmp_boxes[i * 4 + 2] - tmp_boxes[i * 4 + 0] + offset) *
(tmp_boxes[i * 4 + 3] - tmp_boxes[i * 4 + 1] + offset);
}
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = tmp_boxes[i * 4 + 0];
auto iy1 = tmp_boxes[i * 4 + 1];
auto ix2 = tmp_boxes[i * 4 + 2];
auto iy2 = tmp_boxes[i * 4 + 3];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, tmp_boxes[j * 4 + 0]);
auto yy1 = std::max(iy1, tmp_boxes[j * 4 + 1]);
auto xx2 = std::min(ix2, tmp_boxes[j * 4 + 2]);
auto yy2 = std::min(iy2, tmp_boxes[j * 4 + 3]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
std::vector<int64_t> res_order;
for (int i = 0; i < nboxes; i++) {
if (select[i]) {
res_order.push_back(order[i]);
}
}
std::vector<int64_t> inds_dims({res_order.size()});
OrtValue *res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(),
inds_dims.size());
int64_t *res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
}
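
A compact NumPy reference of the same greedy NMS loop, with the `offset` term entering the width and height exactly as above (illustrative only):

```python
# NumPy reference for the greedy NMS above: boxes are (x1, y1, x2, y2) rows,
# and the returned indices are kept boxes in descending-score order.
import numpy as np

def nms_ref(boxes, scores, iou_threshold, offset=0):
    order = scores.argsort()[::-1]
    areas = (boxes[:, 2] - boxes[:, 0] + offset) * (boxes[:, 3] - boxes[:, 1] + offset)
    suppressed = np.zeros(len(boxes), dtype=bool)   # indexed by sorted position
    keep = []
    for _i, i in enumerate(order):
        if suppressed[_i]:
            continue
        keep.append(i)
        for _j in range(_i + 1, len(order)):
            if suppressed[_j]:
                continue
            j = order[_j]
            xx1 = max(boxes[i, 0], boxes[j, 0])
            yy1 = max(boxes[i, 1], boxes[j, 1])
            xx2 = min(boxes[i, 2], boxes[j, 2])
            yy2 = min(boxes[i, 3], boxes[j, 3])
            inter = max(0.0, xx2 - xx1 + offset) * max(0.0, yy2 - yy1 + offset)
            if inter / (areas[i] + areas[j] - inter) > iou_threshold:
                suppressed[_j] = True
    return np.array(keep, dtype=np.int64)
```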
// Copyright (c) OpenMMLab. All rights reserved
#include "onnxruntime_register.h"
#include "corner_pool.h"
#include "deform_conv.h"
#include "grid_sample.h"
#include "modulated_deform_conv.h"
#include "nms.h"
#include "ort_mmcv_utils.h"
#include "reduce_ops.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "rotated_feature_align.h"
#include "soft_nms.h"
const char *c_MMCVOpDomain = "mmcv";
SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
MMCVRotatedFeatureAlignCustomOp c_MMCVRotatedFeatureAlignCustomOp;
GridSampleOp c_GridSampleOp;
MMCVCumMaxCustomOp c_MMCVCumMaxCustomOp;
MMCVCumMinCustomOp c_MMCVCumMinCustomOp;
MMCVCornerPoolCustomOp c_MMCVCornerPoolCustomOp;
MMCVModulatedDeformConvOp c_MMCVModulatedDeformConvOp;
MMCVDeformConvOp c_MMCVDeformConvOp;
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api) {
OrtCustomOpDomain *domain = nullptr;
const OrtApi *ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain(c_MMCVOpDomain, &domain)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_NmsOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVRoIAlignRotatedCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVCornerPoolCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMaxCustomOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVCumMinCustomOp)) {
return status;
}
if (auto status =
ortApi->CustomOpDomain_Add(domain, &c_MMCVModulatedDeformConvOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVDeformConvOp)) {
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(
domain, &c_MMCVRotatedFeatureAlignCustomOp)) {
return status;
}
return ortApi->AddCustomOpDomain(options, domain);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "reduce_ops.h"
#include <assert.h>
#include <vector>
#include "../ort_mmcv_utils.h"
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp
static inline int64_t maybe_wrap_dim(int64_t dim, int64_t ndims) {
int64_t min = -ndims;
int64_t max = ndims - 1;
assert(dim >= min && dim <= max);
if (dim < 0) dim += ndims;
return dim;
}
static inline int64_t get_dim_stride(const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod) {
return dim == ndims - 1 ? 1 : reversed_dim_cumprod[dim + 1];
}
static inline int64_t get_dim_size(const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod) {
return dim == ndims - 1
? reversed_dim_cumprod[dim]
: reversed_dim_cumprod[dim] / reversed_dim_cumprod[dim + 1];
}
template <typename T1, typename T2, typename Operation>
void cummax_cummin_helper(const T1 *input, T1 *output, T2 *indices,
const int64_t input_dim_size, const int64_t stride) {
Operation op;
T1 out = input[0];
int64_t idx = 0;
for (int64_t i = 0; i < input_dim_size; i++) {
T1 curr_elem = input[i * stride];
if (op(curr_elem, out)) {
out = curr_elem;
idx = i;
}
output[i * stride] = out;
indices[i * stride] = idx;
}
}
// modified `tensor_dim_apply3` from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorDimApply.h.
// the difference is that: (1) use `reversed_dim_cumprod` for fast computing of
// tensor `size` and `stride`. (2) the same `stride` is used for input, output,
// and indices, since it's unnecessary to use separate values. currently
// `tensor_dim_apply3` is only used for `cummax` and `cummin`, according to the
// official pytorch projects: https://github.com/pytorch/pytorch.
template <typename T1, typename T2, typename Function>
void tensor_dim_apply3(const T1 *input, T1 *output, T2 *indices,
const int64_t dim, const int64_t ndims,
const int64_t *reversed_dim_cumprod, Function func) {
int dim_apply_finished = 0;
int64_t input_dim_size = get_dim_size(dim, ndims, reversed_dim_cumprod);
// the same stride is used for input, output and indices
int64_t stride = get_dim_stride(dim, ndims, reversed_dim_cumprod);
std::vector<int64_t> counter(ndims, 0);
while (!dim_apply_finished) {
// call `func` once to update output and indices
func(input, output, indices, input_dim_size, stride);
if (ndims == 1) break;
for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
if (dim_i == dim) {
if (dim_i == (ndims - 1)) {
dim_apply_finished = 1;
break;
}
continue;
}
counter[dim_i]++;
// the same stride is used for input, output, and indices
int64_t stride_dim_i = get_dim_stride(dim_i, ndims, reversed_dim_cumprod);
input += stride_dim_i;
output += stride_dim_i;
indices += stride_dim_i;
if (counter[dim_i] == get_dim_size(dim_i, ndims, reversed_dim_cumprod)) {
if (dim_i == ndims - 1) {
dim_apply_finished = 1;
break;
} else {
input -= counter[dim_i] * stride_dim_i;
output -= counter[dim_i] * stride_dim_i;
indices -= counter[dim_i] * stride_dim_i;
counter[dim_i] = 0;
}
} else {
break;
} // if
} // for
} // while
}
template <typename T1, typename T2, typename Operation>
void CumMax_CumMin_CPU(const T1 *input, T1 *output, T2 *indices,
int64_t *reversed_dim_cumprod, const int64_t dim,
const OrtTensorDimensions &out_dimensions) {
// calculate numel
const int64_t ndims = out_dimensions.size();
int64_t numel = 1;
for (int64_t dim_i = 0; dim_i < ndims; dim_i++) {
numel *= out_dimensions.data()[dim_i];
}
// cummax is only applied to input which is non-zero dim and non-empty
if (numel) {
// compute the cumulative production on dimension size,
// which is then used for computing the stride or size of a specific `dim`.
reversed_dim_cumprod[ndims - 1] = out_dimensions.data()[ndims - 1];
for (int64_t dim_i = ndims - 2; dim_i >= 0; dim_i--) {
reversed_dim_cumprod[dim_i] =
reversed_dim_cumprod[dim_i + 1] * out_dimensions.data()[dim_i];
}
// do cummax or cummin based on `Operation` type
tensor_dim_apply3<float, int64_t>(
input, output, indices, dim, ndims, reversed_dim_cumprod,
cummax_cummin_helper<float, int64_t, Operation>);
}
}
void MMCVCumMaxKernel::Compute(OrtKernelContext *context) {
// get input
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
// get output
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *output_data = ort_.GetTensorMutableData<float>(output);
OrtValue *indices = ort_.KernelContext_GetOutput(
context, 1, out_dimensions.data(), out_dimensions.size());
int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);
// allocate tmp memory for computing the cumulative production on dimension
// size
const int64_t ndims = out_dimensions.size();
assert(ndims > 0);
int64_t *reversed_dim_cumprod =
(int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);
// dim should be wrapped if it's negative (e.g. -1)
const int64_t dim = maybe_wrap_dim(dim_, ndims);
CumMax_CumMin_CPU<float, int64_t, std::greater_equal<float>>(
input_data, output_data, indices_data, reversed_dim_cumprod, dim,
out_dimensions);
}
void MMCVCumMinKernel::Compute(OrtKernelContext *context) {
// get input
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
// get output
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *output_data = ort_.GetTensorMutableData<float>(output);
OrtValue *indices = ort_.KernelContext_GetOutput(
context, 1, out_dimensions.data(), out_dimensions.size());
int64_t *indices_data = ort_.GetTensorMutableData<int64_t>(indices);
// allocate tmp memory for computing the cumulative production on dimension
// size
const int64_t ndims = out_dimensions.size();
assert(ndims > 0);
int64_t *reversed_dim_cumprod =
(int64_t *)allocator_.Alloc(sizeof(int64_t) * ndims);
// dim should be wrapped if it's negative (e.g. -1)
const int64_t dim = maybe_wrap_dim(dim_, ndims);
CumMax_CumMin_CPU<float, int64_t, std::less_equal<float>>(
input_data, output_data, indices_data, reversed_dim_cumprod, dim,
out_dimensions);
}
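
These two kernels mirror `torch.cummax` / `torch.cummin`, which return the running extreme and its index along a dimension; a quick reference (note that the C++ comparators are `>=`/`<=`, so on ties the reported index may differ from PyTorch's):

```python
# Reference behaviour for the CumMax/CumMin kernels above.
import torch

x = torch.tensor([[1., 3., 2., 5., 4.]])
values, indices = torch.cummax(x, dim=1)
print(values)   # tensor([[1., 3., 3., 5., 5.]])
print(indices)  # tensor([[0, 1, 1, 3, 3]])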
// Copyright (c) OpenMMLab. All rights reserved
#include "roi_align.h"
#include "../ort_mmcv_utils.h"
// implementation taken from Caffe2
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy =
roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
float x = xx;
float y = yy;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
void ROIAlignForwardCPU(const int nthreads, const float *input,
const float *rois, float *output, float *argmax_y,
float *argmax_x, const int pooled_height,
const int pooled_width, const float spatial_scale,
const int sampling_ratio,
const int pool_mode, // 0 - max pool, 1 - avg pool
const bool aligned, const int channels,
const int height, const int width) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_start_w = offset_rois[1] * spatial_scale - offset;
float roi_start_h = offset_rois[2] * spatial_scale - offset;
float roi_end_w = offset_rois[3] * spatial_scale - offset;
float roi_end_h = offset_rois[4] * spatial_scale - offset;
float roi_width = roi_end_w - roi_start_w;
float roi_height = roi_end_h - roi_start_h;
if (aligned) {
// RoIs must not have negative size when `aligned` is true
assert(roi_width >= 0 && roi_height >= 0);
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// when the sampling grid is empty, clamp count to 1 so the average is 0 instead of NaN
const float count =
std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
float maxval = -10000;
float maxidx_y = -1.f, maxidx_x = -1.f;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const float y = roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h);
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const float x = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
PreCalc pc = pre_calc[pre_calc_index];
float val = pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
if (val > maxval) {
maxval = val;
maxidx_y = y;
maxidx_x = x;
}
output_val += val;
pre_calc_index += 1;
}
}
if (pool_mode == 0) {
// We do max pooling inside a bin
output[index] = maxval;
argmax_y[index] = maxidx_y;
argmax_x[index] = maxidx_x;
} else if (pool_mode == 1) {
// We do average (integral) pooling inside a bin
output[index] = output_val / count;
} // if
} // for pw
} // for ph
} // for c
} // for n
}
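// ONNX Runtime kernel: read the feature map and RoIs, shape the output as
// [num_rois, channels, aligned_height, aligned_width] and run the CPU
// forward defined above.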
void MMCVRoiAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// compute the total number of output elements
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
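// map the string attribute onto the integer convention of ROIAlignForwardCPU:
// 0 - max pool, 1 - avg pool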
int poolMod = 1;
if (pool_mode_ == "max") poolMod = 0;
float *argmax_x = nullptr, *argmax_y = nullptr;
if (poolMod == 0) {
argmax_y = new float[output_size];
argmax_x = new float[output_size];
}
ROIAlignForwardCPU(output_size, X_data, rois, out, argmax_y, argmax_x,
aligned_height_, aligned_width_, spatial_scale_,
sampling_ratio_, poolMod, aligned_, input_channels,
input_height, input_width);
if (argmax_x) delete[] argmax_x;
if (argmax_y) delete[] argmax_y;
}
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "roi_align_rotated.h"
#include "../ort_mmcv_utils.h"
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(
const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper, const int ix_upper,
float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w,
int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h,
float roi_center_w, float cos_theta, float sin_theta,
std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy =
roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx = roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w /
static_cast<float>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
float y = yy * cos_theta - xx * sin_theta + roi_center_h;
float x = yy * sin_theta + xx * cos_theta + roi_center_w;
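// (x, y) is now the sampling location in feature-map coordinates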
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
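// Each rotated RoI occupies 6 values:
// [batch_index, center_x, center_y, width, height, theta]; theta is in
// radians and is negated when `clockwise` is set.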
void ROIAlignRotatedForwardCPU(const int nthreads, const float *input,
const float *rois, float *output,
const float &spatial_scale, const int aligned,
const int clockwise, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
const int sampling_ratio) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_center_w = current_roi[1] * spatial_scale - offset;
float roi_center_h = current_roi[2] * spatial_scale - offset;
float roi_width = current_roi[3] * spatial_scale;
float roi_height = current_roi[4] * spatial_scale;
// float theta = current_roi[5] * M_PI / 180.0;
float theta = current_roi[5]; // Radian angle by default
if (clockwise) {
theta = -theta;
}
float cos_theta = cos(theta);
float sin_theta = sin(theta);
if (!aligned) { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h =
static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w =
static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const float count =
std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
float roi_start_h = -roi_height / 2.0;
float roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(
height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta,
sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input =
input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] +
pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// compute the total number of output elements
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_,
aligned_, clockwise_, input_channels, input_height,
input_width, aligned_height_, aligned_width_,
sampling_ratio_);
}
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "rotated_feature_align.h"
#include "../ort_mmcv_utils.h"
template <typename T>
T bilinear_interpolate(const T *input, const int height, const int width, T y,
T x, const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[int(fma(y_low, width, x_low))];
T v2 = input[int(fma(y_low, width, x_high))];
T v3 = input[int(fma(y_high, width, x_low))];
T v4 = input[int(fma(y_high, width, x_high))];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
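// For every output element, sample the input at `points` locations of the
// best rotated box assigned to that spatial position (its center when
// points == 1, plus the four corners when points == 5) and add the
// interpolated values to the original feature value.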
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
const int nthreads, const int points, const scalar_t *bottom_data,
const scalar_t *best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width, scalar_t *top_data) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t *bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
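// the four corners are center +/- the width-axis vector (wx, wy)
// +/- the height-axis vector (hx, hy)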
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
const scalar_t *offset_bottom_data =
bottom_data + (n * channels + c) * height * width;
scalar_t output_val = bottom_data[index];
for (int i = 0; i < points; i++) {
output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
width, py[i], px[i], i);
}
top_data[index] = output_val;
}
}
void MMCVRotatedFeatureAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_features = ort_.KernelContext_GetInput(context, 0);
const float *features_data = reinterpret_cast<const float *>(
ort_.GetTensorData<float>(input_features));
const OrtValue *input_best_rbboxes = ort_.KernelContext_GetInput(context, 1);
const float *best_rbboxes = reinterpret_cast<const float *>(
ort_.GetTensorData<const float *>(input_best_rbboxes));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_features);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// compute the total number of output elements
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
rotated_feature_align_forward_cpu_kernel<float>(
output_size, points_, features_data, best_rbboxes, spatial_scale_,
input_channels, input_height, input_width, out);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "soft_nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include "../ort_mmcv_utils.h"
SoftNmsKernel::SoftNmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
sigma_ = ort_.KernelInfoGetAttribute<float>(info, "sigma");
min_score_ = ort_.KernelInfoGetAttribute<float>(info, "min_score");
method_ = ort_.KernelInfoGetAttribute<int64_t>(info, "method");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void SoftNmsKernel::Compute(OrtKernelContext *context) {
typedef float T;
const T iou_threshold = T(iou_threshold_);
const T sigma = T(sigma_);
const T min_score = T(min_score_);
const int method = int(method_);
const T offset = T(offset_);
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const T *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const T *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
T *tmp_boxes = (T *)allocator_.Alloc(sizeof(T) * nboxes * 4);
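// tmp_boxes holds interleaved [x1, y1, x2, y2] rows; the x1..y2 pointers
// below alias it and are indexed with a stride of 4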
T *x1 = tmp_boxes;
T *y1 = tmp_boxes + 1;
T *x2 = tmp_boxes + 2;
T *y2 = tmp_boxes + 3;
T *sc = (T *)allocator_.Alloc(sizeof(T) * nboxes);
T *areas = (T *)allocator_.Alloc(sizeof(T) * nboxes);
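// de collects the kept detections as [x1, y1, x2, y2, score] rows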
T *de = (T *)allocator_.Alloc(sizeof(T) * nboxes * 5);
int64_t *inds = (int64_t *)allocator_.Alloc(sizeof(int64_t) * nboxes);
memcpy(tmp_boxes, boxes_data, sizeof(T) * nboxes * 4);
memcpy(sc, scores_data, sizeof(T) * nboxes);
// init inds as arange(nboxes)
std::generate(inds, inds + nboxes, [n = 0]() mutable { return n++; });
// area = (x2-x1+offset)*(y2-y1+offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] =
(x2[i * 4] - x1[i * 4] + offset) * (y2[i * 4] - y1[i * 4] + offset);
}
int64_t pos = 0;
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < sc[pos]) {
max_score = sc[pos];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = x1[max_pos * 4];
auto iy1 = de[i * 5 + 1] = y1[max_pos * 4];
auto ix2 = de[i * 5 + 2] = x2[max_pos * 4];
auto iy2 = de[i * 5 + 3] = y2[max_pos * 4];
auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = areas[max_pos];
auto iind = inds[max_pos];
x1[max_pos * 4] = x1[i * 4];
y1[max_pos * 4] = y1[i * 4];
x2[max_pos * 4] = x2[i * 4];
y2[max_pos * 4] = y2[i * 4];
sc[max_pos] = sc[i];
areas[max_pos] = areas[i];
inds[max_pos] = inds[i];
x1[i * 4] = ix1;
y1[i * 4] = iy1;
x2[i * 4] = ix2;
y2[i * 4] = iy2;
sc[i] = iscore;
areas[i] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = std::max(ix1, x1[pos * 4]);
auto yy1 = std::max(iy1, y1[pos * 4]);
auto xx2 = std::min(ix2, x2[pos * 4]);
auto yy2 = std::min(iy2, y2[pos * 4]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[pos] - inter);
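// decay the score of box `pos` depending on the method:
// 0 - naive (hard) NMS, 1 - linear, 2 - gaussian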
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = std::exp(-(ovr * ovr) / sigma);
}
sc[pos] *= weight;
// if box score falls below threshold, discard the box by
// swapping with last box update N
if (sc[pos] < min_score) {
x1[pos * 4] = x1[(nboxes - 1) * 4];
y1[pos * 4] = y1[(nboxes - 1) * 4];
x2[pos * 4] = x2[(nboxes - 1) * 4];
y2[pos * 4] = y2[(nboxes - 1) * 4];
sc[pos] = sc[nboxes - 1];
areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
std::vector<int64_t> dets_dim({nboxes, 5});
OrtValue *dets = ort_.KernelContext_GetOutput(context, 0, dets_dim.data(),
dets_dim.size());
T *dets_data = ort_.GetTensorMutableData<T>(dets);
std::vector<int64_t> inds_dim({nboxes});
OrtValue *inds_ov = ort_.KernelContext_GetOutput(context, 1, inds_dim.data(),
inds_dim.size());
int64_t *inds_data = ort_.GetTensorMutableData<int64_t>(inds_ov);
memcpy(dets_data, de, sizeof(T) * nboxes * 5);
memcpy(inds_data, inds, sizeof(int64_t) * nboxes);
}
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_DEFORM_CONV_H
#define ONNXRUNTIME_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
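// Each custom op pairs a *Kernel struct (attributes + Compute) with a
// CustomOpBase registration struct that declares its name, input/output
// types and execution provider.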
struct MMCVDeformConvKernel {
MMCVDeformConvKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
int64_t im2col_step_;
};
struct MMCVDeformConvOp
: Ort::CustomOpBase<MMCVDeformConvOp, MMCVDeformConvKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new MMCVDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVDeformConv2d"; };
size_t GetInputTypeCount() const { return 3; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
size_t index) const {
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_GRIDSAMPLE_H
#define ONNXRUNTIME_GRIDSAMPLE_H
#include <onnxruntime_cxx_api.h>
struct GridSampleKernel {
GridSampleKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t align_corners_;
int64_t interpolation_mode_;
int64_t padding_mode_;
};
struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new GridSampleKernel(api, info);
};
const char *GetName() const { return "grid_sampler"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
struct MMCVModulatedDeformConvKernel {
MMCVModulatedDeformConvKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
};
struct MMCVModulatedDeformConvOp
: Ort::CustomOpBase<MMCVModulatedDeformConvOp,
MMCVModulatedDeformConvKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new MMCVModulatedDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVModulatedDeformConv2d"; };
size_t GetInputTypeCount() const { return 5; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(
size_t index) const {
// The last input (index == 4) is optional, which is bias
if (index == 4)
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
};
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_NMS_H
#define ONNXRUNTIME_NMS_H
#include <onnxruntime_cxx_api.h>
struct NmsKernel {
NmsKernel(OrtApi api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
OrtApi api_;
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
int64_t offset_;
};
struct NmsOp : Ort::CustomOpBase<NmsOp, NmsKernel> {
void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
return new NmsKernel(api, info);
};
const char *GetName() const { return "NonMaxSuppression"; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char *GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_REGISTER_H
#define ONNXRUNTIME_REGISTER_H
#include <onnxruntime_c_api.h>
#ifdef __cplusplus
extern "C" {
#endif
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api);
#ifdef __cplusplus
}
#endif
#endif // ONNXRUNTIME_REGISTER_H
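// A minimal usage sketch, not part of the original sources: it assumes the
// files above are built into the same binary (or library) so that
// RegisterCustomOps is linkable, that the include path
// "onnxruntime_register.h" matches the header above, and that
// "model_with_mmcv_ops.onnx" is a placeholder model exported with these
// custom ops (e.g. grid_sampler, NonMaxSuppression).
#include <onnxruntime_cxx_api.h>
#include "onnxruntime_register.h"
int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmcv_custom_ops_demo");
  Ort::SessionOptions session_options;
  // RegisterCustomOps is declared above; OrtGetApiBase() comes from the
  // ONNX Runtime C API, and Ort::SessionOptions converts to OrtSessionOptions*.
  OrtStatus *status = RegisterCustomOps(session_options, OrtGetApiBase());
  if (status != nullptr) {
    Ort::GetApi().ReleaseStatus(status);
    return 1;
  }
  // Creating the session resolves the custom ops against the kernels
  // registered above.
  Ort::Session session(env, "model_with_mmcv_ops.onnx", session_options);
  return 0;
}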