Commit 546b4279 authored by limm
add csrc and mmdeploy module

// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <cstring>
#include <thread>
#include <vector>
#include "modulated_deform_conv/modulated_deform_conv_cpu.h"
#include "ort_utils.h"
namespace mmdeploy {
void parallel_unroll_gemm(const float *A, const float *B, const float *V, const float *H,
const int32_t M, const int32_t N, const int32_t K, const float alpha,
const float beta, float *Y, const int32_t start_row,
const int32_t end_row) {
std::vector<float> tmp(N);
for (int32_t m = start_row; m < end_row; ++m) {
for (int32_t n = 0; n < N; n++) {
tmp[n] = 0;
}
{
int32_t remainder = K % 8; // unroll
for (int32_t k = 0; k < K - remainder; k += 8) {  // unrolled main loop stops before the tail
for (int32_t n = 0; n < N; n++) {
tmp[n] += A[m * K + k] * B[k * N + n];
tmp[n] += A[m * K + k + 1] * B[k * N + N + n];
tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n];
tmp[n] += A[m * K + k + 3] * B[k * N + 3 * N + n];
tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n];
tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n];
tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n];
tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n];
}
}
for (int32_t k = K - remainder; k < K; k++) {
for (int32_t n = 0; n < N; n++) {
tmp[n] += A[m * K + k] * B[k * N + n];
}
}
}
for (int32_t n = 0; n < N; n++) {
tmp[n] *= alpha;
if (V) tmp[n] += beta * V[n];
if (H) tmp[n] += beta * H[m * N + n];
Y[m * N + n] = tmp[n];
}
}
}
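// Illustrative note: for each row m in [start_row, end_row) the routine above computes
//   Y[m][n] = alpha * sum_k A[m][k] * B[k][n] (+ beta * V[n]) (+ beta * H[m][n]),
// where the V and H terms are applied only when the corresponding pointer is non-null;
// the inner k loop is unrolled by 8 purely as a CPU optimization.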
void deformable_conv2d_ref_fp32(const float *src, const float *offset, const float *mask,
const float *filter, const float *bias, const int64_t batch,
const int64_t src_c, const int64_t src_h, const int64_t src_w,
const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
const int64_t group, const int64_t offset_group,
const int64_t channels, const int64_t num_output,
const int64_t kernel_h, const int64_t kernel_w,
const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
const int64_t pad_w, const int64_t dilation_h,
const int64_t dilation_w, float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
// Set up for launching threads
std::size_t num_threads = std::thread::hardware_concurrency();
std::vector<std::thread> threads;
threads.reserve(num_threads);
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d<float>(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h, src_w, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, ic_per_gp,
offset_group, dst_h, dst_w, mask != nullptr, columns);
float *dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;
for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
}
}
} else {
memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
}
if (num_threads > 1) {
// Calculate values to pass to threads
int32_t n_rows = (oc_per_gp + num_threads - 1) / num_threads;
int32_t end_row = 0;
for (int32_t i = 0; i < num_threads; i++) {
auto start_row = i * n_rows;
end_row = start_row + n_rows;
if (end_row > oc_per_gp) end_row = oc_per_gp;
std::thread t(parallel_unroll_gemm,
filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, nullptr,
dst_ptr, oc_per_gp, dst_h * dst_w, ic_per_gp * kernel_h * kernel_w, 1.0f,
1.0f, dst_ptr, start_row, end_row);
threads.emplace_back(std::move(t));
}
// Wait for all threads to complete
for (auto &t : threads) t.join();
threads.clear();
} else {  // the parallel gemm degrades to a serial gemm with start_row = 0 and end_row = oc_per_gp
parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
nullptr, dst_ptr, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr, 0, oc_per_gp);
}
}
}
}
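// Worked example (illustrative): with oc_per_gp = 10 output channels per group and
// hardware_concurrency() = 4, n_rows = (10 + 4 - 1) / 4 = 3, so the GEMM rows are
// partitioned across threads as [0, 3), [3, 6), [6, 9) and [9, 10).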
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi &api,
const OrtKernelInfo *info)
: ort_(api), info_(info) {
std::vector<int64_t> stride = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding = ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
const float *mask_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
const float *filter_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
const float *bias_data = (bias != nullptr)
? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
: nullptr;
// const float *bias_data = nullptr;
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch = input_dims[0];
int64_t channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t num_output = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor(
(in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1);
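// Example (illustrative): in_height = 224, padding = 1, dilation = 1, kernel = 3 and
// stride = 2 give out_height = (224 + 2 - 2 - 1) / 2 + 1 = 112 (integer division).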
std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
OrtValue *output =
ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (channels / group) * kernel_height * kernel_width * out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data, batch,
channels, in_height, in_width, num_output, out_height, out_width,
group, deformable_group, channels, num_output, kernel_height,
kernel_width, stride_height, stride_width, padding_height,
padding_width, dilation_height, dilation_width, columns, out_ptr);
allocator_.Free(columns);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp);
REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#define ONNXRUNTIME_MODULATED_DEFORM_CONV_H
#include <onnxruntime_cxx_api.h>
namespace mmdeploy {
struct MMCVModulatedDeformConvKernel {
MMCVModulatedDeformConvKernel(const OrtApi &api, const OrtKernelInfo *info);
void Compute(OrtKernelContext *context);
protected:
Ort::CustomOpApi ort_;
const OrtKernelInfo *info_;
Ort::AllocatorWithDefaultOptions allocator_;
int64_t stride_height_;
int64_t stride_width_;
int64_t padding_height_;
int64_t padding_width_;
int64_t dilation_height_;
int64_t dilation_width_;
int64_t deformable_group_;
int64_t group_;
};
struct MMCVModulatedDeformConvOp
: Ort::CustomOpBase<MMCVModulatedDeformConvOp, MMCVModulatedDeformConvKernel> {
void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
return new MMCVModulatedDeformConvKernel(api, info);
}
const char *GetName() const { return "MMCVModulatedDeformConv2d"; };
size_t GetInputTypeCount() const { return 5; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const {
// The last input (index == 4), the bias, is optional
if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL;
return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED;
}
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
// force cpu
const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; };
};
} // namespace mmdeploy
#endif
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_match.h"
#include <assert.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ort_utils.h"
namespace mmdeploy {
struct Box {
float x1, y1, x2, y2;
};
float nms_match_iou(Box box1, Box box2) {
auto inter_x1 = std::max(box1.x1, box2.x1);
auto inter_y1 = std::max(box1.y1, box2.y1);
auto inter_x2 = std::min(box1.x2, box2.x2);
auto inter_y2 = std::min(box1.y2, box2.y2);
auto eps = 1e-10;
auto w = std::max(static_cast<float>(0), inter_x2 - inter_x1);
auto h = std::max(static_cast<float>(0), inter_y2 - inter_y1);
auto area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
auto area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
auto inter = w * h;
auto ovr = inter / (area1 + area2 - inter + eps);
return ovr;
}
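// Worked example (illustrative): box1 = (0, 0, 2, 2) and box2 = (1, 1, 3, 3) overlap
// on the unit square (1, 1, 2, 2), so inter = 1, area1 = area2 = 4 and the IoU is
// 1 / (4 + 4 - 1) ~= 0.143.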
NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info)
: ort_(api), info_(info) {
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NMSMatchKernel::Compute(OrtKernelContext* context) {
const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2);
const float iou_threshold_data = ort_.GetTensorData<float>(iou_threshold_)[0];
const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3);
const float score_threshold_data = ort_.GetTensorData<float>(score_threshold_)[0];
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
// loop over batch
int64_t nbatch = boxes_dim[0];
int64_t nboxes = boxes_dim[1];
int64_t nclass = scores_dim[1];
assert(boxes_dim[2] == 4);  // (x1, y1, x2, y2)
// alloc some temp memory
bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
std::vector<int64_t> res_order;
for (int64_t k = 0; k < nbatch; k++) {
for (int64_t g = 0; g < nclass; g++) {
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
// scores: the offset k * nboxes * nclass selects the batch and g * nboxes
// selects the class (e.g. batch = 2, boxes = 3, classes = 4)
std::vector<float> tmp_sc;
// get the class scores
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(),
[&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });
for (int64_t _i = 0; _i < nboxes; _i++) {
auto i = order[_i];
if (select[i] == false) continue;
std::vector<int64_t> v_i;
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
auto j = order[_j];
if (select[j] == false) continue;
Box vbox1, vbox2;
vbox1.x1 = boxes_data[k * nboxes * 4 + i * 4];
vbox1.y1 = boxes_data[k * nboxes * 4 + i * 4 + 1];
vbox1.x2 = boxes_data[k * nboxes * 4 + i * 4 + 2];
vbox1.y2 = boxes_data[k * nboxes * 4 + i * 4 + 3];
vbox2.x1 = boxes_data[k * nboxes * 4 + j * 4];
vbox2.y1 = boxes_data[k * nboxes * 4 + j * 4 + 1];
vbox2.x2 = boxes_data[k * nboxes * 4 + j * 4 + 2];
vbox2.y2 = boxes_data[k * nboxes * 4 + j * 4 + 3];
auto ovr = nms_match_iou(vbox1, vbox2);
if (ovr >= iou_threshold_data) {
select[j] = false;
v_i.push_back(j);
}
}
if (tmp_sc[i] > score_threshold_data && v_i.size() != 0) {
for (int v_i_idx = 0; v_i_idx < v_i.size(); v_i_idx++) {
res_order.push_back(k);
res_order.push_back(g);
res_order.push_back(i);
res_order.push_back(v_i[v_i_idx]);
}
}
}
}
}
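// Note (illustrative): res_order now holds flattened quadruples of
// (batch index, class index, kept box index, matched box index), so the output
// tensor below is shaped (N, 4).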
std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 4, 4});
OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
allocator_.Free(select);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_NMS_MATCH_H
#define ONNXRUNTIME_NMS_MATCH_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace mmdeploy {
struct NMSMatchKernel {
NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info);
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
const OrtKernelInfo* info_;
Ort::AllocatorWithDefaultOptions allocator_;
};
struct NMSMatchOp : Ort::CustomOpBase<NMSMatchOp, NMSMatchKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new NMSMatchKernel(api, info);
}
const char* GetName() const { return "NMSMatch"; }
size_t GetInputTypeCount() const { return 4; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
} // namespace mmdeploy
#endif // ONNXRUNTIME_NMS_MATCH_H
// Copyright (c) OpenMMLab. All rights reserved
#include "nms_rotated.h"
#include <assert.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "ort_utils.h"
namespace mmdeploy {
namespace {
struct RotatedBox {
float x_ctr, y_ctr, w, h, a;
};
struct Point {
float x, y;
Point(const float& px = 0, const float& py = 0) : x(px), y(py) {}
Point operator+(const Point& p) const { return Point(x + p.x, y + p.y); }
Point& operator+=(const Point& p) {
x += p.x;
y += p.y;
return *this;
}
Point operator-(const Point& p) const { return Point(x - p.x, y - p.y); }
Point operator*(const float coeff) const { return Point(x * coeff, y * coeff); }
};
float dot_2d(const Point& A, const Point& B) { return A.x * B.x + A.y * B.y; }
float cross_2d(const Point& A, const Point& B) { return A.x * B.y - B.x * A.y; }
} // namespace
void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4]) {
// MODIFIED: box.a is already given in radians, so the degree-to-radian
// conversion (M_PI / 180. == 0.01745329251) is intentionally skipped.
// double theta = box.a * 0.01745329251;
double theta = box.a;
float cosTheta2 = (float)cos(theta) * 0.5f;
float sinTheta2 = (float)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
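// Worked example (illustrative): for box = {x_ctr = 0, y_ctr = 0, w = 2, h = 2, a = 0},
// cosTheta2 = 0.5 and sinTheta2 = 0, so the vertices are (-1, 1), (-1, -1), (1, -1)
// and (1, 1), i.e. the axis-aligned corners of a 2 x 2 square.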
int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4],
Point (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
float det = cross_2d(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
float t1 = cross_2d(vec2[j], vec12) / det;
float t2 = cross_2d(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto& AB = vec2[0];
const auto& DA = vec2[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d(AP, AB);
auto APdotAD = -dot_2d(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto& AB = vec1[0];
const auto& DA = vec1[3];
auto ABdotAB = dot_2d(AB, AB);
auto ADdotAD = dot_2d(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d(AP, AB);
auto APdotAD = -dot_2d(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto& start = p[t]; // starting point
// Step 2:
// Subtract starting point from every points (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
float dist[24];
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d(q[i], q[i]);
}
// CPU version
std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool {
float temp = cross_2d(A, B);
if (fabs(temp) < 1e-6) {
return dot_2d(A, A) < dot_2d(B, B);
} else {
return temp > 0;
}
});
// compute distance to origin after sort, since the points are now different.
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d(q[i], q[i]);
}
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
float polygon_area(const Point (&q)[24], const int& m) {
if (m <= 2) {
return 0;
}
float area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
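// Worked example (illustrative): for the unit square q = (0, 0), (1, 0), (1, 1), (0, 1)
// with m = 4, the two fan triangles each contribute |cross| = 1, so the function
// returns (1 + 1) / 2 = 1.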
float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point intersectPts[24], orderedPts[24];
Point pts1[4];
Point pts2[4];
get_rotated_vertices(box1, pts1);
get_rotated_vertices(box2, pts2);
int num = get_intersection_points(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
return polygon_area(orderedPts, num_convex);
}
NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info)
: ort_(api), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
score_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "score_threshold");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NMSRotatedKernel::Compute(OrtKernelContext* context) {
const float iou_threshold = iou_threshold_;
const float score_threshold = score_threshold_;
const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
// loop over batch
int64_t nbatch = boxes_dim[0];
int64_t nboxes = boxes_dim[1];
int64_t nclass = scores_dim[1];
assert(boxes_dim[2] == 5); //(cx,cy,w,h,theta)
// allocate tmp memory
float* tmp_boxes = (float*)allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5);
float* sc = (float*)allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes);
bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5);
memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes);
// std::vector<std::vector<int64_t>> res_order;
std::vector<int64_t> res_order;
for (int64_t k = 0; k < nbatch; k++) {
for (int64_t g = 0; g < nclass; g++) {
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
// sort scores
std::vector<float> tmp_sc;
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(),
[&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; });
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
RotatedBox box1, box2;
auto center_shift_x =
(tmp_boxes[k * nboxes * 5 + i * 5] + tmp_boxes[k * nboxes * 5 + j * 5]) / 2.0;
auto center_shift_y =
(tmp_boxes[k * nboxes * 5 + i * 5 + 1] + tmp_boxes[k * nboxes * 5 + j * 5 + 1]) / 2.0;
box1.x_ctr = tmp_boxes[k * nboxes * 5 + i * 5] - center_shift_x;
box1.y_ctr = tmp_boxes[k * nboxes * 5 + i * 5 + 1] - center_shift_y;
box1.w = tmp_boxes[k * nboxes * 5 + i * 5 + 2];
box1.h = tmp_boxes[k * nboxes * 5 + i * 5 + 3];
box1.a = tmp_boxes[k * nboxes * 5 + i * 5 + 4];
box2.x_ctr = tmp_boxes[k * nboxes * 5 + j * 5] - center_shift_x;
box2.y_ctr = tmp_boxes[k * nboxes * 5 + j * 5 + 1] - center_shift_y;
box2.w = tmp_boxes[k * nboxes * 5 + j * 5 + 2];
box2.h = tmp_boxes[k * nboxes * 5 + j * 5 + 3];
box2.a = tmp_boxes[k * nboxes * 5 + j * 5 + 4];
auto area1 = box1.w * box1.h;
auto area2 = box2.w * box2.h;
auto intersection = rotated_boxes_intersection(box1, box2);
float baseS = 1.0;
baseS = (area1 + area2 - intersection);
auto ovr = intersection / baseS;
if (ovr > iou_threshold) select[_j] = false;
}
}
for (int i = 0; i < nboxes; i++) {
if (select[i] && (tmp_sc[order[i]] > score_threshold)) {
res_order.push_back(k);
res_order.push_back(g);
res_order.push_back(order[i]);
}
}
} // class loop
} // batch loop
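// Note (illustrative): res_order now holds flattened (batch index, class index,
// box index) triples for the kept boxes, so the output tensor below is shaped (N, 3).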
std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 3, 3});
OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
allocator_.Free(tmp_boxes);
allocator_.Free(sc);
allocator_.Free(select);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_NMS_ROTATED_H
#define ONNXRUNTIME_NMS_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace mmdeploy {
struct NMSRotatedKernel {
NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info);
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
const OrtKernelInfo* info_;
Ort::AllocatorWithDefaultOptions allocator_;
float iou_threshold_;
float score_threshold_;
};
struct NMSRotatedOp : Ort::CustomOpBase<NMSRotatedOp, NMSRotatedKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new NMSRotatedKernel(api, info);
}
const char* GetName() const { return "NMSRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
// force cpu
const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
} // namespace mmdeploy
#endif // ONNXRUNTIME_NMS_ROTATED_H
// Copyright (c) OpenMMLab. All rights reserved.
#include "onnxruntime_register.h"
#include "ort_utils.h"
const char *c_MMDeployOpDomain = "mmdeploy";
OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, const OrtApiBase *api) {
const OrtApi *kOrtApi = api->GetApi(ORT_API_VERSION);
OrtStatus *status = nullptr;
for (auto &_op_list_pair : mmdeploy::get_mmdeploy_custom_ops()) {
OrtCustomOpDomain *domain = nullptr;
if (auto status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain)) {
return status;
}
auto &_op_list = _op_list_pair.second;
for (auto &_op : _op_list) {
if (auto status = kOrtApi->CustomOpDomain_Add(domain, _op)) {
return status;
}
}
// TODO: figure out what is returned if this fails.
status = kOrtApi->AddCustomOpDomain(options, domain);
}
return status;
}
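// Example (illustrative sketch): one way to load these custom ops into an ONNX Runtime
// session after building the library; the model path below is hypothetical.
//
//   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "mmdeploy");
//   Ort::SessionOptions session_options;
//   Ort::ThrowOnError(RegisterCustomOps(session_options, OrtGetApiBase()));
//   Ort::Session session(env, "end2end.onnx", session_options);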
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated
#include "roi_align_rotated.h"
#include "ort_utils.h"
namespace mmdeploy {
// implementation taken from Caffe2
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
float w1;
float w2;
float w3;
float w4;
};
void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height,
const int pooled_width, const int iy_upper,
const int ix_upper, float roi_start_h, float roi_start_w,
float bin_size_h, float bin_size_w, int roi_bin_grid_h,
int roi_bin_grid_w, float roi_center_h, float roi_center_w,
float cos_theta, float sin_theta,
std::vector<PreCalc> &pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const float yy = roi_start_h + ph * bin_size_h +
static_cast<float>(iy + .5f) * bin_size_h /
static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const float xx =
roi_start_w + pw * bin_size_w +
static_cast<float>(ix + .5f) * bin_size_w / static_cast<float>(roi_bin_grid_w);
// Rotate by theta around the center and translate
// In image space, (y, x) is the order for Right Handed System,
// and this is essentially multiplying the point by a rotation matrix
// to rotate it counterclockwise through angle theta.
float y = yy * cos_theta - xx * sin_theta + roi_center_h;
float x = yy * sin_theta + xx * cos_theta + roi_center_w;
// deal with: inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
continue;
}
if (y < 0) {
y = 0;
}
if (x < 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (float)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (float)x_low;
} else {
x_high = x_low + 1;
}
float ly = y - y_low;
float lx = x - x_low;
float hy = 1. - ly, hx = 1. - lx;
float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc[pre_calc_index] = pc;
pre_calc_index += 1;
}
}
}
}
}
void ROIAlignRotatedForwardCPU(const int nthreads, const float *input, const float *rois,
float *output, const float &spatial_scale, const int aligned,
const int clockwise, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int sampling_ratio) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
// #pragma omp parallel for num_threads(32)
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
const float *current_roi = rois + n * 6;
int roi_batch_ind = current_roi[0];
// Do not use rounding; this implementation detail is critical
float offset = aligned ? (float)0.5 : (float)0.0;
float roi_center_w = current_roi[1] * spatial_scale - offset;
float roi_center_h = current_roi[2] * spatial_scale - offset;
float roi_width = current_roi[3] * spatial_scale;
float roi_height = current_roi[4] * spatial_scale;
// float theta = current_roi[5] * M_PI / 180.0;
float theta = current_roi[5]; // Radian angle by default
if (clockwise) {
theta = -theta;
}
float cos_theta = cos(theta);
float sin_theta = sin(theta);
if (!aligned) { // for backward-compatibility only
roi_width = std::max(roi_width, (float)1.);
roi_height = std::max(roi_height, (float)1.);
}
float bin_size_h = static_cast<float>(roi_height) / static_cast<float>(pooled_height);
float bin_size_w = static_cast<float>(roi_width) / static_cast<float>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
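// Example (illustrative): with sampling_ratio = 2 every pooled bin is sampled on a
// 2 x 2 grid of bilinear lookups whose values are averaged below (count = 4).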
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<PreCalc> pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
// roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
// Appropriate translation needs to be applied after.
float roi_start_h = -roi_height / 2.0;
float roi_start_w = -roi_width / 2.0;
pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h,
roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h,
bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h,
roi_center_w, cos_theta, sin_theta, pre_calc);
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const float *offset_input = input + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
float output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
output[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0);
const float *X_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_X));
const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1);
const float *rois = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input_rois));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_X);
OrtTensorDimensions roi_dimensions(ort_, input_rois);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
out_dimensions.data()[0] = roi_dimensions.data()[0];
out_dimensions.data()[2] = aligned_height_;
out_dimensions.data()[3] = aligned_width_;
OrtValue *output =
ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// TODO: forward here
int output_size = out_dimensions.data()[0];
for (auto i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_,
input_channels, input_height, input_width, aligned_height_,
aligned_width_, sampling_ratio_);
}
REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoIAlignRotatedCustomOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#define ONNXRUNTIME_ROI_ALIGN_ROTATED_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
#include <cmath>
#include <mutex>
#include <string>
#include <vector>
namespace mmdeploy {
struct MMCVRoIAlignRotatedKernel {
public:
MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) : ort_(ort) {
aligned_height_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_height");
aligned_width_ = ort_.KernelInfoGetAttribute<int64_t>(info, "output_width");
sampling_ratio_ = ort_.KernelInfoGetAttribute<int64_t>(info, "sampling_ratio");
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
aligned_ = ort_.KernelInfoGetAttribute<int64_t>(info, "aligned");
clockwise_ = ort_.KernelInfoGetAttribute<int64_t>(info, "clockwise");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int aligned_height_;
int aligned_width_;
float spatial_scale_;
int sampling_ratio_;
int aligned_;
int clockwise_;
};
struct MMCVRoIAlignRotatedCustomOp
: Ort::CustomOpBase<MMCVRoIAlignRotatedCustomOp, MMCVRoIAlignRotatedKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRoIAlignRotatedKernel(api, info);
}
const char* GetName() const { return "MMCVRoIAlignRotated"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
};
} // namespace mmdeploy
#endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_tensorrt_ops)
include(${CMAKE_SOURCE_DIR}/cmake/tensorrt.cmake)
# cub
if (NOT DEFINED CUB_ROOT_DIR)
if (CUDA_VERSION VERSION_LESS 11.0)
set(CUB_ROOT_DIR "${CMAKE_SOURCE_DIR}/third_party/cub")
endif ()
endif ()
file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp *.cu)
add_library(${PROJECT_NAME}_obj OBJECT "${BACKEND_OPS_SRCS}")
set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1)
target_compile_definitions(${PROJECT_NAME}_obj
PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1)
target_include_directories(${PROJECT_NAME}_obj
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common)
target_include_directories(${PROJECT_NAME}_obj
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common)
target_include_directories(${PROJECT_NAME}_obj
PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include)
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${TENSORRT_INCLUDE_DIR})
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUDNN_DIR}/include)
target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUB_ROOT_DIR})
target_link_libraries(${PROJECT_NAME}_obj
PUBLIC ${TENSORRT_LIBS} cublas cudnn)
mmdeploy_export(${PROJECT_NAME}_obj)
# Build the module library, which is used to convert ONNX models to TensorRT engines.
mmdeploy_add_module(${PROJECT_NAME} MODULE EXCLUDE "")
target_link_libraries(${PROJECT_NAME} PRIVATE ${PROJECT_NAME}_obj)
add_library(mmdeploy::tensorrt_ops ALIAS ${PROJECT_NAME})
set(_TRT_OPS_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/lib)
install(TARGETS ${PROJECT_NAME} DESTINATION ${_TRT_OPS_DIR})
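# Example configuration (illustrative sketch; the paths are hypothetical and TENSORRT_DIR /
# CUDNN_DIR are assumed to be the variables consumed by cmake/tensorrt.cmake above):
#   cmake -DTENSORRT_DIR=/path/to/TensorRT -DCUDNN_DIR=/path/to/cudnn ..
#   cmake --build . --target mmdeploy_tensorrt_ops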
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modified from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#include "trt_batched_nms.hpp"
#include <cstring>
#include "nms/batched_nms_kernel.hpp"
#include "nms/kernel.h"
#include "trt_serialize.hpp"
namespace mmdeploy {
using namespace nvinfer1;
using nvinfer1::plugin::NMSParameters;
namespace {
static const char* NMS_PLUGIN_VERSION{"1"};
static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"};
} // namespace
TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex)
: TRTPluginBase(name), param(params), mReturnIndex(returnIndex) {}
TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &param);
deserialize_value(&data, &length, &mClipBoxes);
deserialize_value(&data, &length, &mReturnIndex);
}
int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT {
int num = mReturnIndex ? 3 : 2;
return num;
}
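// Output layout (see getOutputDimensions below): output 0 holds the detections with
// shape (batch, keepTopK, 5), output 1 the labels with shape (batch, keepTopK), and,
// when return_index is enabled, output 2 the kept box indices with shape (batch, keepTopK).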
nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
ASSERT(nbInputs == 2);
ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
ASSERT(inputs[0].nbDims == 4);
ASSERT(inputs[1].nbDims == 3);
nvinfer1::DimsExprs ret;
ret.d[0] = inputs[0].d[0];
ret.d[1] = exprBuilder.constant(param.keepTopK);
switch (outputIndex) {
case 0:
ret.nbDims = 3;
ret.d[2] = exprBuilder.constant(5);
break;
case 1:
ret.nbDims = 2;
break;
case 2:
ret.nbDims = 2;
break;
default:
break;
}
return ret;
}
size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT {
size_t batch_size = inputs[0].dims.d[0];
size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
size_t num_priors = inputs[0].dims.d[1];
bool shareLocation = (inputs[0].dims.d[2] == 1);
int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
param.numClasses, num_priors, topk, DataType::kFLOAT,
DataType::kFLOAT);
}
int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
const void* const locData = inputs[0];
const void* const confData = inputs[1];
void* nmsedDets = outputs[0];
void* nmsedLabels = outputs[1];
void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr;
size_t batch_size = inputDesc[0].dims.d[0];
size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
size_t num_priors = inputDesc[0].dims.d[1];
bool shareLocation = (inputDesc[0].dims.d[2] == 1);
int topk =
param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
bool rotated = false;
pluginStatus_t status = nmsInference(
stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold,
DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nmsedIndex,
workSpace, param.isNormalized, false, mClipBoxes, rotated);
ASSERT(status == STATUS_SUCCESS);
return 0;
}
size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT {
// NMSParameters + mClipBoxes + mReturnIndex
return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex);
}
void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, param);
serialize_value(&buffer, mClipBoxes);
serialize_value(&buffer, mReturnIndex);
}
void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
}
bool TRTBatchedNMS::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 3 || pos == 4) {
return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT { return NMS_PLUGIN_VERSION; }
IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT {
auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex);
plugin->setPluginNamespace(mNamespace.c_str());
plugin->setClipParam(mClipBoxes);
return plugin;
}
nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index, const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
ASSERT(index >= 0 && index < this->getNbOutputs());
if (index == 1 || index == 2) {
return nvinfer1::DataType::kINT32;
}
return inputTypes[0];
}
void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
TRTBatchedNMSCreator::TRTBatchedNMSCreator() {
mPluginAttributes.emplace_back(
PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(
PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(
PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
const char* TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
return NMS_PLUGIN_VERSION;
}
IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name,
const PluginFieldCollection* fc) TRT_NOEXCEPT {
const PluginField* fields = fc->fields;
bool clipBoxes = true;
bool returnIndex = false;
nvinfer1::plugin::NMSParameters params{};
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "background_label_id")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "num_classes")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.numClasses = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.topK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "keep_topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.keepTopK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "score_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "iou_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.iouThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "is_normalized")) {
params.isNormalized = *(static_cast<const bool*>(fields[i].data));
} else if (!strcmp(attrName, "clip_boxes")) {
clipBoxes = *(static_cast<const bool*>(fields[i].data));
} else if (!strcmp(attrName, "return_index")) {
returnIndex = *(static_cast<const bool*>(fields[i].data));
}
}
TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex);
plugin->setClipParam(clipBoxes);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call NMS::destroy()
TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator);
} // namespace mmdeploy
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modified from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
#define TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
#include <string>
#include <vector>
#include "NvInferPluginUtils.h"
#include "trt_plugin_base.hpp"
namespace mmdeploy {
enum NMSReturnType { RETURN_DETS = 1, RETURN_INDEX = 1 << 1 };
class TRTBatchedNMS : public TRTPluginBase {
public:
TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param, bool returnIndex);
TRTBatchedNMS(const std::string& name, const void* data, size_t length);
~TRTBatchedNMS() TRT_NOEXCEPT override = default;
int getNbOutputs() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
int nbInputs) const TRT_NOEXCEPT override;
void setClipParam(bool clip);
private:
nvinfer1::plugin::NMSParameters param{};
bool mClipBoxes{};
bool mReturnIndex{};
};
class TRTBatchedNMSCreator : public TRTPluginCreatorBase {
public:
TRTBatchedNMSCreator();
~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_batched_rotated_nms.hpp"
#include <cstring>
#include "nms/batched_nms_kernel.hpp"
#include "nms/kernel.h"
#include "trt_serialize.hpp"
namespace mmdeploy {
using namespace nvinfer1;
using nvinfer1::plugin::NMSParameters;
namespace {
static const char* NMS_PLUGIN_VERSION{"1"};
static const char* NMS_PLUGIN_NAME{"TRTBatchedRotatedNMS"};
} // namespace
TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params)
: TRTPluginBase(name), param(params) {}
TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &param);
deserialize_value(&data, &length, &mClipBoxes);
}
int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT { return 2; }
nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT {
ASSERT(nbInputs == 2);
ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs());
ASSERT(inputs[0].nbDims == 4);
ASSERT(inputs[1].nbDims == 3);
nvinfer1::DimsExprs ret;
ret.d[0] = inputs[0].d[0];
ret.d[1] = exprBuilder.constant(param.keepTopK);
switch (outputIndex) {
case 0:
ret.nbDims = 3;
ret.d[2] = exprBuilder.constant(6);
break;
case 1:
ret.nbDims = 2;
break;
default:
break;
}
return ret;
}
size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT {
size_t batch_size = inputs[0].dims.d[0];
size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3];
size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2];
size_t num_priors = inputs[0].dims.d[1];
bool shareLocation = (inputs[0].dims.d[2] == 1);
int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1];
return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size,
param.numClasses, num_priors, topk, DataType::kFLOAT,
DataType::kFLOAT);
}
int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
const void* const locData = inputs[0];
const void* const confData = inputs[1];
void* nmsedDets = outputs[0];
void* nmsedLabels = outputs[1];
size_t batch_size = inputDesc[0].dims.d[0];
size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3];
size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2];
size_t num_priors = inputDesc[0].dims.d[1];
bool shareLocation = (inputDesc[0].dims.d[2] == 1);
int topk =
param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1];
bool rotated = true;
pluginStatus_t status = nmsInference(
stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId,
num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold,
DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nullptr,
workSpace, param.isNormalized, false, mClipBoxes, rotated);
ASSERT(status == STATUS_SUCCESS);
return 0;
}
size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT {
// NMSParameters + mClipBoxes
return sizeof(NMSParameters) + sizeof(bool);
}
void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, param);
serialize_value(&buffer, mClipBoxes);
}
void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT {
// Validate input arguments
}
bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 3) {
return ioDesc[pos].type == nvinfer1::DataType::kINT32 &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
}
const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; }
const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT {
return NMS_PLUGIN_VERSION;
}
IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT {
auto* plugin = new TRTBatchedRotatedNMS(mLayerName, param);
plugin->setPluginNamespace(mNamespace.c_str());
plugin->setClipParam(mClipBoxes);
return plugin;
}
nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const TRT_NOEXCEPT {
ASSERT(index >= 0 && index < this->getNbOutputs());
if (index == 1) {
return nvinfer1::DataType::kINT32;
}
return inputTypes[0];
}
void TRTBatchedRotatedNMS::setClipParam(bool clip) { mClipBoxes = clip; }
TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator() {
mPluginAttributes.emplace_back(
PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(
PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(
PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT {
return NMS_PLUGIN_NAME;
}
const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT {
return NMS_PLUGIN_VERSION;
}
IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin(
const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
const PluginField* fields = fc->fields;
bool clipBoxes = true;
nvinfer1::plugin::NMSParameters params{};
for (int i = 0; i < fc->nbFields; ++i) {
const char* attrName = fields[i].name;
if (!strcmp(attrName, "background_label_id")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.backgroundLabelId = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "num_classes")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.numClasses = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.topK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "keep_topk")) {
ASSERT(fields[i].type == PluginFieldType::kINT32);
params.keepTopK = *(static_cast<const int*>(fields[i].data));
} else if (!strcmp(attrName, "score_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.scoreThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "iou_threshold")) {
ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
params.iouThreshold = *(static_cast<const float*>(fields[i].data));
} else if (!strcmp(attrName, "is_normalized")) {
params.isNormalized = *(static_cast<const bool*>(fields[i].data));
} else if (!strcmp(attrName, "clip_boxes")) {
clipBoxes = *(static_cast<const bool*>(fields[i].data));
}
}
TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params);
plugin->setClipParam(clipBoxes);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
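// Illustrative sketch (not part of this file) of the field layout the parser
// above expects; every variable name below is hypothetical:
//
//   int numClasses = 15, topK = 1000, keepTopK = 100;
//   float scoreThr = 0.05f, iouThr = 0.5f;
//   std::vector<nvinfer1::PluginField> f{
//       {"num_classes", &numClasses, nvinfer1::PluginFieldType::kINT32, 1},
//       {"topk", &topK, nvinfer1::PluginFieldType::kINT32, 1},
//       {"keep_topk", &keepTopK, nvinfer1::PluginFieldType::kINT32, 1},
//       {"score_threshold", &scoreThr, nvinfer1::PluginFieldType::kFLOAT32, 1},
//       {"iou_threshold", &iouThr, nvinfer1::PluginFieldType::kFLOAT32, 1}};
//   nvinfer1::PluginFieldCollection fc{static_cast<int>(f.size()), f.data()};
//   IPluginV2Ext* nms = TRTBatchedRotatedNMSCreator().createPlugin("rotated_nms", &fc);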
IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name,
const void* serialData,
size_t serialLength) TRT_NOEXCEPT {
// This object will be deleted when the network is destroyed, which will
// call NMS::destroy()
TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, serialData, serialLength);
plugin->setPluginNamespace(mNamespace.c_str());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTBatchedRotatedNMSCreator);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved
#ifndef TRT_BATCHED_ROTATED_NMS_HPP
#define TRT_BATCHED_ROTATED_NMS_HPP
#include <string>
#include <vector>
#include "NvInferPluginUtils.h"
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTBatchedRotatedNMS : public TRTPluginBase {
public:
TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param);
TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length);
~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default;
int getNbOutputs() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs,
int nbInputs, nvinfer1::IExprBuilder& exprBuilder)
TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void* buffer) const TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* outputs,
int nbOutputs) TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
const char* getPluginType() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override;
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType,
int nbInputs) const TRT_NOEXCEPT override;
void setClipParam(bool clip);
private:
nvinfer1::plugin::NMSParameters param{};
bool mClipBoxes{};
};
class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase {
public:
TRTBatchedRotatedNMSCreator();
~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default;
const char* getPluginName() const TRT_NOEXCEPT override;
const char* getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif
// Copyright (c) OpenMMLab. All rights reserved
#include "trt_bicubic_interpolate.hpp"
#include <assert.h>
#include <chrono>
#include "trt_bicubic_interpolate_kernel.hpp"
#include "trt_plugin_helper.hpp"
#include "trt_serialize.hpp"
using namespace nvinfer1;
namespace mmdeploy {
namespace {
static const char *PLUGIN_VERSION{"1"};
static const char *PLUGIN_NAME{"TRTBicubicInterpolate"};
} // namespace
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string &name,
std::vector<float> scale_factor, bool align_corners)
: TRTPluginBase(name), mScaleFactor(scale_factor), mAlignCorners(align_corners) {}
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string &name, const void *data,
                                             size_t length)
: TRTPluginBase(name) {
deserialize_value(&data, &length, &mScaleFactor);
deserialize_value(&data, &length, &mAlignCorners);
}
nvinfer1::IPluginV2DynamicExt *TRTBicubicInterpolate::clone() const TRT_NOEXCEPT {
TRTBicubicInterpolate *plugin =
new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
nvinfer1::DimsExprs ret;
ret.nbDims = 4;
ret.d[0] = inputs[0].d[0];
ret.d[1] = inputs[0].d[1];
auto height = exprBuilder.constant(mScaleFactor[0]);
auto width = exprBuilder.constant(mScaleFactor[1]);
auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height);
auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width);
ret.d[2] = d2;
ret.d[3] = d3;
return ret;
}
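// Example (illustrative): an input of shape [1, 3, 32, 48] with
// mScaleFactor = {2, 2} yields output dims [1, 3, 64, 96]. Note that
// IExprBuilder::constant() takes an int32_t, so fractional scale factors are
// truncated when the output expression is built.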
bool TRTBicubicInterpolate::supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc *ioDesc,
int nbInputs, int nbOutputs) TRT_NOEXCEPT {
if (pos == 0) {
return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
} else {
return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
}
}
void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *outputs,
int nbOutputs) TRT_NOEXCEPT {}
size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT {
return 0;
}
int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc,
const void *const *inputs, void *const *outputs, void *workSpace,
cudaStream_t stream) TRT_NOEXCEPT {
int batch = inputDesc[0].dims.d[0];
int channels = inputDesc[0].dims.d[1];
int height = inputDesc[0].dims.d[2];
int width = inputDesc[0].dims.d[3];
int height_out = outputDesc[0].dims.d[2];
int width_out = outputDesc[0].dims.d[3];
const void *x = inputs[0];
void *output = outputs[0];
// TODO: add fp16 support
auto data_type = inputDesc[0].type;
switch (data_type) {
case nvinfer1::DataType::kFLOAT:
bicubic_interpolate<float>((float *)x, (float *)output, batch, channels, height, width,
height_out, width_out, mAlignCorners, stream);
break;
    default:
      // Only float32 is currently supported; see the TODO above for fp16.
      return 1;
}
return 0;
}
nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index,
const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT {
return inputTypes[0];
}
// IPluginV2 Methods
const char *TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT { return 1; }
size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT {
return serialized_size(mScaleFactor) + serialized_size(mAlignCorners);
}
void TRTBicubicInterpolate::serialize(void *buffer) const TRT_NOEXCEPT {
serialize_value(&buffer, mScaleFactor);
serialize_value(&buffer, mAlignCorners);
}
////////////////////// creator /////////////////////////////
TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator() {
mPluginAttributes.clear();
mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor"));
mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners"));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
const char *TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
const char *TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT {
return PLUGIN_VERSION;
}
nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::createPlugin(
const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
  std::vector<float> scale_factor;
  bool align_corners = true;
for (int i = 0; i < fc->nbFields; i++) {
if (fc->fields[i].data == nullptr) {
continue;
}
std::string field_name(fc->fields[i].name);
if (field_name.compare("scale_factor") == 0) {
      // The field length may be reported either as an element count or as a
      // byte count; normalize it to the number of float elements.
      int data_size = fc->fields[i].length;
      if (data_size != 2) {
        data_size = data_size / sizeof(float);
      }
      ASSERT(data_size == 2);
const float *data_start = static_cast<const float *>(fc->fields[i].data);
scale_factor = std::vector<float>(data_start, data_start + data_size);
}
if (field_name.compare("align_corners") == 0) {
align_corners = static_cast<const int *>(fc->fields[i].data)[0];
}
}
TRTBicubicInterpolate *plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
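// Illustrative sketch of the fields parsed above (names are hypothetical):
//
//   float scale[2] = {2.f, 2.f};
//   int align = 0;
//   nvinfer1::PluginField f[] = {
//       {"scale_factor", scale, nvinfer1::PluginFieldType::kFLOAT32, 2},
//       {"align_corners", &align, nvinfer1::PluginFieldType::kINT32, 1}};
//   nvinfer1::PluginFieldCollection fc{2, f};
//   // createPlugin("bicubic_resize", &fc) then returns a plugin that doubles
//   // the spatial dimensions with align_corners disabled.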
nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::deserializePlugin(
const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT {
auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength);
plugin->setPluginNamespace(getPluginNamespace());
return plugin;
}
REGISTER_TENSORRT_PLUGIN(TRTBicubicInterpolateCreator);
} // namespace mmdeploy
#ifndef TRT_BICUBIC_INTERPOLATE_HPP
#define TRT_BICUBIC_INTERPOLATE_HPP
#include <cublas_v2.h>
#include <memory>
#include <string>
#include <vector>
#include "trt_plugin_base.hpp"
namespace mmdeploy {
class TRTBicubicInterpolate : public TRTPluginBase {
public:
TRTBicubicInterpolate(const std::string &name, std::vector<float> scale_factor,
bool align_corners);
  TRTBicubicInterpolate(const std::string &name, const void *data, size_t length);
TRTBicubicInterpolate() = delete;
// IPluginV2DynamicExt Methods
nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
TRT_NOEXCEPT override;
bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
int nbOutputs) TRT_NOEXCEPT override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
const nvinfer1::DynamicPluginTensorDesc *out,
int nbOutputs) TRT_NOEXCEPT override;
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
const nvinfer1::PluginTensorDesc *outputs,
int nbOutputs) const TRT_NOEXCEPT override;
int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
// IPluginV2Ext Methods
nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
int nbInputs) const TRT_NOEXCEPT override;
// IPluginV2 Methods
const char *getPluginType() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
int getNbOutputs() const TRT_NOEXCEPT override;
size_t getSerializationSize() const TRT_NOEXCEPT override;
void serialize(void *buffer) const TRT_NOEXCEPT override;
private:
std::vector<float> mScaleFactor;
bool mAlignCorners;
};
class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase {
public:
TRTBicubicInterpolateCreator();
const char *getPluginName() const TRT_NOEXCEPT override;
const char *getPluginVersion() const TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
TRT_NOEXCEPT override;
nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
size_t serialLength) TRT_NOEXCEPT override;
};
} // namespace mmdeploy
#endif // TRT_BICUBIC_INTERPOLATE_HPP
// Modified from
// https://github.com/pytorch/pytorch/blob/6adbe044e39c8e8db158d91e151aa6dead6e9aa4/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu
#include <cuda_fp16.h>
#include <stdio.h>
#include <algorithm>
#include <cmath>
#include <vector>
#include "common_cuda_helper.hpp"
#include "trt_bicubic_interpolate_kernel.hpp"
// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
}
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
template <typename scalar_t>
__device__ __forceinline__ static void get_cubic_upsample_coefficients(scalar_t coeffs[4],
scalar_t t) {
scalar_t A = -0.75;
scalar_t x1 = t;
coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
// opposite coefficients
scalar_t x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
template <typename scalar_t>
__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
scalar_t x3, scalar_t t) {
scalar_t coeffs[4];
get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
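// The two helpers above implement the cubic convolution kernel referenced at
// the top of this file with A = -0.75. For any t in [0, 1] the four
// coefficients sum to 1; for example, at t = 0 they evaluate to {0, 1, 0, 0},
// so cubic_interp1d() simply returns x1, the sample lying exactly on the
// source grid.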
/* Used by UpSampleBicubic2d.cu */
template <typename scalar_t>
__device__ __forceinline__ static scalar_t upsample_get_value_bounded(const scalar_t *data,
int batch, int channel,
int batchsize, int channels,
int height, int width, int y,
int x) {
int access_y = max(min(y, height - 1), 0);
int access_x = max(min(x, width - 1), 0);
return data[batch * channels * height * width + channel * height * width + access_y * width +
access_x];
}
template <typename scalar_t>
__device__ __forceinline__ scalar_t
area_pixel_compute_source_index(scalar_t scale, int64_t dst_index, bool align_corners, bool cubic) {
if (align_corners) {
return scale * dst_index;
} else {
scalar_t src_idx = scale * (dst_index + 0.5) - 0.5;
// [Note] Follow Opencv resize logic:
// We allow negative src_idx here and later will use
// dx = src_idx - floorf(src_idx)
// to compute the "distance"(which affects weights).
// For linear modes, weight distribution doesn't matter
// for negative indices as they use 2 pixels to interpolate.
// For example, [-1, 0], they both use pixel 0 value so it
// doesn't affect if we bound the src_idx to 0 or not.
// TODO: Our current linear mode impls use unbound indices
// where we should and then remove this cubic flag.
// This matters in cubic mode, as we might need [-1, 0, 1, 2]
// to interpolate and the weights can be affected.
return (!cubic && src_idx < 0) ? scalar_t(0) : src_idx;
}
}
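// Worked example: upsampling width 4 -> 8 gives scale = 0.5 when
// align_corners is false, so dst_index 0 maps to 0.5 * 0.5 - 0.5 = -0.25
// (kept negative because cubic is true) and dst_index 7 maps to 3.25.
// With align_corners true the scale is (4 - 1) / (8 - 1) = 3/7, so
// dst_index 7 maps exactly onto the last source column, 3.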
// cubic interpolation pytorch
template <typename scalar_t>
__global__ void resize_cubic_kernel_torch(const int num_elements, const scalar_t *src,
const int batchsize, const int channels, int srcWidth,
int srcHeight, scalar_t *dst, int dstWidth, int dstHeight,
bool align_corners, float height_scale,
float width_scale) {
CUDA_1D_KERNEL_LOOP(index, num_elements) {
    const int output_x = index % dstWidth;
    const int output_y = index / dstWidth;
    // Special case: input and output are the same size, just copy
    if (srcHeight == dstHeight && srcWidth == dstWidth) {
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; c++) {
const scalar_t val = src[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth +
output_y * dstWidth + output_x];
dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth +
output_x] = val;
}
}
return;
}
// Interpolation kernel
scalar_t real_x =
area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true);
int in_x = floorf(real_x);
scalar_t t_x = real_x - in_x;
scalar_t real_y =
area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true);
int in_y = floorf(real_y);
scalar_t t_y = real_y - in_y;
for (int n = 0; n < batchsize; n++) {
for (int c = 0; c < channels; c++) {
scalar_t coefficients[4];
for (int k = 0; k < 4; k++) {
coefficients[k] = cubic_interp1d<scalar_t>(
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x - 1),
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x + 0),
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x + 1),
upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth,
in_y - 1 + k, in_x + 2),
t_x);
}
dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth +
output_x] = scalar_t(cubic_interp1d(coefficients[0], coefficients[1], coefficients[2],
coefficients[3], t_y));
}
}
}
}
template <typename scalar_t>
void resizeGPU(const scalar_t *pIn_d, scalar_t *pOut_d, int batch, int channels, int srcWidth,
int srcHeight, int dstWidth, int dstHeight, bool align_corners,
cudaStream_t stream) {
float height_scale = float(srcHeight) / dstHeight;
float width_scale = float(srcWidth) / dstWidth;
if (align_corners && dstWidth > 1 && dstHeight > 1) {
height_scale = (float)(srcHeight - 1) / (dstHeight - 1);
width_scale = (float)(srcWidth - 1) / (dstWidth - 1);
}
  // Each thread handles one output (y, x) position and iterates over batch and
  // channels internally, so the kernel is given dstWidth * dstHeight work
  // items; the grid itself is sized from the full element count.
  int n = batch * dstWidth * dstHeight * channels;
  resize_cubic_kernel_torch<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
      dstWidth * dstHeight, pIn_d, batch, channels, srcWidth, srcHeight, pOut_d, dstWidth,
      dstHeight, align_corners, height_scale, width_scale);
}
template <typename scalar_t>
void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels,
int in_height, int in_width, int out_height, int out_width,
bool align_corners, cudaStream_t stream) {
resizeGPU(input, output, batch, channels, in_width, in_height, out_width, out_height,
align_corners, stream);
}
template void bicubic_interpolate<float>(const float *input, float *output, int batch, int channels,
int in_height, int in_width, int out_height, int out_width,
bool align_corners, cudaStream_t stream);
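// Minimal host-side sketch (illustrative; assumes the buffers below are
// allocated and filled by the caller) of exercising the explicit float
// instantiation above:
//
//   const int n = 1, c = 3, h = 32, w = 32, oh = 64, ow = 64;
//   float *d_in = nullptr, *d_out = nullptr;
//   cudaMalloc(&d_in, sizeof(float) * n * c * h * w);
//   cudaMalloc(&d_out, sizeof(float) * n * c * oh * ow);
//   // ... copy the input image into d_in ...
//   bicubic_interpolate<float>(d_in, d_out, n, c, h, w, oh, ow,
//                              /*align_corners=*/false, /*stream=*/nullptr);
//   cudaStreamSynchronize(nullptr);
//   cudaFree(d_in);
//   cudaFree(d_out);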
#ifndef TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
#define TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
#include <cuda_runtime.h>
#include "common_cuda_helper.hpp"
template <typename scalar_t>
void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels,
int in_height, int in_width, int out_height, int out_width,
bool align_corners, cudaStream_t stream);
#endif // TRT_BICUBIC_INTERPOLATE_KERNEL_HPP
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER
#include <cublas_v2.h>
#include <cuda.h>
#include <stdio.h>
#include <algorithm>
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define THREADS_PER_BLOCK 512
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
inline int GET_BLOCKS(const int N) {
int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK);
int max_block_num = 4096;
return std::min(optimal_block_num, max_block_num);
}
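// Example: N = 1000 with THREADS_PER_BLOCK = 512 gives DIVUP(1000, 512) = 2
// blocks; very large N is capped at 4096 blocks and the grid-stride loop in
// CUDA_1D_KERNEL_LOOP covers the remaining elements.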
#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
exit(0); \
} \
}
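// Illustrative usage of the helpers above (the kernel name is hypothetical):
//
//   template <typename scalar_t>
//   __global__ void scale_kernel(int n, const scalar_t* in, scalar_t* out, scalar_t k) {
//     CUDA_1D_KERNEL_LOOP(i, n) { out[i] = k * in[i]; }  // grid-stride loop
//   }
//   ...
//   scale_kernel<float><<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(n, d_in, d_out, 2.f);
//   cudaCheckError();  // report any launch or asynchronous error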
/**
 * Copies the source tensor into dst with its dimensions permuted.
*
* @param[out] dst pointer to the destination tensor
* @param[in] src pointer to the source tensor
* @param[in] src_size shape of the src tensor
* @param[in] permute The desired ordering of dimensions
* @param[in] src_dim dim of src tensor
* @param[in] stream cuda stream handle
*/
template <class scalar_t>
void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim,
cudaStream_t stream = 0);
template <typename scalar_t>
cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha,
const scalar_t* A, int lda, const scalar_t* B, int ldb,
const scalar_t* beta, scalar_t* C, int ldc);
template <typename scalar_t>
__device__ __forceinline__ scalar_t bilinear_interpolate(const scalar_t* __restrict__ input,
const int height, const int width,
scalar_t y, scalar_t x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
y = min(scalar_t(height - 1), max(scalar_t(0), y));
x = min(scalar_t(width - 1), max(scalar_t(0), x));
const int y_low = floor(y);
const int x_low = floor(x);
const int y_high = ceil(y);
const int x_high = ceil(x);
const scalar_t v1 = input[y_low * width + x_low];
const scalar_t v2 = input[y_low * width + x_high];
const scalar_t v3 = input[y_high * width + x_low];
const scalar_t v4 = input[y_high * width + x_high];
// lerp can be performed by fma
const scalar_t ly = y - y_low;
const scalar_t lx = x - x_low;
const scalar_t v_low = fma(v2 - v1, lx, v1);
const scalar_t v_high = fma(v4 - v3, lx, v3);
const scalar_t val = fma(v_high - v_low, ly, v_low);
return val;
}
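// Worked example: with corner values v1 = 0, v2 = 2, v3 = 4, v4 = 6 and a
// query point at ly = 0.25, lx = 0.5, the two horizontal lerps give
// v_low = 1 and v_high = 5, and the final value is 1 + (5 - 1) * 0.25 = 2.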
#endif // COMMON_CUDA_HELPER
// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
// modify from
// https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
#ifndef TRT_BATCHED_NMS_KERNEL_HPP
#define TRT_BATCHED_NMS_KERNEL_HPP
#include "cuda_runtime_api.h"
#include "kernel.h"
pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
const int perBatchScoresSize, const bool shareLocation,
const int backgroundLabelId, const int numPredsPerClass,
const int numClasses, const int topK, const int keepTopK,
const float scoreThreshold, const float iouThreshold,
const DataType DT_BBOX, const void* locData, const DataType DT_SCORE,
const void* confData, void* nmsedDets, void* nmsedLabels,
void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid,
bool clipBoxes, bool rotated = false);
#endif