Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support only cpu compile
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream);
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
}
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& buff = outs[0];
auto& grad_input = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha, stream);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
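
For reference, the per-element math these sigmoid focal loss kernels are expected to compute, assuming the standard mmdetection formulation, can be sketched on the CPU as follows (the helper name and signature are illustrative, not part of this commit):

#include <algorithm>
#include <cmath>
#include <cstdint>

// CPU reference sketch: input is an N x C logit array, target holds N class
// indices, weight (optional) holds C per-class weights, output is N x C.
void sigmoid_focal_loss_forward_cpu_ref(const float* input,
                                        const int64_t* target,
                                        const float* weight, float* output,
                                        int n, int num_classes, float gamma,
                                        float alpha) {
  for (int i = 0; i < n; ++i) {
    for (int c = 0; c < num_classes; ++c) {
      // p = sigmoid(logit)
      float p = 1.f / (1.f + std::exp(-input[i * num_classes + c]));
      float loss;
      if (target[i] == c) {
        // positive class: -alpha * (1 - p)^gamma * log(p)
        loss = -alpha * std::pow(1.f - p, gamma) *
               std::log(std::max(p, 1e-12f));
      } else {
        // negative class: -(1 - alpha) * p^gamma * log(1 - p)
        loss = -(1.f - alpha) * std::pow(p, gamma) *
               std::log(std::max(1.f - p, 1e-12f));
      }
      if (weight != nullptr) loss *= weight[target[i]];
      output[i * num_classes + c] = loss;
    }
  }
}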
#include "parrots_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) {
int output_size = grad_input.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma,
alpha, num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.elemType().prim(), ([&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream) {
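// Backward runs in two passes: the first kernel computes a per-sample
// coefficient into buff, the second combines buff with the softmax output
// to fill grad_input.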
int output_size = buff.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream);
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w,
stream);
}
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda)
.done();
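
For reference, MaskedIm2colForward gathers, for every masked spatial location and every (channel, kernel-offset) pair, one padded input pixel into the column buffer; out-of-range reads produce zeros. A minimal CPU sketch under those assumptions (illustrative, not part of this commit):

#include <cstdint>

// bottom_data: 1 x C x H x W input; top_data: (C * kernel_h * kernel_w) x
// mask_cnt column buffer; mask_h_idx / mask_w_idx: masked positions.
void masked_im2col_cpu_ref(const float* bottom_data, const int64_t* mask_h_idx,
                           const int64_t* mask_w_idx, float* top_data,
                           int channels, int height, int width, int kernel_h,
                           int kernel_w, int pad_h, int pad_w, int mask_cnt) {
  for (int c = 0; c < channels; ++c) {
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        int row = (c * kernel_h + kh) * kernel_w + kw;
        for (int m = 0; m < mask_cnt; ++m) {
          int h = static_cast<int>(mask_h_idx[m]) + kh - pad_h;
          int w = static_cast<int>(mask_w_idx[m]) + kw - pad_w;
          bool inside = h >= 0 && h < height && w >= 0 && w < width;
          // out-of-bounds taps read as zero (implicit zero padding)
          top_data[row * mask_cnt + m] =
              inside ? bottom_data[(c * height + h) * width + w] : 0.f;
        }
      }
    }
  }
}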
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) {
int channels = bottom_data.dim(1);
int height = bottom_data.dim(2);
int width = bottom_data.dim(3);
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(),
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(),
mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) 2019, SenseTime.
#include "parrots_cpp_helper.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite output, DArrayLite columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite columns, DArrayLite grad_input, DArrayLite grad_weight,
DArrayLite grad_bias, DArrayLite grad_offset, DArrayLite grad_mask,
DArrayLite grad_output, int kernel_h, int kernel_w, int stride_h,
int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w,
int group, int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void modulated_deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto output = outs[0];
auto columns = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
void modulated_deform_conv_backward_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto columns = outs[0];
auto grad_input = outs[1];
auto grad_weight = outs[2];
auto grad_bias = outs[3];
auto grad_offset = outs[4];
auto grad_mask = outs[5];
auto grad_output = outs[6];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda)
.done();
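
For context, each tap of a modulated deformable convolution (the DCNv2 formulation) samples the input at a fractionally offset location by bilinear interpolation and scales the result by a learned modulation mask before the usual weighted sum. A minimal bilinear-sampling sketch (illustrative, not part of this commit):

#include <cmath>

// Sample a single-channel H x W image at fractional coordinates (y, x);
// out-of-bounds corners contribute zero, as with zero padding at the border.
float bilinear_sample(const float* im, int height, int width, float y,
                      float x) {
  int y0 = static_cast<int>(std::floor(y));
  int x0 = static_cast<int>(std::floor(x));
  float ly = y - y0, lx = x - x0, v = 0.f;
  for (int dy = 0; dy <= 1; ++dy) {
    for (int dx = 0; dx <= 1; ++dx) {
      int yy = y0 + dy, xx = x0 + dx;
      if (yy < 0 || yy >= height || xx < 0 || xx >= width) continue;
      // each corner's weight is the product of the opposite fractional parts
      v += ((dy ? ly : 1.f - ly) * (dx ? lx : 1.f - lx)) * im[yy * width + xx];
    }
  }
  return v;
}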
#include "parrots_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
}
void nms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes),
getHostProxy());
select.setZeros(syncStream());
if (boxes.size() == 0) {
outs[0] = select;
return;
}
fill(ctx, select, *toScalar(1));
auto select_ptr = select.ptr<int64_t>();
auto boxes_ptr = boxes.ptr<float>();
auto order_ptr = order.ptr<int64_t>();
auto areas_ptr = areas.ptr<float>();
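// Greedy NMS over boxes in descending score order (given by `order`):
// keep each surviving box and suppress every later box whose IoU with it
// reaches iou_threshold.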
for (size_t _i = 0; _i < nboxes; _i++) {
if (select_ptr[_i] == 0) continue;
auto i = order_ptr[_i];
auto ix1 = boxes_ptr[i * boxes_dim];
auto iy1 = boxes_ptr[i * boxes_dim + 1];
auto ix2 = boxes_ptr[i * boxes_dim + 2];
auto iy2 = boxes_ptr[i * boxes_dim + 3];
auto iarea = areas_ptr[i];
for (size_t _j = _i + 1; _j < nboxes; _j++) {
if (select_ptr[_j] == 0) continue;
auto j = order_ptr[_j];
auto xx1 = fmaxf(ix1, boxes_ptr[j * boxes_dim]);
auto yy1 = fmaxf(iy1, boxes_ptr[j * boxes_dim + 1]);
auto xx2 = fminf(ix2, boxes_ptr[j * boxes_dim + 2]);
auto yy2 = fminf(iy2, boxes_ptr[j * boxes_dim + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas_ptr[j] - inter);
if (ovr >= iou_threshold) select_ptr[_j] = 0;
}
}
outs[0] = select;
}
void softnms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
float sigma;
float min_score;
int method;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<float>("sigma", sigma)
.get<float>("min_score", min_score)
.get<int>("method", method)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& scores = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto boxes_ptr = boxes.ptr<float>();
auto scores_ptr = scores.ptr<float>();
auto areas_ptr = areas.ptr<float>();
auto inputs = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 6)));
auto inputs_ptr = inputs.ptr<float>();
auto dets = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 5)));
auto de = dets.ptr<float>();
for (size_t i = 0; i < nboxes; i++) {
inputs_ptr[i * 6 + 0] = boxes_ptr[i * boxes_dim + 0];
inputs_ptr[i * 6 + 1] = boxes_ptr[i * boxes_dim + 1];
inputs_ptr[i * 6 + 2] = boxes_ptr[i * boxes_dim + 2];
inputs_ptr[i * 6 + 3] = boxes_ptr[i * boxes_dim + 3];
inputs_ptr[i * 6 + 4] = scores_ptr[i];
inputs_ptr[i * 6 + 5] = areas_ptr[i];
}
size_t pos = 0;
auto inds_t = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes));
arange(ctx, *toScalar(0), *toScalar(nboxes), *toScalar(1), inds_t);
auto inds = inds_t.ptr<int64_t>();
auto num_out = ctx.createDArrayLite(DArraySpec::scalar(Prim::Int64));
for (size_t i = 0; i < nboxes; i++) {
auto max_score = inputs_ptr[i * 6 + 4];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < inputs_ptr[pos * 6 + 4]) {
max_score = inputs_ptr[pos * 6 + 4];
max_pos = pos;
}
pos = pos + 1;
}
// swap
auto ix1 = de[i * 5 + 0] = inputs_ptr[max_pos * 6 + 0];
auto iy1 = de[i * 5 + 1] = inputs_ptr[max_pos * 6 + 1];
auto ix2 = de[i * 5 + 2] = inputs_ptr[max_pos * 6 + 2];
auto iy2 = de[i * 5 + 3] = inputs_ptr[max_pos * 6 + 3];
auto iscore = de[i * 5 + 4] = inputs_ptr[max_pos * 6 + 4];
auto iarea = inputs_ptr[max_pos * 6 + 5];
auto iind = inds[max_pos];
inputs_ptr[max_pos * 6 + 0] = inputs_ptr[i * 6 + 0];
inputs_ptr[max_pos * 6 + 1] = inputs_ptr[i * 6 + 1];
inputs_ptr[max_pos * 6 + 2] = inputs_ptr[i * 6 + 2];
inputs_ptr[max_pos * 6 + 3] = inputs_ptr[i * 6 + 3];
inputs_ptr[max_pos * 6 + 4] = inputs_ptr[i * 6 + 4];
inputs_ptr[max_pos * 6 + 5] = inputs_ptr[i * 6 + 5];
inds[max_pos] = inds[i];
inputs_ptr[i * 6 + 0] = ix1;
inputs_ptr[i * 6 + 1] = iy1;
inputs_ptr[i * 6 + 2] = ix2;
inputs_ptr[i * 6 + 3] = iy2;
inputs_ptr[i * 6 + 4] = iscore;
inputs_ptr[i * 6 + 5] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = fmaxf(ix1, inputs_ptr[pos * 6 + 0]);
auto yy1 = fmaxf(iy1, inputs_ptr[pos * 6 + 1]);
auto xx2 = fminf(ix2, inputs_ptr[pos * 6 + 2]);
auto yy2 = fminf(iy2, inputs_ptr[pos * 6 + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + inputs_ptr[pos * 6 + 5] - inter);
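// Decay the score of box `pos`: method 0 = hard NMS (zero the score above
// the IoU threshold), 1 = linear decay, 2 = Gaussian decay.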
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = exp(-(ovr * ovr) / sigma);
}
inputs_ptr[pos * 6 + 4] *= weight;
// if the box score falls below min_score, discard it by swapping it with
// the last box and shrinking the effective box count
if (inputs_ptr[pos * 6 + 4] < min_score) {
inputs_ptr[pos * 6 + 0] = inputs_ptr[(nboxes - 1) * 6 + 0];
inputs_ptr[pos * 6 + 1] = inputs_ptr[(nboxes - 1) * 6 + 1];
inputs_ptr[pos * 6 + 2] = inputs_ptr[(nboxes - 1) * 6 + 2];
inputs_ptr[pos * 6 + 3] = inputs_ptr[(nboxes - 1) * 6 + 3];
inputs_ptr[pos * 6 + 4] = inputs_ptr[(nboxes - 1) * 6 + 4];
inputs_ptr[pos * 6 + 5] = inputs_ptr[(nboxes - 1) * 6 + 5];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
setScalar(num_out, static_cast<int64_t>(nboxes));
outs[0] = dets;
outs[1] = inds_t;
outs[2] = num_out;
}
void nms_match_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get<float>("iou_threshold", iou_threshold).done();
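// NOTE: only the attribute is parsed here; the match computation itself is
// not implemented in this file.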
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(3)
.output(1)
.apply(nms_cpu)
#ifdef PARROTS_USE_CUDA
.apply(nms_cuda)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(3)
.apply(softnms_cpu)
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_cpu)
.done();
#include "parrots_cpp_helper.hpp"
using namespace parrots;