Unverified commit 48d99025, authored by z55250825, committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all old parrots files and add the new parrots op impl for bbox_overlaps

* support the first new-impl parrots op (bbox_overlaps); test succeeded

* add box_iou_rotated op, test succeeded

* add carafe and carafe_naive ops, test succeeded (one parrots bug needs fixing)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
#ifndef MASKED_CONV2D_PYTORCH_H
#define MASKED_CONV2D_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w);
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels);
#endif // MASKED_CONV2D_PYTORCH_H
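// Not part of this header: a minimal sketch of how a parrots-side wrapper for
// masked_im2col_forward_cuda could look, following the same buildATensor
// pattern used by the other ops in this commit. The wrapper name and the
// attr/input/output layout below are illustrative assumptions, not the actual
// masked_conv2d registration.
void masked_im2col_forward_cuda_parrots(CudaContext& ctx,
                                        const SSElement& attr,
                                        const OperatorBase::in_list_t& ins,
                                        OperatorBase::out_list_t& outs) {
  int kernel_h, kernel_w, pad_h, pad_w;
  SSAttrs(attr)
      .get<int>("kernel_h", kernel_h)
      .get<int>("kernel_w", kernel_w)
      .get<int>("pad_h", pad_h)
      .get<int>("pad_w", pad_w)
      .done();
  const auto& im = buildATensor(ctx, ins[0]);
  const auto& mask_h_idx = buildATensor(ctx, ins[1]);
  const auto& mask_w_idx = buildATensor(ctx, ins[2]);
  auto col = buildATensor(ctx, outs[0]);
  masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
                             kernel_w, pad_h, pad_w);
}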
// Copyright (c) 2019, SenseTime.
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void ModulatedDeformConvForwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias);
void modulated_deform_conv_forward_cuda(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  ModulatedDeformConvForwardCUDAKernelLauncher(
      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
}
void modulated_deform_conv_backward_cuda(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  ModulatedDeformConvBackwardCUDAKernelLauncher(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
}
#endif
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(bias);
    CHECK_CUDA_INPUT(ones);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(mask);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(columns);

    modulated_deform_conv_forward_cuda(
        input, weight, bias, ones, offset, mask, output, columns, kernel_h,
        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
        group, deformable_group, with_bias);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("ModulatedDeformConv is not implemented on CPU");
}
}
void modulated_deform_conv_backward(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(ones);
CHECK_CUDA_INPUT(offset);
CHECK_CUDA_INPUT(mask);
CHECK_CUDA_INPUT(columns);
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_weight);
CHECK_CUDA_INPUT(grad_bias);
CHECK_CUDA_INPUT(grad_offset);
CHECK_CUDA_INPUT(grad_mask);
CHECK_CUDA_INPUT(grad_output);
modulated_deform_conv_backward_cuda(
input, weight, bias, ones, offset, mask, columns, grad_input,
grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
group, deformable_group, with_bias);
#else
AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
} else {
AT_ERROR("ModulatedDeformConv is not implemented on CPU");
}
}
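// Not part of this diff: a minimal sketch of how the two dispatcher functions
// above could be exposed to Python with pybind11 / torch extensions. The
// module setup and doc strings are illustrative assumptions; mmcv's actual
// binding file may differ.
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward,
        "modulated deform conv forward (CUDA)");
  m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward,
        "modulated deform conv backward (CUDA)");
}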
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
void modulated_deformable_im2col_cuda(
    const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor data_col) {
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_im_, data_offset_, data_mask_, height_im,
            width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group, batch_size,
            channels, deformable_group, height_col, width_col, data_col_);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
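// Launch-size arithmetic for the dispatch above, as a worked example: with
// batch_size = 1, channels = 64 and a 56 x 56 output, num_kernels is
// 64 * 1 * 56 * 56 = 200704, i.e. one thread per (channel, output location)
// pair. Assuming GET_BLOCKS is the usual ceil-division helper and
// THREADS_PER_BLOCK is 512 (both come from the cuda helper header and may be
// defined differently), the kernel is launched with 200704 / 512 = 392 blocks
// of 512 threads.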
void modulated_deformable_col2im_cuda(
    const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
    const int batch_size, const int channels, const int height_im,
    const int width_im, const int height_col, const int width_col,
    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
    const int stride_h, const int stride_w, const int dilation_h,
    const int dilation_w, const int deformable_group, Tensor grad_im) {
  const int channel_per_deformable_group = channels / deformable_group;
  const int num_kernels =
      channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        modulated_deformable_col2im_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_col_, data_offset_, data_mask_, channels,
            height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
            stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, deformable_group, height_col, width_col, grad_im_);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_coord_cuda(
    const Tensor data_col, const Tensor data_im, const Tensor data_offset,
    const Tensor data_mask, const int batch_size, const int channels,
    const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w, const int deformable_group,
    Tensor grad_offset, Tensor grad_mask) {
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
                          kernel_w * deformable_group;
  const int channel_per_deformable_group =
      channels * kernel_h * kernel_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();

        modulated_deformable_col2im_coord_gpu_kernel<<<
            GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
            at::cuda::getCurrentCUDAStream()>>>(
            num_kernels, data_col_, data_im_, data_offset_, data_mask_,
            channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
            stride_h, stride_w, dilation_h, dilation_w,
            channel_per_deformable_group, batch_size,
            2 * kernel_h * kernel_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_, grad_mask_);
      }));

  AT_CUDA_CHECK(cudaGetLastError());
}
void ModulatedDeformConvForwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_out = weight.size(0);
  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);

  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
    AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
             kernel_h_, kernel_w, kernel_h_, kernel_w_);
  if (channels != channels_kernel * group)
    AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
             channels, channels_kernel * group);

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  // resize output
  output = output.view({batch, channels_out, height_out, width_out}).zero_();
  // resize temporary columns
  columns =
      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
                input.options());

  output = output.view({output.size(0), group, output.size(1) / group,
                        output.size(2), output.size(3)});

  for (int b = 0; b < batch; b++) {
    modulated_deformable_im2col_cuda(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    // divide into group
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});

    for (int g = 0; g < group; g++) {
      output[b][g] = output[b][g]
                         .flatten(1)
                         .addmm_(weight[g].flatten(1), columns[g])
                         .view_as(output[b][g]);
    }

    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});
    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
  }

  output = output.view({output.size(0), output.size(1) * output.size(2),
                        output.size(3), output.size(4)});

  if (with_bias) {
    output += bias.view({1, bias.size(0), 1, 1});
  }
}
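// Worked example of the height_out / width_out formula used above: for
// height = 64, pad_h = 1, dilation_h = 1, kernel_h = 3, stride_h = 1 the
// effective kernel extent is dilation_h * (kernel_h - 1) + 1 = 3, so
// height_out = (64 + 2 - 3) / 1 + 1 = 64 (a "same"-size convolution); with
// stride_h = 2 the integer division gives height_out = 63 / 2 + 1 = 32.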
void ModulatedDeformConvBackwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  at::DeviceGuard guard(input.device());

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);

  const int channels_kernel = weight.size(1);
  const int kernel_h_ = weight.size(2);
  const int kernel_w_ = weight.size(3);
  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
    AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
             kernel_h_, kernel_w, kernel_h_, kernel_w_);
  if (channels != channels_kernel * group)
    AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
             channels, channels_kernel * group);

  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_out =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;

  if (ones.ndimension() != 2 ||
      ones.size(0) * ones.size(1) < height_out * width_out) {
    // Resize plane and fill with ones...
    ones = at::ones({height_out, width_out}, input.options());
  }

  grad_input = grad_input.view({batch, channels, height, width});
  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
                      input.options());

  grad_output =
      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
                        grad_output.size(2), grad_output.size(3)});

  for (int b = 0; b < batch; b++) {
    // divide int group
    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    weight = weight.view({group, weight.size(0) / group, weight.size(1),
                          weight.size(2), weight.size(3)});

    for (int g = 0; g < group; g++) {
      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
    }

    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                          weight.size(3), weight.size(4)});

    // gradient w.r.t. input coordinate data
    modulated_deformable_col2im_coord_cuda(
        columns, input[b], offset[b], mask[b], 1, channels, height, width,
        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
        grad_mask[b]);
    // gradient w.r.t. input data
    modulated_deformable_col2im_cuda(
        columns, offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, grad_input[b]);

    // gradient w.r.t. weight, dWeight should accumulate across the batch and
    // group
    modulated_deformable_im2col_cuda(
        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
        dilation_h, dilation_w, deformable_group, columns);

    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
                                    grad_weight.size(1), grad_weight.size(2),
                                    grad_weight.size(3)});
    if (with_bias)
      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});

    for (int g = 0; g < group; g++) {
      grad_weight[g] =
          grad_weight[g]
              .flatten(1)
              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
              .view_as(grad_weight[g]);
      if (with_bias) {
        grad_bias[g] =
            grad_bias[g]
                .view({-1, 1})
                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
                .view(-1);
      }
    }

    columns =
        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
                                    grad_weight.size(2), grad_weight.size(3),
                                    grad_weight.size(4)});
    if (with_bias)
      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
  }
  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
                                  grad_output.size(2), grad_output.size(3),
                                  grad_output.size(4)});
}
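// Note on the in-place GEMMs above: x.addmm_(a, b, beta, alpha) computes
// x = beta * x + alpha * (a @ b). The backward pass therefore overwrites
// columns with weight^T @ grad_output by passing beta = 0, alpha = 1, while
// the grad_weight / grad_bias updates use the default beta = alpha = 1 so the
// products accumulate across the batch and group loops.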
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "modulated_deform_conv_pytorch.h"
using namespace parrots;
void modulated_deform_conv_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& bias = buildATensor(ctx, ins[2]);
const auto& ones = buildATensor(ctx, ins[3]);
const auto& offset = buildATensor(ctx, ins[4]);
const auto& mask = buildATensor(ctx, ins[5]);
auto output = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
modulated_deform_conv_forward_cuda(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
void modulated_deform_conv_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& bias = buildATensor(ctx, ins[2]);
const auto& ones = buildATensor(ctx, ins[3]);
const auto& offset = buildATensor(ctx, ins[4]);
const auto& mask = buildATensor(ctx, ins[5]);
auto columns = buildATensor(ctx, outs[0]);
auto grad_input = buildATensor(ctx, outs[1]);
auto grad_weight = buildATensor(ctx, outs[2]);
auto grad_bias = buildATensor(ctx, outs[3]);
auto grad_offset = buildATensor(ctx, outs[4]);
auto grad_mask = buildATensor(ctx, outs[5]);
auto grad_output = buildATensor(ctx, outs[6]);
modulated_deform_conv_backward_cuda(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda_parrots)
.done();
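// For reference, the registrations above mirror the order in which the
// wrappers unpack their arguments: the 6 inputs of
// modulated_deform_conv_forward are (input, weight, bias, ones, offset, mask)
// and its 2 outputs are (output, columns); the backward op reuses the same 6
// inputs and produces (columns, grad_input, grad_weight, grad_bias,
// grad_offset, grad_mask, grad_output) as its 7 outputs.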
#ifndef MODULATED_DEFORM_CONV_PYTORCH_H
#define MODULATED_DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void modulated_deform_conv_forward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias);
void modulated_deform_conv_backward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias);
#endif // MODULATED_DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset);
Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
}
#endif
Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }
  auto x1_t = boxes.select(1, 0).contiguous();
  auto y1_t = boxes.select(1, 1).contiguous();
  auto x2_t = boxes.select(1, 2).contiguous();
  auto y2_t = boxes.select(1, 3).contiguous();

  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);

  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  auto nboxes = boxes.size(0);
  Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));

  auto select = select_t.data_ptr<bool>();
  auto order = order_t.data_ptr<int64_t>();
  auto x1 = x1_t.data_ptr<float>();
  auto y1 = y1_t.data_ptr<float>();
  auto x2 = x2_t.data_ptr<float>();
  auto y2 = y2_t.data_ptr<float>();
  auto areas = areas_t.data_ptr<float>();

  for (int64_t _i = 0; _i < nboxes; _i++) {
    if (select[_i] == false) continue;
    auto i = order[_i];
    auto ix1 = x1[i];
    auto iy1 = y1[i];
    auto ix2 = x2[i];
    auto iy2 = y2[i];
    auto iarea = areas[i];

    for (int64_t _j = _i + 1; _j < nboxes; _j++) {
      if (select[_j] == false) continue;
      auto j = order[_j];
      auto xx1 = std::max(ix1, x1[j]);
      auto yy1 = std::max(iy1, y1[j]);
      auto xx2 = std::min(ix2, x2[j]);
      auto yy2 = std::min(iy2, y2[j]);

      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr >= iou_threshold) select[_j] = false;
    }
  }
  return order_t.masked_select(select_t);
}
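// Worked example of the overlap test above: boxes (0, 0, 10, 10) and
// (5, 5, 15, 15) with offset = 0 intersect in a 5 x 5 = 25 region, each has
// area 100, so ovr = 25 / (100 + 100 - 25) is about 0.143; with offset = 1
// the areas become 11 * 11 = 121, the intersection 6 * 6 = 36, and
// ovr = 36 / (121 + 121 - 36) is about 0.175.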
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CUDA_INPUT(scores);
return nms_cuda(boxes, scores, iou_threshold, offset);
#else
AT_ERROR("nms is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(boxes);
CHECK_CPU_INPUT(scores);
return nms_cpu(boxes, scores, iou_threshold, offset);
}
}
Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
                   float iou_threshold, float sigma, float min_score,
                   int method, int offset) {
  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }

  auto x1_t = boxes.select(1, 0).contiguous();
  auto y1_t = boxes.select(1, 1).contiguous();
  auto x2_t = boxes.select(1, 2).contiguous();
  auto y2_t = boxes.select(1, 3).contiguous();
  auto scores_t = scores.clone();

  Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);

  auto nboxes = boxes.size(0);
  auto x1 = x1_t.data_ptr<float>();
  auto y1 = y1_t.data_ptr<float>();
  auto x2 = x2_t.data_ptr<float>();
  auto y2 = y2_t.data_ptr<float>();
  auto sc = scores_t.data_ptr<float>();
  auto areas = areas_t.data_ptr<float>();
  auto de = dets.data_ptr<float>();

  int64_t pos = 0;
  Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
  auto inds = inds_t.data_ptr<int64_t>();

  for (int64_t i = 0; i < nboxes; i++) {
    auto max_score = sc[i];
    auto max_pos = i;

    pos = i + 1;
    // get max box
    while (pos < nboxes) {
      if (max_score < sc[pos]) {
        max_score = sc[pos];
        max_pos = pos;
      }
      pos = pos + 1;
    }
    // swap
    auto ix1 = de[i * 5 + 0] = x1[max_pos];
    auto iy1 = de[i * 5 + 1] = y1[max_pos];
    auto ix2 = de[i * 5 + 2] = x2[max_pos];
    auto iy2 = de[i * 5 + 3] = y2[max_pos];
    auto iscore = de[i * 5 + 4] = sc[max_pos];
    auto iarea = areas[max_pos];
    auto iind = inds[max_pos];
    x1[max_pos] = x1[i];
    y1[max_pos] = y1[i];
    x2[max_pos] = x2[i];
    y2[max_pos] = y2[i];
    sc[max_pos] = sc[i];
    areas[max_pos] = areas[i];
    inds[max_pos] = inds[i];
    x1[i] = ix1;
    y1[i] = iy1;
    x2[i] = ix2;
    y2[i] = iy2;
    sc[i] = iscore;
    areas[i] = iarea;
    inds[i] = iind;

    pos = i + 1;
    while (pos < nboxes) {
      auto xx1 = std::max(ix1, x1[pos]);
      auto yy1 = std::max(iy1, y1[pos]);
      auto xx2 = std::min(ix2, x2[pos]);
      auto yy2 = std::min(iy2, y2[pos]);

      auto w = std::max(0.f, xx2 - xx1 + offset);
      auto h = std::max(0.f, yy2 - yy1 + offset);
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[pos] - inter);

      float weight = 1.;
      if (method == 0) {
        // (branch body collapsed in the diff view)
      } else if (method == 1) {
        if (ovr >= iou_threshold) weight = 1 - ovr;
      } else if (method == 2) {
        weight = std::exp(-(ovr * ovr) / sigma);
      }
      sc[pos] *= weight;

      // if box score falls below threshold, discard the box by
      // swapping with last box update N
      if (sc[pos] < min_score) {
        x1[pos] = x1[nboxes - 1];
        y1[pos] = y1[nboxes - 1];
        x2[pos] = x2[nboxes - 1];
        y2[pos] = y2[nboxes - 1];
        sc[pos] = sc[nboxes - 1];
        areas[pos] = areas[nboxes - 1];
        inds[pos] = inds[nboxes - 1];
        nboxes = nboxes - 1;
        pos = pos - 1;
      }
      pos = pos + 1;
    }
  }
  return inds_t.slice(0, 0, nboxes);
}
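// Decay behaviour of the methods above, as a quick example with ovr = 0.5 and
// iou_threshold = 0.3: the linear method (method == 1) rescales the score by
// weight = 1 - 0.5 = 0.5, while the gaussian method (method == 2) with
// sigma = 0.5 uses weight = exp(-(0.5 * 0.5) / 0.5) = exp(-0.5), about 0.61;
// boxes whose decayed score drops below min_score are swapped out and nboxes
// shrinks accordingly.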
Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
               float sigma, float min_score, int method, int offset) {
  if (boxes.device().is_cuda()) {
    AT_ERROR("softnms is not implemented on GPU");
  } else {
    return softnms_cpu(boxes, scores, dets, iou_threshold, sigma, min_score,
                       method, offset);
  }
}
std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
std::vector<int> keep;
std::vector<std::vector<int> > matched;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
}
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
if (dets.device().is_cuda()) {
AT_ERROR("nms_match is not implemented on GPU");
} else {
return nms_match_cpu(dets, iou_threshold);
}
}
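// Shape of the result above: each inner vector starts with the index of a
// kept box followed by the indices of the boxes it suppressed, so e.g. a
// return value of {{2, 0, 5}, {3}} means box 2 was kept and matched boxes 0
// and 5, while box 3 was kept and matched nothing.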
#include "nms_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream) {
size_t boxes_num = boxes_sorted.dim(0);
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
at::cuda::CUDAGuard device_guard(boxes.device());
if (boxes_sorted.size() == 0) {
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, 0));
return select;
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
const size_t col_blocks = DIVUP(boxes_num, threadsPerBlock);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(boxes_num, col_blocks)));
int boxes_num = boxes.size(0);
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
PARROTS_CUDA_CHECK(cudaGetLastError());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.ptr<float>(),
(unsigned long long*)mask.ptr<int64_t>());
PARROTS_CUDA_CHECK(cudaGetLastError());
boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),
(unsigned long long*)mask.data_ptr<int64_t>());
auto mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
auto mask_host = mask_cpu.ptr<int64_t>();
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
auto remv = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, col_blocks),
getHostProxy());
remv.setZeros(syncStream());
auto remv_ptr = remv.ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
auto keep_t = ctx.createDArrayLite(DArraySpec::array(Prim::Uint8, boxes_num),
getHostProxy());
keep_t.setZeros(syncStream());
auto keep = keep_t.ptr<uint8_t>();
at::Tensor keep_t =
at::zeros({boxes_num}, boxes.options().dtype(at::kBool).device(at::kCPU));
bool* keep = keep_t.data_ptr<bool>();
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv_ptr[nblock] & (1ULL << inblock))) {
keep[i] = 1;
int64_t* p = mask_host + i * col_blocks;
if (!(remv[nblock] & (1ULL << inblock))) {
keep[i] = true;
// set every overlap box with bit 1 in remv
unsigned long long* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_ptr[j] |= p[j];
remv[j] |= p[j];
}
}
}
auto keep_cuda = ctx.createDArrayLite(keep_t, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
AT_CUDA_CHECK(cudaGetLastError());
return order_t.masked_select(keep_t.to(at::kCUDA));
}
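// Layout of the suppression mask consumed above (assuming the usual bitmask
// NMS kernel in nms_cuda_kernel.cuh): with threadsPerBlock =
// sizeof(unsigned long long) * 8 = 64, row i of the boxes_num x col_blocks
// int64 tensor holds one bit per candidate box, and bit j of word k is set
// when box i overlaps box (k * 64 + j) above iou_threshold; the CPU loop then
// walks the rows in score order, keeping a box only if no previously kept box
// has already set its bit in remv.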
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "nms_pytorch.h"
using namespace parrots;
// Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);
template <typename T>
void nms_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("offset", offset)
.done();
at::Tensor boxes, scores;
boxes = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
auto out = nms(boxes, scores, iou_threshold, offset);
updateDArray(ctx, out, outs[0]);
}
/*Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
* float sigma, float min_score, int method, int offset);*/
template <typename T>
void softnms_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold, sigma, min_score;
int method, offset;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("sigma", sigma)
.get("min_score", min_score)
.get("method", method)
.get("offset", offset)
.done();
at::Tensor boxes, scores, dets;
boxes = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
dets = buildATensor(ctx, ins[2]);
auto out = softnms(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
updateDArray(ctx, out, outs[0]);
}
// std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold);
template <typename T>
void nms_match_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get("iou_threshold", iou_threshold).done();
at::Tensor dets;
dets = buildATensor(ctx, ins[0]);
auto out = nms_match(dets, iou_threshold);
int n = out.size(), m = 0;
for (int i = 0; i < n; ++i)
if (m < out[i].size()) m = out[i].size();
auto options = torch::TensorOptions().dtype(at::kInt);
auto tensor = torch::zeros({n, m}, options);
for (int i = 0; i < n; i++)
tensor.slice(0, i, i + 1) =
torch::from_blob(out[i].data(), {out[i].size()}, options);
updateDArray(ctx, tensor, outs[0]);
}
/*Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
* const Tensor dets_sorted, const float iou_threshold,
* const int multi_label);*/
template <typename T>
void nms_rotated_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int multi_label;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("multi_label", multi_label)
.done();
at::Tensor dets, scores, order, dets_sorted;
dets = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
order = buildATensor(ctx, ins[2]);
dets_sorted = buildATensor(ctx, ins[3]);
auto out =
nms_rotated(dets, scores, order, dets_sorted, iou_threshold, multi_label);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(2)
.output(1)
.apply(nms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(1)
.apply(softnms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(softnms_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_match_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(nms_rotated)
.attr("multi_label")
.attr("iou_threshold")
.input(4)
.output(1)
.apply(nms_rotated_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_rotated_parrots<CudaContext>)
#endif
.done();
#ifndef NMS_PYTORCH_H
#define NMS_PYTORCH_H
#include <torch/extension.h>
at::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,
int offset);
at::Tensor softnms(at::Tensor boxes, at::Tensor scores, at::Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset);
std::vector<std::vector<int> > nms_match(at::Tensor dets, float iou_threshold);
at::Tensor nms_rotated(const at::Tensor dets, const at::Tensor scores,
const at::Tensor order, const at::Tensor dets_sorted,
const float iou_threshold, const int multi_label);
#endif // NMS_PYTORCH_H
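// Illustrative usage sketch, not part of the original sources: the tensor
// values below are made up, and it assumes this header is included and the
// extension that defines nms is linked. Boxes are (x1, y1, x2, y2) rows.
static inline void nms_usage_sketch() {
  auto boxes = torch::tensor({0.f, 0.f, 10.f, 10.f,
                              1.f, 1.f, 11.f, 11.f,
                              50.f, 50.f, 60.f, 60.f})
                   .reshape({3, 4});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f});
  // keep holds indices of the retained boxes; box 1 overlaps box 0 with
  // IoU ~0.68 > 0.5 and is suppressed, so keep would be {0, 2} here.
  auto keep = nms(boxes, scores, /*iou_threshold=*/0.5f, /*offset=*/0);
  (void)keep;
}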
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
DArrayLite nms_rotated_cuda(const DArrayLite dets, const DArrayLite scores,
const DArrayLite dets_sorted, float iou_threshold,
const int multi_label, cudaStream_t stream,
CudaContext& ctx);
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold);
#ifdef MMCV_WITH_CUDA
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const Tensor order, const Tensor dets_sorted,
const float iou_threshold, const int multi_label);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
void nms_rotated(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int multi_label;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("multi_label", multi_label)
.done();
const auto& dets = ins[0];
const auto& scores = ins[1];
const auto& dets_sorted = ins[2];
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label) {
assert(dets.device().is_cuda() == scores.device().is_cuda());
if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,
multi_label);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = nms_rotated_cuda(dets, scores, dets_sorted, iou_threshold,
multi_label, stream, ctx);
return nms_rotated_cpu(dets, scores, iou_threshold);
}
PARROTS_EXTENSION_REGISTER(nms_rotated)
.attr("multi_label")
.attr("iou_threshold")
.input(3)
.output(1)
.apply(nms_rotated)
.done();
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
const float iou_threshold) {
// nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.hpp
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) {
continue;
}
keep[num_to_keep++] = i;
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) {
continue;
}
auto ovr = single_box_iou_rotated<scalar_t>(
dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
}
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
}
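// Illustrative usage sketch, not part of the original sources: rotated boxes
// are assumed to be rows of (cx, cy, w, h, angle) as consumed by
// single_box_iou_rotated; the values below are made up.
static inline void nms_rotated_cpu_usage_sketch() {
  auto dets = torch::tensor({10.f, 10.f, 8.f, 6.f, 0.f,
                             10.5f, 10.f, 8.f, 6.f, 0.1f,
                             40.f, 40.f, 8.f, 6.f, 0.f})
                  .reshape({3, 5});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f});
  // keep holds the indices of the boxes that survive suppression.
  auto keep = nms_rotated_cpu(dets, scores, /*iou_threshold=*/0.5f);
  (void)keep;
}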
@@ -2,45 +2,51 @@
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#include "nms_rotated_cuda.cuh"
#include "parrots_cuda_helper.hpp"
#include "pytorch_cuda_helper.hpp"
DArrayLite nms_rotated_cuda(const DArrayLite dets, const DArrayLite scores,
const DArrayLite dets_sorted, float iou_threshold,
const int multi_label, cudaStream_t stream,
CudaContext& ctx) {
int dets_num = dets.dim(0);
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
const Tensor order_t, const Tensor dets_sorted,
float iou_threshold, const int multi_label) {
// using scalar_t = float;
AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor");
AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(dets.device());
const int col_blocks = divideUP(dets_num, threadsPerBlock);
int dets_num = dets.size(0);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(dets_num * col_blocks)));
const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);
Tensor mask =
at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(dets_sorted.elemType().prim(), [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, iou_threshold, dets_sorted.ptr<scalar_t>(),
(unsigned long long*)mask.ptr<int64_t>(), multi_label);
});
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_rotated_kernel_cuda", [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num, iou_threshold, dets_sorted.data<scalar_t>(),
(unsigned long long*)mask.data<int64_t>(), multi_label);
});
DArrayLite mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
unsigned long long* mask_host = (unsigned long long*)mask_cpu.ptr<int64_t>();
Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host = (unsigned long long*)mask_cpu.data<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
auto keep = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(dets_num)), getHostProxy());
int64_t* keep_out = keep.ptr<int64_t>();
Tensor keep =
at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < dets_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[i] = 1;
keep_out[num_to_keep++] = i;
unsigned long long* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
@@ -48,7 +54,8 @@ DArrayLite nms_rotated_cuda(const DArrayLite dets, const DArrayLite scores,
}
}
auto keep_cuda = ctx.createDArrayLite(keep, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
AT_CUDA_CHECK(cudaGetLastError());
return order_t.index(
{keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
.to(order_t.device(), keep.scalar_type())});
}
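// Illustrative sketch, not part of the original sources: each box i owns
// col_blocks 64-bit words of `mask`; the bit for box j inside word
// (i, j / threadsPerBlock) is set when box i suppresses box j
// (threadsPerBlock is assumed to be 64, i.e. sizeof(unsigned long long) * 8,
// as in the kernel header). A hypothetical helper that reads one pair from
// the host-side copy used by the reduction loop above:
static inline bool is_suppressed_pair_sketch(
    const unsigned long long* mask_host, int col_blocks, int i, int j,
    int threads_per_block = 64) {
  const int nblock = j / threads_per_block;
  const int inblock = j % threads_per_block;
  return (mask_host[i * col_blocks + nblock] >> inblock) & 1ULL;
}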
#include "parrots_cpp_helper.hpp"
using namespace parrots;
#include "parrots_cuda_helper.hpp"
using namespace parrots;
#include "parrots_cpp_helper.hpp"
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
@@ -11,8 +12,8 @@
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -27,12 +28,13 @@ void psamask_collect_forward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
buffer_data.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
@@ -47,8 +49,8 @@ void psamask_collect_forward(const int num_, const int h_feature,
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
const int half_w_mask, const Tensor mask_data,
Tensor buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -63,11 +65,13 @@ void psamask_distribute_forward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
buffer_data.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data.view(
{-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
@@ -82,8 +86,8 @@ void psamask_distribute_forward(const int num_, const int h_feature,
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *buffer_diff,
float *mask_diff) {
const int half_w_mask, const Tensor buffer_diff,
Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -98,11 +102,12 @@ void psamask_collect_backward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] = buffer_diff[(n * h_feature * w_feature +
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view({-1})[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
@@ -118,7 +123,7 @@ void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const float *buffer_diff, float *mask_diff) {
const Tensor buffer_diff, Tensor mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
@@ -133,15 +138,16 @@ void psamask_distribute_backward(const int num_, const int h_feature,
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff.view(
{-1})[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
@@ -149,156 +155,101 @@ void psamask_distribute_backward(const int num_, const int h_feature,
}
}
void psamask_forward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
auto input_ptr = input.ptr<float>();
auto output_ptr = output.ptr<float>();
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
half_h_mask, half_w_mask, input, output);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
half_h_mask, half_w_mask, input, output);
}
void psamask_backward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
auto input_ptr = input.ptr<float>();
auto output_ptr = output.ptr<float>();
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (psa_type == 0)
psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
half_h_mask, half_w_mask, grad_output, grad_input);
else
psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr,
output_ptr);
half_h_mask, half_w_mask, grad_output,
grad_input);
}
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void PSAMaskBackwardCUDAKernelLauncher(const int psa_type,
const DArrayLite grad_output,
DArrayLite grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void psamask_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask);
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask);
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, ctx);
half_w_mask);
}
void psamask_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
PSAMaskBackwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, ctx);
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask);
}
#endif
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_forward_cuda)
void psamask_forward(const Tensor input, Tensor output, const int psa_type,
const int num_, const int h_feature, const int w_feature,
const int h_mask, const int w_mask, const int half_h_mask,
const int half_w_mask) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(output);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
.done();
} else {
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
}
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_backward_cuda)
void psamask_backward(Tensor grad_output, const Tensor grad_input,
const int psa_type, const int num_, const int h_feature,
const int w_feature, const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
if (grad_input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_input);
CHECK_CUDA_INPUT(grad_output);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
#else
AT_ERROR("PSAMask is not compiled with GPU support");
#endif
.done();
} else {
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
}
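// Illustrative sketch, not part of the original sources: judging from the
// flat indexing in the CPU kernels above, the expected layouts are assumed
// to be
//   input : {num_, h_mask * w_mask, h_feature, w_feature}
//   output: {num_, h_feature * w_feature, h_feature, w_feature}
// A hypothetical CPU invocation with small made-up sizes:
static inline void psamask_forward_usage_sketch() {
  const int num_ = 1, h_feature = 4, w_feature = 4, h_mask = 3, w_mask = 3;
  auto input = torch::rand({num_, h_mask * w_mask, h_feature, w_feature});
  auto output =
      torch::zeros({num_, h_feature * w_feature, h_feature, w_feature});
  psamask_forward(input, output, /*psa_type=*/0, num_, h_feature, w_feature,
                  h_mask, w_mask, /*half_h_mask=*/1, /*half_w_mask=*/1);
}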
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "parrots_cuda_helper.hpp"
#include <THC/THC.h>
#include <torch/serialize/tensor.h>
#include <THC/THCDeviceUtils.cuh>
#include "psamask_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext& ctx) {
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
Tensor output, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "psamask_collect_forward_cuda", [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_distribute_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "psamask_distribute_forward_cuda", [&] {
psamask_distribute_forward_cuda<scalar_t>
<<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const DArrayLite grad_output, DArrayLite grad_input,
const int psa_type, const Tensor grad_output, Tensor grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask,
CudaContext& ctx) {
const int w_mask, const int half_h_mask, const int half_w_mask) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
grad_input.scalar_type(), "psamask_collect_backward_cuda", [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_distribute_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
AT_DISPATCH_FLOATING_TYPES(
grad_input.scalar_type(), "psamask_distribute_backward_cuda", [&] {
psamask_distribute_backward_cuda<scalar_t>
<<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.data_ptr<scalar_t>(),
grad_input.data_ptr<scalar_t>());
});
}
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "psamask_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void psamask_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = buildATensor(ctx, ins[0]);
auto output = buildATensor(ctx, outs[0]);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &grad_output = buildATensor(ctx, ins[0]);
auto grad_input = buildATensor(ctx, outs[0]);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
#endif
void psamask_forward_cpu_parrots(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = buildATensor(ctx, ins[0]);
auto output = buildATensor(ctx, outs[0]);
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward_cpu_parrots(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &grad_output = buildATensor(ctx, ins[0]);
auto grad_input = buildATensor(ctx, outs[0]);
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(psamask_forward_cuda_parrots)
#endif
.done();
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(psamask_backward_cuda_parrots)
#endif
.done();
#ifndef PSAMASK_PYTORCH_H
#define PSAMASK_PYTORCH_H
#include <torch/extension.h>
using namespace at;
#ifdef MMCV_WITH_CUDA
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
#endif
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
#endif // PSAMASK_PYTORCH_H
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
#include "pytorch_cpp_helper.hpp"
#ifdef MMCV_WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif
void ROIAlignForwardCPULauncher(DArrayLite input, DArrayLite rois,
DArrayLite output, DArrayLite argmax_y,
DArrayLite argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void ROIAlignBackwardCPULauncher(DArrayLite grad_output, DArrayLite rois,
DArrayLite argmax_y, DArrayLite argmax_x,
DArrayLite grad_input, int aligned_height,
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned);
void ROIAlignForwardCUDAKernelLauncher(DArrayLite input, DArrayLite rois,
DArrayLite output, DArrayLite argmax_y,
DArrayLite argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned, cudaStream_t stream);
void ROIAlignBackwardCUDAKernelLauncher(
DArrayLite grad_output, DArrayLite rois, DArrayLite argmax_y,
DArrayLite argmax_x, DArrayLite grad_input, int aligned_height,
int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode,
bool aligned, cudaStream_t stream);
void roi_align_forward_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& input = ins[0];
auto& rois = ins[1];
auto& output = outs[0];
auto& argmax_y = outs[1];
auto& argmax_x = outs[2];
void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& grad_output = ins[0];
auto& rois = ins[1];
auto& argmax_y = ins[2];
auto& argmax_x = ins[3];
auto& grad_input = outs[0];
void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& input = ins[0];
auto& rois = ins[1];
auto& output = outs[0];
auto& argmax_y = outs[1];
auto& argmax_x = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
void roi_align_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
auto& grad_output = ins[0];
auto& rois = ins[1];
auto& argmax_y = ins[2];
auto& argmax_x = ins[3];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
PARROTS_EXTENSION_REGISTER(roi_align_forward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(2)
.output(3)
.apply(roi_align_forward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(roi_align_forward_cuda)
void roi_align_forward(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(input);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(output);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
.done();
} else {
CHECK_CPU_INPUT(input);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(output);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
}
PARROTS_EXTENSION_REGISTER(roi_align_backward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(4)
.output(1)
.apply(roi_align_backward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(roi_align_backward_cuda)
void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(grad_output);
CHECK_CUDA_INPUT(rois);
CHECK_CUDA_INPUT(argmax_y);
CHECK_CUDA_INPUT(argmax_x);
CHECK_CUDA_INPUT(grad_input);
roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
#else
AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
.done();
} else {
CHECK_CPU_INPUT(grad_output);
CHECK_CPU_INPUT(rois);
CHECK_CPU_INPUT(argmax_y);
CHECK_CPU_INPUT(argmax_x);
CHECK_CPU_INPUT(grad_input);
roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
}
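// Illustrative usage sketch, not part of the original sources: rois are
// assumed to be {num_rois, 5} rows of (batch_idx, x1, y1, x2, y2); argmax_y
// and argmax_x match the output shape and are only meaningful in max mode
// (pool_mode is assumed to be 0 for max and 1 for average, following the
// kernel dispatch). All values below are made up.
static inline void roi_align_forward_usage_sketch() {
  auto input = torch::rand({2, 3, 32, 32});
  auto rois = torch::tensor({0.f, 4.f, 4.f, 20.f, 20.f,
                             1.f, 8.f, 8.f, 24.f, 24.f})
                  .reshape({2, 5});
  auto output = torch::zeros({2, 3, 7, 7});
  auto argmax_y = torch::zeros({2, 3, 7, 7});
  auto argmax_x = torch::zeros({2, 3, 7, 7});
  roi_align_forward(input, rois, output, argmax_y, argmax_x,
                    /*aligned_height=*/7, /*aligned_width=*/7,
                    /*spatial_scale=*/1.0f, /*sampling_ratio=*/2,
                    /*pool_mode=*/1, /*aligned=*/true);
}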
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <iostream>
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include "parrots_cpp_helper.hpp"
#include "../pytorch_cpp_helper.hpp"
// implementation taken from Caffe2
template <typename T>
@@ -133,8 +134,8 @@ void ROIAlignForward(const int nthreads, const T* input, const T* rois,
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
PARROTS_CHECKARGS(roi_width >= 0 && roi_height >= 0)
<< "ROIs in ROIAlign cannot have non-negative size!";
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign cannot have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
@@ -294,8 +295,8 @@ void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (aligned) {
PARROTS_CHECKARGS(roi_width >= 0 && roi_height >= 0)
<< "ROIs in ROIAlign do not have non-negative size!";
AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
"ROIs in ROIAlign do not have non-negative size!");
} else { // for backward-compatibility only
roi_width = std::max(roi_width, (T)1.);
roi_height = std::max(roi_height, (T)1.);
@@ -378,38 +379,37 @@ void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
} // for
} // ROIAlignBackward
void ROIAlignForwardCPULauncher(DArrayLite input, DArrayLite rois,
DArrayLite output, DArrayLite argmax_y,
DArrayLite argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
input.scalar_type(), "ROIAlign_forward", [&] {
ROIAlignForward<scalar_t>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width,
output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
output.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width);
}));
});
}
void ROIAlignBackwardCPULauncher(DArrayLite grad_output, DArrayLite rois,
DArrayLite argmax_y, DArrayLite argmax_x,
DArrayLite grad_input, int aligned_height,
void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
// get stride values to ensure indexing into gradients is correct.
int n_stride = grad_output.stride(0);
@@ -417,14 +417,14 @@ void ROIAlignBackwardCPULauncher(DArrayLite grad_output, DArrayLite rois,
int h_stride = grad_output.stride(2);
int w_stride = grad_output.stride(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.scalar_type(), "ROIAlign_backward", [&] {
ROIAlignBackward<scalar_t>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width,
static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
aligned, channels, height, width, n_stride, c_stride, h_stride,
w_stride);
}));
output_size, grad_output.data_ptr<scalar_t>(),
rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),
sampling_ratio, pool_mode, aligned, channels, height, width,
n_stride, c_stride, h_stride, w_stride);
});
}
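// Illustrative sketch, not part of the original sources: the strides gathered
// above let ROIAlignBackward read a possibly non-contiguous grad_output, i.e.
// the flat offset of element (n, c, ph, pw) is assumed to be
//   n * n_stride + c * c_stride + ph * h_stride + pw * w_stride.
static inline int64_t grad_output_offset_sketch(int n, int c, int ph, int pw,
                                                int n_stride, int c_stride,
                                                int h_stride, int w_stride) {
  return static_cast<int64_t>(n) * n_stride +
         static_cast<int64_t>(c) * c_stride +
         static_cast<int64_t>(ph) * h_stride +
         static_cast<int64_t>(pw) * w_stride;
}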