Unverified Commit 48d99025 authored by z55250825, committed by GitHub

Add new parrots extension implementation for all ops (#794)

* delete all parrots files;
add a new parrots op impl for bbox_overlaps

* support the first new parrots op impl (bbox_overlaps) (test success)

* add box_iou_rotated op, test success

* add carafe and carafe_naive ops, test success (one parrots bug needs fixing)

* add cc_attention op, test success

* add corner_pool op, test success

* add parrots op deform_conv, test success

* add deform_roi_pool op, test success (one open question remains)

* add focal loss op, test success (gradcheck)

* add masked_conv2d op, test success

* add modulated_deform_conv op, test success

* add nms and nms_rotated op, test success

* add psamask op, test success

* add roi_align op, test success

* add roi_pool op, test success

* add sync_bn op, test success

* add tin_shift op, test success

* fix test_deform_roi_pool, add parrots test

* skip test_onnx because parrots does not support onnx

* fix c++ lint

* fix python lint

* fix python lint
parent 72e4cc12
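All of the rewritten ops in this commit follow the same split, visible in the files below: the op itself is implemented against the PyTorch C++ API (Tensor), and a thin *_parrots.cpp binding converts the parrots DArrayLite arguments with buildATensor before delegating to it (see the modulated_deform_conv and nms bindings further down). A minimal sketch of that binding pattern, using a hypothetical one-input op my_op with a single attribute alpha (the names are illustrative only, not part of this commit):

#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>

using namespace parrots;

// Hypothetical PyTorch-side implementation that the binding forwards to.
at::Tensor my_op(at::Tensor input, float alpha);

void my_op_parrots(CudaContext& ctx, const SSElement& attr,
                   const OperatorBase::in_list_t& ins,
                   OperatorBase::out_list_t& outs) {
  float alpha;
  SSAttrs(attr).get<float>("alpha", alpha).done();
  // Wrap the parrots DArrayLite as an ATen tensor and reuse the PyTorch op.
  auto input = buildATensor(ctx, ins[0]);
  auto out = my_op(input, alpha);
  updateDArray(ctx, out, outs[0]);
}

PARROTS_EXTENSION_REGISTER(my_op)
    .attr("alpha")
    .input(1)
    .output(1)
    .apply(my_op_parrots)
    .done();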
#ifndef MASKED_CONV2D_PYTORCH_H
#define MASKED_CONV2D_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor col,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w);
void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx,
const Tensor mask_w_idx, Tensor im, int height,
int width, int channels);
#endif // MASKED_CONV2D_PYTORCH_H
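The launchers declared above are meant to be called from a device dispatcher in the corresponding masked_conv2d source file, which is not part of this excerpt. A sketch of what such a dispatcher could look like, assuming the same CHECK_CUDA_INPUT / AT_ERROR pattern the modulated_deform_conv file below uses:

// Sketch only (assumed layout): would live next to the header above and
// include pytorch_cpp_helper.hpp for the CHECK_CUDA_INPUT macro.
void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx,
                           const Tensor mask_w_idx, Tensor col,
                           const int kernel_h, const int kernel_w,
                           const int pad_h, const int pad_w) {
  if (im.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(im);
    CHECK_CUDA_INPUT(mask_h_idx);
    CHECK_CUDA_INPUT(mask_w_idx);
    CHECK_CUDA_INPUT(col);
    masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, col, kernel_h,
                               kernel_w, pad_h, pad_w);
#else
    AT_ERROR("MaskedConv is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("MaskedConv is not implemented on CPU");
  }
}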
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void ModulatedDeformConvForwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias);

void ModulatedDeformConvBackwardCUDAKernelLauncher(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias);

void modulated_deform_conv_forward_cuda(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  ModulatedDeformConvForwardCUDAKernelLauncher(
      input, weight, bias, ones, offset, mask, output, columns, kernel_h,
      kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
}

void modulated_deform_conv_backward_cuda(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  ModulatedDeformConvBackwardCUDAKernelLauncher(
      input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
      grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
      stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
      deformable_group, with_bias);
}
#endif

void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(bias);
    CHECK_CUDA_INPUT(ones);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(mask);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(columns);

    modulated_deform_conv_forward_cuda(
        input, weight, bias, ones, offset, mask, output, columns, kernel_h,
        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
        group, deformable_group, with_bias);
#else
    AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("ModulatedDeformConv is not implemented on CPU");
  }
}

void modulated_deform_conv_backward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
    Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
    const bool with_bias) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(weight);
    CHECK_CUDA_INPUT(bias);
    CHECK_CUDA_INPUT(ones);
    CHECK_CUDA_INPUT(offset);
    CHECK_CUDA_INPUT(mask);
    CHECK_CUDA_INPUT(columns);
    CHECK_CUDA_INPUT(grad_input);
    CHECK_CUDA_INPUT(grad_weight);
    CHECK_CUDA_INPUT(grad_bias);
    CHECK_CUDA_INPUT(grad_offset);
    CHECK_CUDA_INPUT(grad_mask);
    CHECK_CUDA_INPUT(grad_output);

    modulated_deform_conv_backward_cuda(
        input, weight, bias, ones, offset, mask, columns, grad_input,
        grad_weight, grad_bias, grad_offset, grad_mask, grad_output, kernel_h,
        kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
        group, deformable_group, with_bias);
#else
    AT_ERROR("ModulatedDeformConv is not compiled with GPU support");
#endif
  } else {
    AT_ERROR("ModulatedDeformConv is not implemented on CPU");
  }
}
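On the PyTorch side these entry points are typically exposed to Python through pybind11. The actual pybind registration file is not part of this excerpt; a minimal sketch, assuming a standard torch extension module:

#include <torch/extension.h>
using namespace at;

// Declared in the file above.
void modulated_deform_conv_forward(
    Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
    Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    const int dilation_h, const int dilation_w, const int group,
    const int deformable_group, const bool with_bias);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward,
        "forward pass of modulated deformable convolution");
}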
#include "modulated_deform_conv_cuda_kernel.cuh" #include "modulated_deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp"
void modulated_deformable_im2col_cuda( void modulated_deformable_im2col_cuda(
const DArrayLite data_im, const DArrayLite data_offset, const Tensor data_im, const Tensor data_offset, const Tensor data_mask,
const DArrayLite data_mask, const int batch_size, const int channels, const int batch_size, const int channels, const int height_im,
const int height_im, const int width_im, const int height_col, const int width_im, const int height_col, const int width_col,
const int width_col, const int kernel_h, const int kenerl_w, const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_h, const int dilation_w, const int deformable_group, const int dilation_w, const int deformable_group, Tensor data_col) {
DArrayLite data_col, cudaStream_t stream) {
// num_axes should be smaller than block size // num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group; const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col; const int num_kernels = channels * batch_size * height_col * width_col;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.elemType().prim(), ([&] { data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
modulated_deformable_im2col_gpu_kernel<<< modulated_deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
num_kernels, data_im.ptr<scalar_t>(), data_offset.ptr<scalar_t>(), at::cuda::getCurrentCUDAStream()>>>(
data_mask.ptr<scalar_t>(), height_im, width_im, kernel_h, kenerl_w, num_kernels, data_im_, data_offset_, data_mask_, height_im,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, width_im, kernel_h, kenerl_w, pad_h, pad_w, stride_h, stride_w,
channel_per_deformable_group, batch_size, channels, dilation_h, dilation_w, channel_per_deformable_group, batch_size,
deformable_group, height_col, width_col, data_col.ptr<scalar_t>()); channels, deformable_group, height_col, width_col, data_col_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void modulated_deformable_col2im_cuda( void modulated_deformable_col2im_cuda(
const DArrayLite data_col, const DArrayLite data_offset, const Tensor data_col, const Tensor data_offset, const Tensor data_mask,
const DArrayLite data_mask, const int batch_size, const int channels, const int batch_size, const int channels, const int height_im,
const int height_im, const int width_im, const int height_col, const int width_im, const int height_col, const int width_col,
const int width_col, const int kernel_h, const int kernel_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w, const int dilation_h,
const int dilation_h, const int dilation_w, const int deformable_group, const int dilation_w, const int deformable_group, Tensor grad_im) {
DArrayLite grad_im, cudaStream_t stream) {
const int channel_per_deformable_group = channels / deformable_group; const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col; channels * kernel_h * kernel_w * batch_size * height_col * width_col;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] { data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
modulated_deformable_col2im_gpu_kernel<<< modulated_deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
num_kernels, data_col.ptr<scalar_t>(), data_offset.ptr<scalar_t>(), at::cuda::getCurrentCUDAStream()>>>(
data_mask.ptr<scalar_t>(), channels, height_im, width_im, kernel_h, num_kernels, data_col_, data_offset_, data_mask_, channels,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
channel_per_deformable_group, batch_size, deformable_group, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
height_col, width_col, grad_im.ptr<scalar_t>()); batch_size, deformable_group, height_col, width_col, grad_im_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void modulated_deformable_col2im_coord_cuda( void modulated_deformable_col2im_coord_cuda(
const DArrayLite data_col, const DArrayLite data_im, const Tensor data_col, const Tensor data_im, const Tensor data_offset,
const DArrayLite data_offset, const DArrayLite data_mask, const Tensor data_mask, const int batch_size, const int channels,
const int batch_size, const int channels, const int height_im, const int height_im, const int width_im, const int height_col,
const int width_im, const int height_col, const int width_col, const int width_col, const int kernel_h, const int kernel_w,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int stride_h, const int stride_w, const int dilation_h, const int dilation_h, const int dilation_w, const int deformable_group,
const int dilation_w, const int deformable_group, DArrayLite grad_offset, Tensor grad_offset, Tensor grad_mask) {
DArrayLite grad_mask, cudaStream_t stream) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group; kernel_w * deformable_group;
const int channel_per_deformable_group = const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group; channels * kernel_h * kernel_w / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF( AT_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] { data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
modulated_deformable_col2im_coord_gpu_kernel<<< modulated_deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>( GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0,
num_kernels, data_col.ptr<scalar_t>(), data_im.ptr<scalar_t>(), at::cuda::getCurrentCUDAStream()>>>(
data_offset.ptr<scalar_t>(), data_mask.ptr<scalar_t>(), channels, num_kernels, data_col_, data_im_, data_offset_, data_mask_,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w,
stride_w, dilation_h, dilation_w, channel_per_deformable_group, stride_h, stride_w, dilation_h, dilation_w,
batch_size, 2 * kernel_h * kernel_w * deformable_group, channel_per_deformable_group, batch_size,
deformable_group, height_col, width_col, 2 * kernel_h * kernel_w * deformable_group, deformable_group,
grad_offset.ptr<scalar_t>(), grad_mask.ptr<scalar_t>()); height_col, width_col, grad_offset_, grad_mask_);
})); }));
AT_CUDA_CHECK(cudaGetLastError());
PARROTS_CUDA_CHECK(cudaGetLastError());
} }
void ModulatedDeformConvForwardCUDAKernelLauncher( void ModulatedDeformConvForwardCUDAKernelLauncher(
DArrayLite input, DArrayLite weight, DArrayLite bias, DArrayLite ones, Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
DArrayLite offset, DArrayLite mask, DArrayLite output, DArrayLite columns, Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
int kernel_h, int kernel_w, const int stride_h, const int stride_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int pad_h, const int pad_w, const int dilation_h, const int dilation_h, const int dilation_w, const int group,
const int dilation_w, const int group, const int deformable_group, const int deformable_group, const bool with_bias) {
const bool with_bias, CudaContext& ctx, cudaStream_t stream) { at::DeviceGuard guard(input.device());
const int batch = input.dim(0);
const int channels = input.dim(1); const int batch = input.size(0);
const int height = input.dim(2); const int channels = input.size(1);
const int width = input.dim(3); const int height = input.size(2);
const int width = input.size(3);
const int channels_out = weight.dim(0);
const int channels_kernel = weight.dim(1); const int channels_out = weight.size(0);
const int kernel_h_ = weight.dim(2); const int channels_kernel = weight.size(1);
const int kernel_w_ = weight.dim(3); const int kernel_h_ = weight.size(2);
const int kernel_w_ = weight.size(3);
PARROTS_CHECKARGS(kernel_h_ == kernel_h && kernel_w_ == kernel_w)
<< "Input shape and kernel shape wont match: (" << kernel_h << " x " if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
<< kernel_w << " vs " << kernel_h_ << " x " << kernel_w_ << ")."; AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
PARROTS_CHECKARGS(channels == channels_kernel * group) if (channels != channels_kernel * group)
<< "Input shape and kernel channels wont match: (" << channels << " vs " AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
<< channels_kernel * group << ")."; channels, channels_kernel * group);
const int height_out = const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out = const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndims() != 2 || ones.dim(0) * ones.dim(1) < height_out * width_out) { if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones... // Resize plane and fill with ones...
ones = ctx.createDArrayLite(input.elemType(), ones = at::ones({height_out, width_out}, input.options());
DArrayShape(height_out, width_out));
fill(ctx, ones, *toScalar(1));
} }
// resize output // resize output
output = output.view({batch, channels_out, height_out, width_out}); output = output.view({batch, channels_out, height_out, width_out}).zero_();
output.setZeros(ctx.getStream());
// resize temporary columns // resize temporary columns
columns = ctx.createDArrayLite( columns =
input.elemType(), at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
DArrayShape(channels * kernel_h * kernel_w, 1 * height_out * width_out)); input.options());
columns.setZeros(ctx.getStream());
output = output.view({output.dim(0), group, output.dim(1) / group, output = output.view({output.size(0), group, output.size(1) / group,
output.dim(2), output.dim(3)}); output.size(2), output.size(3)});
for (size_t b = 0; b < batch; b++) { for (int b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda( modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out, input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns, stream); dilation_h, dilation_w, deformable_group, columns);
// divide into group // divide into group
weight = weight.view({group, weight.dim(0) / group, weight.dim(1), weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.dim(2), weight.dim(3)}); weight.size(2), weight.size(3)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
for (size_t g = 0; g < group; g++) { for (int g = 0; g < group; g++) {
auto output_g = output[b][g]; output[b][g] = output[b][g]
gemm(ctx, 1, false, .flatten(1)
weight[g].view( .addmm_(weight[g].flatten(1), columns[g])
{weight.dim(1), weight.dim(2) * weight.dim(3) * weight.dim(4)}), .view_as(output[b][g]);
false, columns[g], 1, output_g);
} }
weight = weight.view({weight.dim(0) * weight.dim(1), weight.dim(2), weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.dim(3), weight.dim(4)}); weight.size(3), weight.size(4)});
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
columns.view({columns.size(0) * columns.size(1), columns.size(2)});
} }
output = output.view({output.dim(0), output.dim(1) * output.dim(2), output = output.view({output.size(0), output.size(1) * output.size(2),
output.dim(3), output.dim(4)}); output.size(3), output.size(4)});
if (with_bias) { if (with_bias) {
bias = bias.view({1, bias.dim(0), 1, 1}); output += bias.view({1, bias.size(0), 1, 1});
add(ctx, output, bias, output);
} }
} }
void ModulatedDeformConvBackwardCUDAKernelLauncher( void ModulatedDeformConvBackwardCUDAKernelLauncher(
DArrayLite input, DArrayLite weight, DArrayLite bias, DArrayLite ones, Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
DArrayLite offset, DArrayLite mask, DArrayLite columns, Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
DArrayLite grad_input, DArrayLite grad_weight, DArrayLite grad_bias, Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
DArrayLite grad_offset, DArrayLite grad_mask, DArrayLite grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias, CudaContext& ctx, cudaStream_t stream) { const bool with_bias) {
const int batch = input.dim(0); at::DeviceGuard guard(input.device());
const int channels = input.dim(1);
const int height = input.dim(2); const int batch = input.size(0);
const int width = input.dim(3); const int channels = input.size(1);
const int height = input.size(2);
const int channels_kernel = weight.dim(1); const int width = input.size(3);
const int kernel_h_ = weight.dim(2);
const int kernel_w_ = weight.dim(3); const int channels_kernel = weight.size(1);
const int kernel_h_ = weight.size(2);
PARROTS_CHECKARGS(kernel_h_ == kernel_h && kernel_w_ == kernel_w) const int kernel_w_ = weight.size(3);
<< "Input shape and kernel shape wont match: (" << kernel_h << " x " if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
<< kernel_w << " vs " << kernel_h_ << " x " << kernel_w_ << ")."; AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
kernel_h_, kernel_w, kernel_h_, kernel_w_);
PARROTS_CHECKARGS(channels == channels_kernel * group) if (channels != channels_kernel * group)
<< "Input shape and kernel channels wont match: (" << channels << " vs " AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
<< channels_kernel * group << ")."; channels, channels_kernel * group);
const int height_out = const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out = const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndims() != 2 || ones.dim(0) * ones.dim(1) < height_out * width_out) { if (ones.ndimension() != 2 ||
ones.size(0) * ones.size(1) < height_out * width_out) {
// Resize plane and fill with ones... // Resize plane and fill with ones...
ones = ctx.createDArrayLite(input.elemType(), ones = at::ones({height_out, width_out}, input.options());
DArrayShape(height_out, width_out));
fill(ctx, ones, *toScalar(1));
} }
grad_input = grad_input.view({batch, channels, height, width}); grad_input = grad_input.view({batch, channels, height, width});
columns = ctx.createDArrayLite( columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
input.elemType(), input.options());
DArrayShape(channels * kernel_h * kernel_w, height_out * width_out));
grad_output = grad_output =
grad_output.view({grad_output.dim(0), group, grad_output.dim(1) / group, grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
grad_output.dim(2), grad_output.dim(3)}); grad_output.size(2), grad_output.size(3)});
for (size_t b = 0; b < batch; b++) { for (int b = 0; b < batch; b++) {
// divide int group // divide int group
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)}); columns = columns.view({group, columns.size(0) / group, columns.size(1)});
weight = weight.view({group, weight.dim(0) / group, weight.dim(1), weight = weight.view({group, weight.size(0) / group, weight.size(1),
weight.dim(2), weight.dim(3)}); weight.size(2), weight.size(3)});
for (size_t g = 0; g < group; g++) { for (int g = 0; g < group; g++) {
auto columns_g = ctx.createDArrayLite( columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
weight.elemType(), DArrayShape(columns.dim(1), columns.dim(2))); grad_output[b][g].flatten(1), 0.0f, 1.0f);
copy(ctx, columns_g, columns[g]);
auto weight_g = weight[g].view(
{weight.dim(1), weight.dim(2) * weight.dim(3) * weight.dim(4)});
weight_g = transpose(ctx, weight_g, 0, 1);
auto grad_output_bg = ctx.createDArrayLite(
grad_output.elemType(),
DArrayShape(grad_output.dim(2), grad_output.dim(3),
grad_output.dim(4)));
copy(ctx, grad_output_bg, grad_output[b][g]);
grad_output_bg =
grad_output_bg.view({grad_output_bg.dim(0),
grad_output_bg.dim(1) * grad_output_bg.dim(2)});
columns_g =
parrots::op::addmm(ctx, columns[g], weight_g, grad_output_bg, 0, 1);
auto columns_out = columns[g];
copy(ctx, columns_out, columns_g);
} }
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
weight = weight.view({weight.dim(0) * weight.dim(1), weight.dim(2), columns.view({columns.size(0) * columns.size(1), columns.size(2)});
weight.dim(3), weight.dim(4)}); weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
weight.size(3), weight.size(4)});
// gradient w.r.t. input coordinate data // gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda( modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width, columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b], stream); grad_mask[b]);
// gradient w.r.t. input data // gradient w.r.t. input data
modulated_deformable_col2im_cuda( modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out, columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b], stream); dilation_h, dilation_w, deformable_group, grad_input[b]);
// gradient w.r.t. weight, dWeight should accumulate across the batch and // gradient w.r.t. weight, dWeight should accumulate across the batch and
// group // group
modulated_deformable_im2col_cuda( modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out, input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns, stream); dilation_h, dilation_w, deformable_group, columns);
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
grad_weight =
grad_weight.view({group, grad_weight.dim(0) / group, grad_weight.dim(1),
grad_weight.dim(2), grad_weight.dim(3)});
if (with_bias) {
grad_bias = grad_bias.view({group, grad_bias.dim(0) / group});
}
for (size_t g = 0; g < group; g++) {
auto grad_weight_g = ctx.createDArrayLite(
grad_weight.elemType(),
DArrayShape(grad_weight.dim(1), grad_weight.dim(2),
grad_weight.dim(3), grad_weight.dim(4)));
copy(ctx, grad_weight_g, grad_weight[g]);
grad_weight_g = grad_weight_g.view(
{grad_weight_g.dim(0),
grad_weight_g.dim(1) * grad_weight_g.dim(2) * grad_weight_g.dim(3)});
auto columns_g = columns[g];
columns_g = transpose(ctx, columns_g, 0, 1);
auto grad_output_bg = ctx.createDArrayLite(
grad_output.elemType(),
DArrayShape(grad_output.dim(2), grad_output.dim(3),
grad_output.dim(4)));
copy(ctx, grad_output_bg, grad_output[b][g]);
grad_output_bg =
grad_output_bg.view({grad_output_bg.dim(0),
grad_output_bg.dim(1) * grad_output_bg.dim(2)});
grad_weight_g = parrots::op::addmm(ctx, grad_weight_g, grad_output_bg,
columns_g, 1, 1);
auto grad_weight_out = grad_weight[g];
copy(ctx, grad_weight_out, grad_weight_g);
columns = columns.view({group, columns.size(0) / group, columns.size(1)});
grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
grad_weight.size(1), grad_weight.size(2),
grad_weight.size(3)});
if (with_bias)
grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
for (int g = 0; g < group; g++) {
grad_weight[g] =
grad_weight[g]
.flatten(1)
.addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
.view_as(grad_weight[g]);
if (with_bias) { if (with_bias) {
auto grad_bias_g = ctx.createDArrayLite(grad_bias.elemType(), grad_bias[g] =
DArrayShape(grad_bias.dim(1))); grad_bias[g]
copy(ctx, grad_bias_g, grad_bias[g]); .view({-1, 1})
grad_bias_g = grad_bias_g.view({grad_bias_g.dim(0), 1}); .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
.view(-1);
auto grad_output_bg = ctx.createDArrayLite(
grad_output.elemType(),
DArrayShape(grad_output.dim(2), grad_output.dim(3),
grad_output.dim(4)));
copy(ctx, grad_output_bg, grad_output[b][g]);
grad_output_bg = grad_output_bg.view(
{grad_output_bg.dim(0),
grad_output_bg.dim(1) * grad_output_bg.dim(2)});
auto ones_g = ctx.createDArrayLite(
ones.elemType(), DArrayShape(ones.dim(0), ones.dim(1)));
copy(ctx, ones_g, ones);
ones_g = ones_g.view({ones_g.dim(0) * ones_g.dim(1), 1});
grad_bias_g =
parrots::op::addmm(ctx, grad_bias_g, grad_output_bg, ones_g, 1, 1);
auto grad_bias_out = grad_bias[g];
copy(ctx, grad_bias_out, grad_bias_g);
} }
} }
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)}); columns =
grad_weight = grad_weight.view({grad_weight.dim(0) * grad_weight.dim(1), columns.view({columns.size(0) * columns.size(1), columns.size(2)});
grad_weight.dim(2), grad_weight.dim(3), grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
grad_weight.dim(4)}); grad_weight.size(2), grad_weight.size(3),
grad_weight.size(4)});
if (with_bias) if (with_bias)
grad_bias = grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
grad_bias.view(DArrayShape{grad_bias.dim(0) * grad_bias.dim(1)});
} }
grad_output = grad_output.view({grad_output.dim(0) * grad_output.dim(1), grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
grad_output.dim(2), grad_output.dim(3), grad_output.size(2), grad_output.size(3),
grad_output.dim(4)}); grad_output.size(4)});
} }
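The height_out / width_out expressions above are the standard convolution output-size formula. A standalone check with illustrative values (not taken from this commit):

#include <cstdio>

int main() {
  // Assumed example: 64x64 input, 3x3 kernel, stride 2, pad 1, dilation 1,
  // plugged into the same formula the launchers above use.
  const int height = 64, pad_h = 1, dilation_h = 1, kernel_h = 3, stride_h = 2;
  const int height_out =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  std::printf("height_out = %d\n", height_out);  // prints 32
  return 0;
}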
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "modulated_deform_conv_pytorch.h"
using namespace parrots;
void modulated_deform_conv_forward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& bias = buildATensor(ctx, ins[2]);
const auto& ones = buildATensor(ctx, ins[3]);
const auto& offset = buildATensor(ctx, ins[4]);
const auto& mask = buildATensor(ctx, ins[5]);
auto output = buildATensor(ctx, outs[0]);
auto columns = buildATensor(ctx, outs[1]);
modulated_deform_conv_forward_cuda(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
void modulated_deform_conv_backward_cuda_parrots(
CudaContext& ctx, const SSElement& attr, const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
const auto& input = buildATensor(ctx, ins[0]);
const auto& weight = buildATensor(ctx, ins[1]);
const auto& bias = buildATensor(ctx, ins[2]);
const auto& ones = buildATensor(ctx, ins[3]);
const auto& offset = buildATensor(ctx, ins[4]);
const auto& mask = buildATensor(ctx, ins[5]);
auto columns = buildATensor(ctx, outs[0]);
auto grad_input = buildATensor(ctx, outs[1]);
auto grad_weight = buildATensor(ctx, outs[2]);
auto grad_bias = buildATensor(ctx, outs[3]);
auto grad_offset = buildATensor(ctx, outs[4]);
auto grad_mask = buildATensor(ctx, outs[5]);
auto grad_output = buildATensor(ctx, outs[6]);
modulated_deform_conv_backward_cuda(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda_parrots)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda_parrots)
.done();
#ifndef MODULATED_DEFORM_CONV_PYTORCH_H
#define MODULATED_DEFORM_CONV_PYTORCH_H
#include <torch/extension.h>
using namespace at;
void modulated_deform_conv_forward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
const int deformable_group, const bool with_bias);
void modulated_deform_conv_backward_cuda(
Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset,
Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight,
Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias);
#endif // MODULATED_DEFORM_CONV_PYTORCH_H
#include "parrots_cpp_helper.hpp" #include "pytorch_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8; #ifdef MMCV_WITH_CUDA
Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted, int offset);
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset, Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
CudaContext& ctx, cudaStream_t stream); return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
} }
#endif
void nms_cpu(HostContext& ctx, const SSElement& attr, Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
const OperatorBase::in_list_t& ins, if (boxes.numel() == 0) {
OperatorBase::out_list_t& outs) { return at::empty({0}, boxes.options().dtype(at::kLong));
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes),
getHostProxy());
select.setZeros(syncStream());
if (boxes.size() == 0) {
outs[0] = select;
return;
} }
auto x1_t = boxes.select(1, 0).contiguous();
auto y1_t = boxes.select(1, 1).contiguous();
auto x2_t = boxes.select(1, 2).contiguous();
auto y2_t = boxes.select(1, 3).contiguous();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
fill(ctx, select, *toScalar(1)); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto select_ptr = select.ptr<int64_t>(); auto nboxes = boxes.size(0);
auto boxes_ptr = boxes.ptr<float>(); Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool));
auto order_ptr = order.ptr<int64_t>();
auto areas_ptr = areas.ptr<float>(); auto select = select_t.data_ptr<bool>();
auto order = order_t.data_ptr<int64_t>();
auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) { for (int64_t _i = 0; _i < nboxes; _i++) {
if (select_ptr[_i] == 0) continue; if (select[_i] == false) continue;
auto i = order_ptr[_i]; auto i = order[_i];
auto ix1 = boxes_ptr[i * boxes_dim]; auto ix1 = x1[i];
auto iy1 = boxes_ptr[i * boxes_dim + 1]; auto iy1 = y1[i];
auto ix2 = boxes_ptr[i * boxes_dim + 2]; auto ix2 = x2[i];
auto iy2 = boxes_ptr[i * boxes_dim + 3]; auto iy2 = y2[i];
auto iarea = areas_ptr[i]; auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) { for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select_ptr[_j] == 0) continue; if (select[_j] == false) continue;
auto j = order_ptr[_j]; auto j = order[_j];
auto xx1 = fmaxf(ix1, boxes_ptr[j * boxes_dim]); auto xx1 = std::max(ix1, x1[j]);
auto yy1 = fmaxf(iy1, boxes_ptr[j * boxes_dim + 1]); auto yy1 = std::max(iy1, y1[j]);
auto xx2 = fminf(ix2, boxes_ptr[j * boxes_dim + 2]); auto xx2 = std::min(ix2, x2[j]);
auto yy2 = fminf(iy2, boxes_ptr[j * boxes_dim + 3]); auto yy2 = std::min(iy2, y2[j]);
auto w = fmaxf(0.0, xx2 - xx1 + offset); auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset); auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h; auto inter = w * h;
auto ovr = inter / (iarea + areas_ptr[j] - inter); auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) select_ptr[_j] = 0; if (ovr >= iou_threshold) select[_j] = false;
} }
} }
outs[0] = select; return order_t.masked_select(select_t);
}
Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
if (boxes.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
CHECK_CUDA_INPUT(boxes);
CHECK_CUDA_INPUT(scores);
return nms_cuda(boxes, scores, iou_threshold, offset);
#else
AT_ERROR("nms is not compiled with GPU support");
#endif
} else {
CHECK_CPU_INPUT(boxes);
CHECK_CPU_INPUT(scores);
return nms_cpu(boxes, scores, iou_threshold, offset);
}
} }
void softnms_cpu(HostContext& ctx, const SSElement& attr, Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets,
const OperatorBase::in_list_t& ins, float iou_threshold, float sigma, float min_score,
OperatorBase::out_list_t& outs) { int method, int offset) {
float iou_threshold; if (boxes.numel() == 0) {
float sigma; return at::empty({0}, boxes.options().dtype(at::kLong));
float min_score;
int method;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<float>("sigma", sigma)
.get<float>("min_score", min_score)
.get<int>("method", method)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& scores = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto boxes_ptr = boxes.ptr<float>();
auto scores_ptr = scores.ptr<float>();
auto areas_ptr = areas.ptr<float>();
auto inputs = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 6)));
auto inputs_ptr = inputs.ptr<float>();
auto dets = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 5)));
auto de = dets.ptr<float>();
for (size_t i = 0; i < nboxes; i++) {
inputs_ptr[i * 6 + 0] = boxes_ptr[i * boxes_dim + 0];
inputs_ptr[i * 6 + 1] = boxes_ptr[i * boxes_dim + 1];
inputs_ptr[i * 6 + 2] = boxes_ptr[i * boxes_dim + 2];
inputs_ptr[i * 6 + 3] = boxes_ptr[i * boxes_dim + 3];
inputs_ptr[i * 6 + 4] = scores_ptr[i];
inputs_ptr[i * 6 + 5] = areas_ptr[i];
} }
size_t pos = 0; auto x1_t = boxes.select(1, 0).contiguous();
auto inds_t = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes)); auto y1_t = boxes.select(1, 1).contiguous();
arange(ctx, *toScalar(0), *toScalar(nboxes), *toScalar(1), inds_t); auto x2_t = boxes.select(1, 2).contiguous();
auto inds = inds_t.ptr<int64_t>(); auto y2_t = boxes.select(1, 3).contiguous();
auto num_out = ctx.createDArrayLite(DArraySpec::scalar(Prim::Int64)); auto scores_t = scores.clone();
Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset);
for (size_t i = 0; i < nboxes; i++) { auto nboxes = boxes.size(0);
auto max_score = inputs_ptr[i * 6 + 4]; auto x1 = x1_t.data_ptr<float>();
auto y1 = y1_t.data_ptr<float>();
auto x2 = x2_t.data_ptr<float>();
auto y2 = y2_t.data_ptr<float>();
auto sc = scores_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
auto de = dets.data_ptr<float>();
int64_t pos = 0;
Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong));
auto inds = inds_t.data_ptr<int64_t>();
for (int64_t i = 0; i < nboxes; i++) {
auto max_score = sc[i];
auto max_pos = i; auto max_pos = i;
pos = i + 1; pos = i + 1;
// get max box // get max box
while (pos < nboxes) { while (pos < nboxes) {
if (max_score < inputs_ptr[pos * 6 + 4]) { if (max_score < sc[pos]) {
max_score = inputs_ptr[pos * 6 + 4]; max_score = sc[pos];
max_pos = pos; max_pos = pos;
} }
pos = pos + 1; pos = pos + 1;
} }
// swap // swap
auto ix1 = de[i * 5 + 0] = inputs_ptr[max_pos * 6 + 0]; auto ix1 = de[i * 5 + 0] = x1[max_pos];
auto iy1 = de[i * 5 + 1] = inputs_ptr[max_pos * 6 + 1]; auto iy1 = de[i * 5 + 1] = y1[max_pos];
auto ix2 = de[i * 5 + 2] = inputs_ptr[max_pos * 6 + 2]; auto ix2 = de[i * 5 + 2] = x2[max_pos];
auto iy2 = de[i * 5 + 3] = inputs_ptr[max_pos * 6 + 3]; auto iy2 = de[i * 5 + 3] = y2[max_pos];
auto iscore = de[i * 5 + 4] = inputs_ptr[max_pos * 6 + 4]; auto iscore = de[i * 5 + 4] = sc[max_pos];
auto iarea = inputs_ptr[max_pos * 6 + 5]; auto iarea = areas[max_pos];
auto iind = inds[max_pos]; auto iind = inds[max_pos];
inputs_ptr[max_pos * 6 + 0] = inputs_ptr[i * 6 + 0]; x1[max_pos] = x1[i];
inputs_ptr[max_pos * 6 + 1] = inputs_ptr[i * 6 + 1]; y1[max_pos] = y1[i];
inputs_ptr[max_pos * 6 + 2] = inputs_ptr[i * 6 + 2]; x2[max_pos] = x2[i];
inputs_ptr[max_pos * 6 + 3] = inputs_ptr[i * 6 + 3]; y2[max_pos] = y2[i];
inputs_ptr[max_pos * 6 + 4] = inputs_ptr[i * 6 + 4]; sc[max_pos] = sc[i];
inputs_ptr[max_pos * 6 + 5] = inputs_ptr[i * 6 + 5]; areas[max_pos] = areas[i];
inds[max_pos] = inds[i]; inds[max_pos] = inds[i];
inputs_ptr[i * 6 + 0] = ix1; x1[i] = ix1;
inputs_ptr[i * 6 + 1] = iy1; y1[i] = iy1;
inputs_ptr[i * 6 + 2] = ix2; x2[i] = ix2;
inputs_ptr[i * 6 + 3] = iy2; y2[i] = iy2;
inputs_ptr[i * 6 + 4] = iscore; sc[i] = iscore;
inputs_ptr[i * 6 + 5] = iarea; areas[i] = iarea;
inds[i] = iind; inds[i] = iind;
pos = i + 1; pos = i + 1;
while (pos < nboxes) { while (pos < nboxes) {
auto xx1 = fmaxf(ix1, inputs_ptr[pos * 6 + 0]); auto xx1 = std::max(ix1, x1[pos]);
auto yy1 = fmaxf(iy1, inputs_ptr[pos * 6 + 1]); auto yy1 = std::max(iy1, y1[pos]);
auto xx2 = fminf(ix2, inputs_ptr[pos * 6 + 2]); auto xx2 = std::min(ix2, x2[pos]);
auto yy2 = fminf(iy2, inputs_ptr[pos * 6 + 3]); auto yy2 = std::min(iy2, y2[pos]);
auto w = fmaxf(0.0, xx2 - xx1 + offset); auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset); auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h; auto inter = w * h;
auto ovr = inter / (iarea + inputs_ptr[pos * 6 + 5] - inter); auto ovr = inter / (iarea + areas[pos] - inter);
float weight = 1.; float weight = 1.;
if (method == 0) { if (method == 0) {
...@@ -186,18 +158,18 @@ void softnms_cpu(HostContext& ctx, const SSElement& attr, ...@@ -186,18 +158,18 @@ void softnms_cpu(HostContext& ctx, const SSElement& attr,
} else if (method == 1) { } else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr; if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) { } else if (method == 2) {
weight = exp(-(ovr * ovr) / sigma); weight = std::exp(-(ovr * ovr) / sigma);
} }
inputs_ptr[pos * 6 + 4] *= weight; sc[pos] *= weight;
// if box score falls below threshold, discard the box by // if box score falls below threshold, discard the box by
// swapping with last box update N // swapping with last box update N
if (inputs_ptr[pos * 6 + 4] < min_score) { if (sc[pos] < min_score) {
inputs_ptr[pos * 6 + 0] = inputs_ptr[(nboxes - 1) * 6 + 0]; x1[pos] = x1[nboxes - 1];
inputs_ptr[pos * 6 + 1] = inputs_ptr[(nboxes - 1) * 6 + 1]; y1[pos] = y1[nboxes - 1];
inputs_ptr[pos * 6 + 2] = inputs_ptr[(nboxes - 1) * 6 + 2]; x2[pos] = x2[nboxes - 1];
inputs_ptr[pos * 6 + 3] = inputs_ptr[(nboxes - 1) * 6 + 3]; y2[pos] = y2[nboxes - 1];
inputs_ptr[pos * 6 + 4] = inputs_ptr[(nboxes - 1) * 6 + 4]; sc[pos] = sc[nboxes - 1];
inputs_ptr[pos * 6 + 5] = inputs_ptr[(nboxes - 1) * 6 + 5]; areas[pos] = areas[nboxes - 1];
inds[pos] = inds[nboxes - 1]; inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1; nboxes = nboxes - 1;
pos = pos - 1; pos = pos - 1;
...@@ -205,44 +177,84 @@ void softnms_cpu(HostContext& ctx, const SSElement& attr, ...@@ -205,44 +177,84 @@ void softnms_cpu(HostContext& ctx, const SSElement& attr,
pos = pos + 1; pos = pos + 1;
} }
} }
setScalar(num_out, int64_t{nboxes}); return inds_t.slice(0, 0, nboxes);
outs[0] = dets;
outs[1] = inds_t;
outs[2] = num_out;
} }
void nms_match_cpu(HostContext& ctx, const SSElement& attr, Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
const OperatorBase::in_list_t& ins, float sigma, float min_score, int method, int offset) {
OperatorBase::out_list_t& outs) { if (boxes.device().is_cuda()) {
float iou_threshold; AT_ERROR("softnms is not implemented on GPU");
SSAttrs(attr).get<float>("iou_threshold", iou_threshold).done(); } else {
return softnms_cpu(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
}
} }
PARROTS_EXTENSION_REGISTER(nms) std::vector<std::vector<int> > nms_match_cpu(Tensor dets, float iou_threshold) {
.attr("iou_threshold") auto x1_t = dets.select(1, 0).contiguous();
.attr("offset") auto y1_t = dets.select(1, 1).contiguous();
.input(3) auto x2_t = dets.select(1, 2).contiguous();
.output(1) auto y2_t = dets.select(1, 3).contiguous();
.apply(nms_cpu) auto scores = dets.select(1, 4).contiguous();
#ifdef PARROTS_USE_CUDA
.apply(nms_cuda) at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
#endif
.done(); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
PARROTS_EXTENSION_REGISTER(softnms) auto ndets = dets.size(0);
.attr("iou_threshold") at::Tensor suppressed_t =
.attr("sigma") at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
.attr("min_score")
.attr("method") auto suppressed = suppressed_t.data_ptr<uint8_t>();
.attr("offset") auto order = order_t.data_ptr<int64_t>();
.input(3) auto x1 = x1_t.data_ptr<float>();
.output(3) auto y1 = y1_t.data_ptr<float>();
.apply(softnms_cpu) auto x2 = x2_t.data_ptr<float>();
.done(); auto y2 = y2_t.data_ptr<float>();
auto areas = areas_t.data_ptr<float>();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold") std::vector<int> keep;
.input(1) std::vector<std::vector<int> > matched;
.output(1)
.apply(nms_match_cpu) for (int64_t _i = 0; _i < ndets; _i++) {
.done(); auto i = order[_i];
if (suppressed[i] == 1) continue;
keep.push_back(i);
std::vector<int> v_i;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<float>(0), xx2 - xx1);
auto h = std::max(static_cast<float>(0), yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
v_i.push_back(j);
}
}
matched.push_back(v_i);
}
for (int i = 0; i < keep.size(); i++)
matched[i].insert(matched[i].begin(), keep[i]);
return matched;
}
std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold) {
if (dets.device().is_cuda()) {
AT_ERROR("nms_match is not implemented on GPU");
} else {
return nms_match_cpu(dets, iou_threshold);
}
}
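For reference, the nms() entry point above can be exercised directly from C++ when this file is linked against libtorch; the boxes and threshold below are illustrative only:

#include <torch/torch.h>
#include <iostream>

// Implemented in the file above.
at::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,
               int offset);

int main() {
  // Three boxes in [x1, y1, x2, y2] format; boxes 0 and 1 overlap heavily.
  auto boxes = torch::tensor({0.f,  0.f,  10.f, 10.f, 1.f,  1.f,
                              11.f, 11.f, 20.f, 20.f, 30.f, 30.f})
                   .reshape({3, 4});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f});
  auto keep = nms(boxes, scores, /*iou_threshold=*/0.5f, /*offset=*/0);
  std::cout << keep << std::endl;  // expected: indices 0 and 2 survive
  return 0;
}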
#include "nms_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                             int offset) {
  at::cuda::CUDAGuard device_guard(boxes.device());

  if (boxes.numel() == 0) {
    return at::empty({0}, boxes.options().dtype(at::kLong));
  }
  auto order_t = std::get<1>(scores.sort(0, /*descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);
  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
  Tensor mask =
      at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  nms_cuda<<<blocks, threads, 0, stream>>>(
      boxes_num, iou_threshold, offset, boxes_sorted.data_ptr<float>(),
      (unsigned long long*)mask.data_ptr<int64_t>());

  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host =
      (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep_t =
      at::zeros({boxes_num}, boxes.options().dtype(at::kBool).device(at::kCPU));
  bool* keep = keep_t.data_ptr<bool>();

  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;
    if (!(remv[nblock] & (1ULL << inblock))) {
      keep[i] = true;
      // set every overlap box with bit 1 in remv
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }
  AT_CUDA_CHECK(cudaGetLastError());
  return order_t.masked_select(keep_t.to(at::kCUDA));
}
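// A minimal sketch (not part of the launcher above) of how the suppression
// bitmask written by nms_cuda is laid out on the host: row i holds col_blocks
// 64-bit words, and bit (j % 64) of word (j / 64) is set when box i and box j
// overlap with IoU >= iou_threshold. kThreadsPerBlock is assumed to equal the
// threadsPerBlock constant from nms_cuda_kernel.cuh (64, one bit per thread).
namespace {
constexpr int kThreadsPerBlock = 64;

inline bool overlaps(const unsigned long long* mask_host, int col_blocks,
                     int i, int j) {
  const unsigned long long* row = mask_host + i * col_blocks;
  return (row[j / kThreadsPerBlock] >> (j % kThreadsPerBlock)) & 1ULL;
}
}  // namespace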
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "nms_pytorch.h"
using namespace parrots;
// Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset);
template <typename T>
void nms_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("offset", offset)
.done();
at::Tensor boxes, scores;
boxes = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
auto out = nms(boxes, scores, iou_threshold, offset);
updateDArray(ctx, out, outs[0]);
}
/*Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold,
* float sigma, float min_score, int method, int offset);*/
template <typename T>
void softnms_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold, sigma, min_score;
int method, offset;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("sigma", sigma)
.get("min_score", min_score)
.get("method", method)
.get("offset", offset)
.done();
at::Tensor boxes, scores, dets;
boxes = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
dets = buildATensor(ctx, ins[2]);
auto out = softnms(boxes, scores, dets, iou_threshold, sigma, min_score,
method, offset);
updateDArray(ctx, out, outs[0]);
}
// std::vector<std::vector<int> > nms_match(Tensor dets, float iou_threshold);
template <typename T>
void nms_match_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get("iou_threshold", iou_threshold).done();
at::Tensor dets;
dets = buildATensor(ctx, ins[0]);
auto out = nms_match(dets, iou_threshold);
int n = out.size(), m = 0;
for (int i = 0; i < n; ++i)
if (m < out[i].size()) m = out[i].size();
auto options = torch::TensorOptions().dtype(at::kInt);
auto tensor = torch::zeros({n, m}, options);
for (int i = 0; i < n; i++)
tensor.slice(0, i, i + 1) =
torch::from_blob(out[i].data(), {out[i].size()}, options);
updateDArray(ctx, tensor, outs[0]);
}
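// A minimal sketch of the padding scheme used above: ragged match groups are
// packed into an n x m int32 tensor, with rows shorter than m left as zeros.
// pack_groups is illustrative only and is not part of the registered op.
#include <algorithm>

static at::Tensor pack_groups(const std::vector<std::vector<int> >& groups) {
  int64_t n = groups.size(), m = 0;
  for (const auto& g : groups) m = std::max<int64_t>(m, g.size());
  auto packed = torch::zeros({n, m}, torch::TensorOptions().dtype(at::kInt));
  for (int64_t i = 0; i < n; ++i)
    for (int64_t j = 0; j < (int64_t)groups[i].size(); ++j)
      packed[i][j] = groups[i][j];
  return packed;
}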
/*Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
* const Tensor dets_sorted, const float iou_threshold,
* const int multi_label);*/
template <typename T>
void nms_rotated_parrots(T& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int multi_label;
SSAttrs(attr)
.get("iou_threshold", iou_threshold)
.get("multi_label", multi_label)
.done();
at::Tensor dets, scores, order, dets_sorted;
dets = buildATensor(ctx, ins[0]);
scores = buildATensor(ctx, ins[1]);
order = buildATensor(ctx, ins[2]);
dets_sorted = buildATensor(ctx, ins[3]);
auto out =
nms_rotated(dets, scores, order, dets_sorted, iou_threshold, multi_label);
updateDArray(ctx, out, outs[0]);
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(2)
.output(1)
.apply(nms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(1)
.apply(softnms_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(softnms_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_match_parrots<CudaContext>)
#endif
.done();
PARROTS_EXTENSION_REGISTER(nms_rotated)
.attr("multi_label")
.attr("iou_threshold")
.input(4)
.output(1)
.apply(nms_rotated_parrots<HostContext>)
#ifdef MMCV_WITH_CUDA
.apply(nms_rotated_parrots<CudaContext>)
#endif
.done();
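// The four registrations above follow one pattern: each .attr() declares a
// scalar attribute that the wrapper reads back through SSAttrs, .input() and
// .output() give the number of DArray arguments, and .apply() binds the host
// (and, when built with MMCV_WITH_CUDA, the CUDA) wrapper that converts the
// DArrays with buildATensor / updateDArray around the shared PyTorch
// implementation. A hypothetical new op with one float attribute would be
// registered the same way (illustrative only, not a real op):
//
//   PARROTS_EXTENSION_REGISTER(my_op)
//       .attr("alpha")
//       .input(1)
//       .output(1)
//       .apply(my_op_parrots<HostContext>)
//   #ifdef MMCV_WITH_CUDA
//       .apply(my_op_parrots<CudaContext>)
//   #endif
//       .done();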
#ifndef NMS_PYTORCH_H
#define NMS_PYTORCH_H
#include <torch/extension.h>
at::Tensor nms(at::Tensor boxes, at::Tensor scores, float iou_threshold,
int offset);
at::Tensor softnms(at::Tensor boxes, at::Tensor scores, at::Tensor dets,
float iou_threshold, float sigma, float min_score,
int method, int offset);
std::vector<std::vector<int> > nms_match(at::Tensor dets, float iou_threshold);
at::Tensor nms_rotated(const at::Tensor dets, const at::Tensor scores,
const at::Tensor order, const at::Tensor dets_sorted,
const float iou_threshold, const int multi_label);
#endif // NMS_PYTORCH_H
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h
#include "pytorch_cpp_helper.hpp"

Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
                       const float iou_threshold);

#ifdef MMCV_WITH_CUDA
Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
                        const Tensor order, const Tensor dets_sorted,
                        const float iou_threshold, const int multi_label);
#endif

// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by different cpps
Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order,
                   const Tensor dets_sorted, const float iou_threshold,
                   const int multi_label) {
  assert(dets.device().is_cuda() == scores.device().is_cuda());
  if (dets.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold,
                            multi_label);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  return nms_rotated_cpu(dets, scores, iou_threshold);
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp
#include "box_iou_rotated_utils.hpp"
#include "pytorch_cpp_helper.hpp"
template <typename scalar_t>
Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores,
const float iou_threshold) {
// nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.h
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor");
AT_ASSERTM(dets.type() == scores.type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) {
continue;
}
keep[num_to_keep++] = i;
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) {
continue;
}
auto ovr = single_box_iou_rotated<scalar_t>(
dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>(), 0);
if (ovr >= iou_threshold) {
suppressed[j] = 1;
}
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores,
const float iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
}
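// Note on nms_rotated_cpu_kernel above: boxes are visited in descending score
// order via order_t, and a box survives only if no previously kept box
// overlaps it with IoU >= iou_threshold, so the worst case is O(ndets^2)
// calls to single_box_iou_rotated. The returned tensor holds the indices of
// the kept boxes in score order.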
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu
#include "nms_rotated_cuda.cuh"
#include "pytorch_cuda_helper.hpp"

Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores,
                        const Tensor order_t, const Tensor dets_sorted,
                        float iou_threshold, const int multi_label) {
  // using scalar_t = float;
  AT_ASSERTM(dets.type().is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.type().is_cuda(), "scores must be a CUDA tensor");
  at::cuda::CUDAGuard device_guard(dets.device());

  int dets_num = dets.size(0);

  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);

  Tensor mask =
      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      dets_sorted.type(), "nms_rotated_kernel_cuda", [&] {
        nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
            dets_num, iou_threshold, dets_sorted.data<scalar_t>(),
            (unsigned long long*)mask.data<int64_t>(), multi_label);
      });

  Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host = (unsigned long long*)mask_cpu.data<int64_t>();

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_CUDA_CHECK(cudaGetLastError());
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "pytorch_cpp_helper.hpp"

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

@@ -11,8 +12,8 @@
void psamask_collect_forward(const int num_, const int h_feature,
                             const int w_feature, const int h_mask,
                             const int w_mask, const int half_h_mask,
                             const int half_w_mask, const Tensor mask_data,
                             Tensor buffer_data) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
@@ -27,12 +28,13 @@ void psamask_collect_forward(const int num_, const int h_feature,
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            buffer_data.view({-1})[(n * h_feature * w_feature +
                                    (hidx + h - half_h_mask) * w_feature +
                                    (widx + w - half_w_mask)) *
                                       h_feature * w_feature +
                                   h * w_feature + w] =
                mask_data.view(
                    {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                               h_feature +
                           h) *
                              w_feature +
@@ -47,8 +49,8 @@ void psamask_collect_forward(const int num_, const int h_feature,
void psamask_distribute_forward(const int num_, const int h_feature,
                                const int w_feature, const int h_mask,
                                const int w_mask, const int half_h_mask,
                                const int half_w_mask, const Tensor mask_data,
                                Tensor buffer_data) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
@@ -63,11 +65,13 @@ void psamask_distribute_forward(const int num_, const int h_feature,
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            buffer_data.view(
                {-1})[(n * h_feature * w_feature + h * w_feature + w) *
                          h_feature * w_feature +
                      (hidx + h - half_h_mask) * w_feature +
                      (widx + w - half_w_mask)] =
                mask_data.view(
                    {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                               h_feature +
                           h) *
                              w_feature +
@@ -82,8 +86,8 @@ void psamask_distribute_forward(const int num_, const int h_feature,
void psamask_collect_backward(const int num_, const int h_feature,
                              const int w_feature, const int h_mask,
                              const int w_mask, const int half_h_mask,
                              const int half_w_mask, const Tensor buffer_diff,
                              Tensor mask_diff) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
@@ -98,11 +102,12 @@ void psamask_collect_backward(const int num_, const int h_feature,
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                                      h_feature +
                                  h) *
                                     w_feature +
                                 w] =
                buffer_diff.view({-1})[(n * h_feature * w_feature +
                                        (hidx + h - half_h_mask) * w_feature +
                                        (widx + w - half_w_mask)) *
                                           h_feature * w_feature +
@@ -118,7 +123,7 @@ void psamask_distribute_backward(const int num_, const int h_feature,
                                 const int w_feature, const int h_mask,
                                 const int w_mask, const int half_h_mask,
                                 const int half_w_mask,
                                 const Tensor buffer_diff, Tensor mask_diff) {
  for (int n = 0; n < num_; n++) {
    for (int h = 0; h < h_feature; h++) {
      for (int w = 0; w < w_feature; w++) {
@@ -133,15 +138,16 @@ void psamask_distribute_backward(const int num_, const int h_feature,
        // feature-indexed
        for (int hidx = hstart; hidx < hend; hidx++) {
          for (int widx = wstart; widx < wend; widx++) {
            mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) *
                                      h_feature +
                                  h) *
                                     w_feature +
                                 w] =
                buffer_diff.view(
                    {-1})[(n * h_feature * w_feature + h * w_feature + w) *
                              h_feature * w_feature +
                          (hidx + h - half_h_mask) * w_feature +
                          (widx + w - half_w_mask)];
          }
        }
      }
@@ -149,156 +155,101 @@ void psamask_distribute_backward(const int num_, const int h_feature,
    }
  }
}

void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
                         const int num_, const int h_feature,
                         const int w_feature, const int h_mask,
                         const int w_mask, const int half_h_mask,
                         const int half_w_mask) {
  if (psa_type == 0)
    psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
                            half_h_mask, half_w_mask, input, output);
  else
    psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
                               half_h_mask, half_w_mask, input, output);
}

void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
                          Tensor grad_input, const int num_,
                          const int h_feature, const int w_feature,
                          const int h_mask, const int w_mask,
                          const int half_h_mask, const int half_w_mask) {
  if (psa_type == 0)
    psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
                             half_h_mask, half_w_mask, grad_output, grad_input);
  else
    psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
                                half_h_mask, half_w_mask, grad_output,
                                grad_input);
}

#ifdef MMCV_WITH_CUDA
void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask);

void PSAMaskBackwardCUDAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask);

void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
                          const int num_, const int h_feature,
                          const int w_feature, const int h_mask,
                          const int w_mask, const int half_h_mask,
                          const int half_w_mask) {
  PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
                                   w_feature, h_mask, w_mask, half_h_mask,
                                   half_w_mask);
}

void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
                           Tensor grad_input, const int num_,
                           const int h_feature, const int w_feature,
                           const int h_mask, const int w_mask,
                           const int half_h_mask, const int half_w_mask) {
  PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
                                    h_feature, w_feature, h_mask, w_mask,
                                    half_h_mask, half_w_mask);
}
#endif

void psamask_forward(const Tensor input, Tensor output, const int psa_type,
                     const int num_, const int h_feature, const int w_feature,
                     const int h_mask, const int w_mask, const int half_h_mask,
                     const int half_w_mask) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(output);
    psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
                         h_mask, w_mask, half_h_mask, half_w_mask);
#else
    AT_ERROR("PSAMask is not compiled with GPU support");
#endif
  } else {
    psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
                        h_mask, w_mask, half_h_mask, half_w_mask);
  }
}

void psamask_backward(Tensor grad_output, const Tensor grad_input,
                      const int psa_type, const int num_, const int h_feature,
                      const int w_feature, const int h_mask, const int w_mask,
                      const int half_h_mask, const int half_w_mask) {
  if (grad_input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_input);
    CHECK_CUDA_INPUT(grad_output);
    psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
                          w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
#else
    AT_ERROR("PSAMask is not compiled with GPU support");
#endif
  } else {
    psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
                         w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
  }
}
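// A minimal sketch (assumed shapes, not part of the op) of the flat indices
// used by the collect kernels above. The mask tensor is treated as
// (num_, h_mask * w_mask, h_feature, w_feature) and the buffer as
// (num_, h_feature * w_feature, h_feature, w_feature); collect copies the mask
// value at relative offset (hidx, widx) into the buffer channel of the
// absolute position that offset points to.
inline int psamask_mask_index(int n, int hidx, int widx, int h, int w,
                              int h_mask, int w_mask, int h_feature,
                              int w_feature) {
  return ((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
             w_feature +
         w;
}

inline int psamask_buffer_index(int n, int hidx, int widx, int h, int w,
                                int half_h_mask, int half_w_mask, int h_feature,
                                int w_feature) {
  return (n * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature +
          (widx + w - half_w_mask)) *
             h_feature * w_feature +
         h * w_feature + w;
}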
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include <THC/THC.h>
#include <torch/serialize/tensor.h>

#include <THC/THCDeviceUtils.cuh>

#include "psamask_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"

void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input,
                                      Tensor output, const int num_,
                                      const int h_feature, const int w_feature,
                                      const int h_mask, const int w_mask,
                                      const int half_h_mask,
                                      const int half_w_mask) {
  int nthreads = num_ * h_feature * w_feature;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  if (psa_type == 0)
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "psamask_collect_forward_cuda", [&] {
          psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
              half_w_mask, input.data_ptr<scalar_t>(),
              output.data_ptr<scalar_t>());
        });
  else
    AT_DISPATCH_FLOATING_TYPES(
        input.scalar_type(), "psamask_distribute_forward_cuda", [&] {
          psamask_distribute_forward_cuda<scalar_t>
              <<<nthreads, 512, 0, stream>>>(
                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                  half_w_mask, input.data_ptr<scalar_t>(),
                  output.data_ptr<scalar_t>());
        });
}

void PSAMaskBackwardCUDAKernelLauncher(
    const int psa_type, const Tensor grad_output, Tensor grad_input,
    const int num_, const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int half_h_mask, const int half_w_mask) {
  int nthreads = num_ * h_feature * w_feature;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  if (psa_type == 0)
    AT_DISPATCH_FLOATING_TYPES(
        grad_input.scalar_type(), "psamask_collect_backward_cuda", [&] {
          psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
              nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
              half_w_mask, grad_output.data_ptr<scalar_t>(),
              grad_input.data_ptr<scalar_t>());
        });
  else
    AT_DISPATCH_FLOATING_TYPES(
        grad_input.scalar_type(), "psamask_distribute_backward_cuda", [&] {
          psamask_distribute_backward_cuda<scalar_t>
              <<<nthreads, 512, 0, stream>>>(
                  nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
                  half_w_mask, grad_output.data_ptr<scalar_t>(),
                  grad_input.data_ptr<scalar_t>());
        });
}
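// Launch-configuration note (an assumption about psamask_cuda_kernel.cuh, not
// verified here): the kernels are launched with nthreads blocks of 512 threads
// and are expected to guard or grid-stride over nthreads output positions, so
// each (n, h, w) location of the feature map is processed exactly once
// regardless of the surplus threads in the grid.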
#include <parrots/compute/aten.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include "psamask_pytorch.h"
using namespace parrots;
#ifdef MMCV_WITH_CUDA
void psamask_forward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = buildATensor(ctx, ins[0]);
auto output = buildATensor(ctx, outs[0]);
psamask_forward_cuda(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward_cuda_parrots(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &grad_output = buildATensor(ctx, ins[0]);
auto grad_input = buildATensor(ctx, outs[0]);
psamask_backward_cuda(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
#endif
void psamask_forward_cpu_parrots(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = buildATensor(ctx, ins[0]);
auto output = buildATensor(ctx, outs[0]);
psamask_forward_cpu(psa_type, input, output, num_, h_feature, w_feature,
h_mask, w_mask, half_h_mask, half_w_mask);
}
void psamask_backward_cpu_parrots(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &grad_output = buildATensor(ctx, ins[0]);
auto grad_input = buildATensor(ctx, outs[0]);
psamask_backward_cpu(psa_type, grad_output, grad_input, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask, half_w_mask);
}
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(psamask_forward_cuda_parrots)
#endif
.done();
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu_parrots)
#ifdef MMCV_WITH_CUDA
.apply(psamask_backward_cuda_parrots)
#endif
.done();
#ifndef PSAMASK_PYTORCH_H
#define PSAMASK_PYTORCH_H
#include <torch/extension.h>
using namespace at;
#ifdef MMCV_WITH_CUDA
void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_cuda(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
#endif
void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_cpu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
#endif // PSAMASK_PYTORCH_H
#include "pytorch_cpp_helper.hpp"

#ifdef MMCV_WITH_CUDA
void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output,
                                       Tensor argmax_y, Tensor argmax_x,
                                       int aligned_height, int aligned_width,
                                       float spatial_scale, int sampling_ratio,
                                       int pool_mode, bool aligned);

void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
                                        Tensor argmax_y, Tensor argmax_x,
                                        Tensor grad_input, int aligned_height,
                                        int aligned_width, float spatial_scale,
                                        int sampling_ratio, int pool_mode,
                                        bool aligned);

void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output,
                            Tensor argmax_y, Tensor argmax_x,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignForwardCUDAKernelLauncher(
      input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
      spatial_scale, sampling_ratio, pool_mode, aligned);
}

void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y,
                             Tensor argmax_x, Tensor grad_input,
                             int aligned_height, int aligned_width,
                             float spatial_scale, int sampling_ratio,
                             int pool_mode, bool aligned) {
  ROIAlignBackwardCUDAKernelLauncher(
      grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
      aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
#endif

void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                Tensor argmax_y, Tensor argmax_x,
                                int aligned_height, int aligned_width,
                                float spatial_scale, int sampling_ratio,
                                int pool_mode, bool aligned);

void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                 Tensor argmax_y, Tensor argmax_x,
                                 Tensor grad_input, int aligned_height,
                                 int aligned_width, float spatial_scale,
                                 int sampling_ratio, int pool_mode,
                                 bool aligned);

void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output,
                           Tensor argmax_y, Tensor argmax_x, int aligned_height,
                           int aligned_width, float spatial_scale,
                           int sampling_ratio, int pool_mode, bool aligned) {
  ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x,
                             aligned_height, aligned_width, spatial_scale,
                             sampling_ratio, pool_mode, aligned);
}

void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y,
                            Tensor argmax_x, Tensor grad_input,
                            int aligned_height, int aligned_width,
                            float spatial_scale, int sampling_ratio,
                            int pool_mode, bool aligned) {
  ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input,
                              aligned_height, aligned_width, spatial_scale,
                              sampling_ratio, pool_mode, aligned);
}

void roi_align_forward(Tensor input, Tensor rois, Tensor output,
                       Tensor argmax_y, Tensor argmax_x, int aligned_height,
                       int aligned_width, float spatial_scale,
                       int sampling_ratio, int pool_mode, bool aligned) {
  if (input.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(input);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(output);
    CHECK_CUDA_INPUT(argmax_y);
    CHECK_CUDA_INPUT(argmax_x);

    roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x,
                           aligned_height, aligned_width, spatial_scale,
                           sampling_ratio, pool_mode, aligned);
#else
    AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(input);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(output);
    CHECK_CPU_INPUT(argmax_y);
    CHECK_CPU_INPUT(argmax_x);

    roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x,
                          aligned_height, aligned_width, spatial_scale,
                          sampling_ratio, pool_mode, aligned);
  }
}

void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y,
                        Tensor argmax_x, Tensor grad_input, int aligned_height,
                        int aligned_width, float spatial_scale,
                        int sampling_ratio, int pool_mode, bool aligned) {
  if (grad_output.device().is_cuda()) {
#ifdef MMCV_WITH_CUDA
    CHECK_CUDA_INPUT(grad_output);
    CHECK_CUDA_INPUT(rois);
    CHECK_CUDA_INPUT(argmax_y);
    CHECK_CUDA_INPUT(argmax_x);
    CHECK_CUDA_INPUT(grad_input);

    roi_align_backward_cuda(grad_output, rois, argmax_y, argmax_x, grad_input,
                            aligned_height, aligned_width, spatial_scale,
                            sampling_ratio, pool_mode, aligned);
#else
    AT_ERROR("RoIAlign is not compiled with GPU support");
#endif
  } else {
    CHECK_CPU_INPUT(grad_output);
    CHECK_CPU_INPUT(rois);
    CHECK_CPU_INPUT(argmax_y);
    CHECK_CPU_INPUT(argmax_x);
    CHECK_CPU_INPUT(grad_input);

    roi_align_backward_cpu(grad_output, rois, argmax_y, argmax_x, grad_input,
                           aligned_height, aligned_width, spatial_scale,
                           sampling_ratio, pool_mode, aligned);
  }
}
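// The CHECK_CUDA_INPUT / CHECK_CPU_INPUT macros come from
// pytorch_cpp_helper.hpp; a minimal sketch of what such checks amount to
// (an assumption, not the header's exact definition):
//
//   #define CHECK_CUDA_INPUT(x)                                         \
//     TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor");   \
//     TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
//
//   #define CHECK_CPU_INPUT(x)                                          \
//     TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor");   \
//     TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")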
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>

#include "../pytorch_cpp_helper.hpp"

// implementation taken from Caffe2
template <typename T>
@@ -133,8 +134,8 @@ void ROIAlignForward(const int nthreads, const T* input, const T* rois,
    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlign cannot have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
@@ -294,8 +295,8 @@ void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (aligned) {
      AT_ASSERTM(roi_width >= 0 && roi_height >= 0,
                 "ROIs in ROIAlign do not have non-negative size!");
    } else {  // for backward-compatibility only
      roi_width = std::max(roi_width, (T)1.);
      roi_height = std::max(roi_height, (T)1.);
@@ -378,38 +379,37 @@ void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois,
  }  // for
}  // ROIAlignBackward

void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output,
                                Tensor argmax_y, Tensor argmax_x,
                                int aligned_height, int aligned_width,
                                float spatial_scale, int sampling_ratio,
                                int pool_mode, bool aligned) {
  int output_size = output.numel();
  int channels = input.size(1);
  int height = input.size(2);
  int width = input.size(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "ROIAlign_forward", [&] {
        ROIAlignForward<scalar_t>(
            output_size, input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
            argmax_x.data_ptr<scalar_t>(), aligned_height, aligned_width,
            static_cast<scalar_t>(spatial_scale), sampling_ratio, pool_mode,
            aligned, channels, height, width);
      });
}

void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois,
                                 Tensor argmax_y, Tensor argmax_x,
                                 Tensor grad_input, int aligned_height,
                                 int aligned_width, float spatial_scale,
                                 int sampling_ratio, int pool_mode,
                                 bool aligned) {
  int output_size = grad_output.numel();
  int channels = grad_input.size(1);
  int height = grad_input.size(2);
  int width = grad_input.size(3);

  // get stride values to ensure indexing into gradients is correct.
  int n_stride = grad_output.stride(0);
  int c_stride = grad_output.stride(1);
  int h_stride = grad_output.stride(2);
  int w_stride = grad_output.stride(3);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      grad_output.scalar_type(), "ROIAlign_backward", [&] {
        ROIAlignBackward<scalar_t>(
            output_size, grad_output.data_ptr<scalar_t>(),
            rois.data_ptr<scalar_t>(), argmax_y.data_ptr<scalar_t>(),
            argmax_x.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(),
            aligned_height, aligned_width, static_cast<scalar_t>(spatial_scale),
            sampling_ratio, pool_mode, aligned, channels, height, width,
            n_stride, c_stride, h_stride, w_stride);
      });
}
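// Shape note for the launchers above (inferred from the indexing, stated as an
// assumption): input is (N, C, H, W), rois is (K, 5) with rows
// [batch_idx, x1, y1, x2, y2], and output / argmax_y / argmax_x are
// (K, C, aligned_height, aligned_width), so output.numel() drives one loop
// iteration per output element inside ROIAlignForward.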