Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test not import mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in pytorch 1.5
2. support CPU-only compilation
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream);
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
}
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
  auto& output = outs[0];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
                                            gamma, alpha, stream);
}
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& buff = outs[0];
auto& grad_input = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha, stream);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
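// For reference, these ops implement the focal loss of Lin et al., "Focal
// Loss for Dense Object Detection" (ICCV 2017). The following host-side
// sketch of the per-element sigmoid variant is illustrative only and is not
// used by the extension; the CUDA kernels parallelize the same computation
// over every (sample, class) pair.
#include <cmath>
static inline float sigmoid_focal_loss_ref(float x, bool is_target,
                                           float gamma, float alpha) {
  const float p = 1.0f / (1.0f + std::exp(-x));       // sigmoid probability
  const float pt = is_target ? p : 1.0f - p;          // prob. of the true class
  const float at = is_target ? alpha : 1.0f - alpha;  // class-balance factor
  return -at * std::pow(1.0f - pt, gamma) * std::log(std::fmax(pt, 1e-12f));
}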
#include "parrots_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) {
int output_size = grad_input.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma,
alpha, num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.elemType().prim(), ([&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream) {
int output_size = buff.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
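// Note: the softmax focal loss backward pass above is deliberately split into
// two kernels: cuda1 writes the per-sample focal weight into `buff` (launched
// over buff.size() elements), and cuda2 then applies the softmax chain rule
// across all classes using that buffer (launched over grad_input.size()
// elements), so the weight is computed once per sample rather than once per
// (sample, class) pair.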
#include "parrots_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream);
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
                                           const DArrayLite mask_h_idx,
                                           const DArrayLite mask_w_idx,
                                           DArrayLite top_data, const int height,
                                           const int width, const int channels,
                                           cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w,
stream);
}
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
                                        width, channels, stream);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda)
.done();
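// Note: masked_im2col gathers kernel_h x kernel_w patches only at the
// mask_cnt spatial positions listed in (mask_h_idx, mask_w_idx), so the
// masked convolution reduces to a single GEMM over mask_cnt columns instead
// of a dense im2col over the whole feature map; masked_col2im scatters the
// GEMM result back to those same positions.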
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) {
int channels = bottom_data.dim(1);
int height = bottom_data.dim(2);
int width = bottom_data.dim(3);
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(),
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
                                           const DArrayLite mask_h_idx,
                                           const DArrayLite mask_w_idx,
                                           DArrayLite top_data, const int height,
                                           const int width, const int channels,
                                           cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(),
mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) 2019, SenseTime.
#include "parrots_cpp_helper.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite output, DArrayLite columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite columns, DArrayLite grad_input, DArrayLite grad_weight,
DArrayLite grad_bias, DArrayLite grad_offset, DArrayLite grad_mask,
DArrayLite grad_output, int kernel_h, int kernel_w, int stride_h,
int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w,
int group, int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void modulated_deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto output = outs[0];
auto columns = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
void modulated_deform_conv_backward_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto columns = outs[0];
auto grad_input = outs[1];
auto grad_weight = outs[2];
auto grad_bias = outs[3];
auto grad_offset = outs[4];
auto grad_mask = outs[5];
auto grad_output = outs[6];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda)
.done();
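// For reference: modulated deformable convolution (DCNv2, Zhu et al., 2019)
// augments every kernel sampling location with a learned offset and a learned
// modulation scalar, roughly
//   y(p) = sum_k w_k * x(p + p_k + delta_p_k) * delta_m_k,
// and the launchers below realize it as an offset-aware im2col
// (modulated_deformable_im2col_cuda), a grouped GEMM against the weights, and
// an optional bias add.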
#include "modulated_deform_conv_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void modulated_deformable_im2col_cuda(
const DArrayLite data_im, const DArrayLite data_offset,
const DArrayLite data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
    const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
DArrayLite data_col, cudaStream_t stream) {
// num_axes should be smaller than block size
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels = channels * batch_size * height_col * width_col;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
data_im.elemType().prim(), ([&] {
modulated_deformable_im2col_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_im.ptr<scalar_t>(), data_offset.ptr<scalar_t>(),
            data_mask.ptr<scalar_t>(), height_im, width_im, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size, channels,
deformable_group, height_col, width_col, data_col.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_cuda(
const DArrayLite data_col, const DArrayLite data_offset,
const DArrayLite data_mask, const int batch_size, const int channels,
const int height_im, const int width_im, const int height_col,
const int width_col, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w, const int deformable_group,
DArrayLite grad_im, cudaStream_t stream) {
const int channel_per_deformable_group = channels / deformable_group;
const int num_kernels =
channels * kernel_h * kernel_w * batch_size * height_col * width_col;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] {
modulated_deformable_col2im_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_col.ptr<scalar_t>(), data_offset.ptr<scalar_t>(),
data_mask.ptr<scalar_t>(), channels, height_im, width_im, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
channel_per_deformable_group, batch_size, deformable_group,
height_col, width_col, grad_im.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void modulated_deformable_col2im_coord_cuda(
const DArrayLite data_col, const DArrayLite data_im,
const DArrayLite data_offset, const DArrayLite data_mask,
const int batch_size, const int channels, const int height_im,
const int width_im, const int height_col, const int width_col,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int stride_h, const int stride_w, const int dilation_h,
const int dilation_w, const int deformable_group, DArrayLite grad_offset,
DArrayLite grad_mask, cudaStream_t stream) {
const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h *
kernel_w * deformable_group;
const int channel_per_deformable_group =
channels * kernel_h * kernel_w / deformable_group;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
data_col.elemType().prim(), ([&] {
modulated_deformable_col2im_coord_gpu_kernel<<<
GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
num_kernels, data_col.ptr<scalar_t>(), data_im.ptr<scalar_t>(),
data_offset.ptr<scalar_t>(), data_mask.ptr<scalar_t>(), channels,
height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, channel_per_deformable_group,
batch_size, 2 * kernel_h * kernel_w * deformable_group,
deformable_group, height_col, width_col,
grad_offset.ptr<scalar_t>(), grad_mask.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ModulatedDeformConvForwardCUDAKernelLauncher(
DArrayLite input, DArrayLite weight, DArrayLite bias, DArrayLite ones,
DArrayLite offset, DArrayLite mask, DArrayLite output, DArrayLite columns,
int kernel_h, int kernel_w, const int stride_h, const int stride_w,
const int pad_h, const int pad_w, const int dilation_h,
const int dilation_w, const int group, const int deformable_group,
const bool with_bias, CudaContext& ctx, cudaStream_t stream) {
const int batch = input.dim(0);
const int channels = input.dim(1);
const int height = input.dim(2);
const int width = input.dim(3);
const int channels_out = weight.dim(0);
const int channels_kernel = weight.dim(1);
const int kernel_h_ = weight.dim(2);
const int kernel_w_ = weight.dim(3);
PARROTS_CHECKARGS(kernel_h_ == kernel_h && kernel_w_ == kernel_w)
<< "Input shape and kernel shape wont match: (" << kernel_h << " x "
<< kernel_w << " vs " << kernel_h_ << " x " << kernel_w_ << ").";
PARROTS_CHECKARGS(channels == channels_kernel * group)
<< "Input shape and kernel channels wont match: (" << channels << " vs "
<< channels_kernel * group << ").";
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndims() != 2 || ones.dim(0) * ones.dim(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = ctx.createDArrayLite(input.elemType(),
DArrayShape(height_out, width_out));
fill(ctx, ones, *toScalar(1));
}
// resize output
output = output.view({batch, channels_out, height_out, width_out});
output.setZeros(ctx.getStream());
// resize temporary columns
columns = ctx.createDArrayLite(
input.elemType(),
DArrayShape(channels * kernel_h * kernel_w, 1 * height_out * width_out));
columns.setZeros(ctx.getStream());
output = output.view({output.dim(0), group, output.dim(1) / group,
output.dim(2), output.dim(3)});
for (size_t b = 0; b < batch; b++) {
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns, stream);
// divide into group
weight = weight.view({group, weight.dim(0) / group, weight.dim(1),
weight.dim(2), weight.dim(3)});
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
for (size_t g = 0; g < group; g++) {
auto output_g = output[b][g];
gemm(ctx, 1, false,
weight[g].view(
{weight.dim(1), weight.dim(2) * weight.dim(3) * weight.dim(4)}),
false, columns[g], 1, output_g);
}
weight = weight.view({weight.dim(0) * weight.dim(1), weight.dim(2),
weight.dim(3), weight.dim(4)});
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)});
}
output = output.view({output.dim(0), output.dim(1) * output.dim(2),
output.dim(3), output.dim(4)});
if (with_bias) {
bias = bias.view({1, bias.dim(0), 1, 1});
add(ctx, output, bias, output);
}
}
void ModulatedDeformConvBackwardCUDAKernelLauncher(
DArrayLite input, DArrayLite weight, DArrayLite bias, DArrayLite ones,
DArrayLite offset, DArrayLite mask, DArrayLite columns,
DArrayLite grad_input, DArrayLite grad_weight, DArrayLite grad_bias,
DArrayLite grad_offset, DArrayLite grad_mask, DArrayLite grad_output,
int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
const bool with_bias, CudaContext& ctx, cudaStream_t stream) {
const int batch = input.dim(0);
const int channels = input.dim(1);
const int height = input.dim(2);
const int width = input.dim(3);
const int channels_kernel = weight.dim(1);
const int kernel_h_ = weight.dim(2);
const int kernel_w_ = weight.dim(3);
PARROTS_CHECKARGS(kernel_h_ == kernel_h && kernel_w_ == kernel_w)
<< "Input shape and kernel shape wont match: (" << kernel_h << " x "
<< kernel_w << " vs " << kernel_h_ << " x " << kernel_w_ << ").";
PARROTS_CHECKARGS(channels == channels_kernel * group)
<< "Input shape and kernel channels wont match: (" << channels << " vs "
<< channels_kernel * group << ").";
const int height_out =
(height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int width_out =
(width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
if (ones.ndims() != 2 || ones.dim(0) * ones.dim(1) < height_out * width_out) {
// Resize plane and fill with ones...
ones = ctx.createDArrayLite(input.elemType(),
DArrayShape(height_out, width_out));
fill(ctx, ones, *toScalar(1));
}
grad_input = grad_input.view({batch, channels, height, width});
columns = ctx.createDArrayLite(
input.elemType(),
DArrayShape(channels * kernel_h * kernel_w, height_out * width_out));
grad_output =
grad_output.view({grad_output.dim(0), group, grad_output.dim(1) / group,
grad_output.dim(2), grad_output.dim(3)});
for (size_t b = 0; b < batch; b++) {
    // divide into group
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
weight = weight.view({group, weight.dim(0) / group, weight.dim(1),
weight.dim(2), weight.dim(3)});
for (size_t g = 0; g < group; g++) {
auto columns_g = columns[g];
gemm(ctx, 1, true,
weight[g].view(
{weight.dim(1), weight.dim(2) * weight.dim(3) * weight.dim(4)}),
false,
grad_output[b][g].view(
{grad_output.dim(2), grad_output.dim(3) * grad_output.dim(4)}),
0, columns_g);
}
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)});
weight = weight.view({weight.dim(0) * weight.dim(1), weight.dim(2),
weight.dim(3), weight.dim(4)});
// gradient w.r.t. input coordinate data
modulated_deformable_col2im_coord_cuda(
columns, input[b], offset[b], mask[b], 1, channels, height, width,
height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
grad_mask[b], stream);
// gradient w.r.t. input data
modulated_deformable_col2im_cuda(
columns, offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, grad_input[b], stream);
// gradient w.r.t. weight, dWeight should accumulate across the batch and
// group
modulated_deformable_im2col_cuda(
input[b], offset[b], mask[b], 1, channels, height, width, height_out,
width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, deformable_group, columns, stream);
columns = columns.view({group, columns.dim(0) / group, columns.dim(1)});
grad_weight =
grad_weight.view({group, grad_weight.dim(0) / group, grad_weight.dim(1),
grad_weight.dim(2), grad_weight.dim(3)});
if (with_bias) {
grad_bias = grad_bias.view({group, grad_bias.dim(0) / group});
}
for (size_t g = 0; g < group; g++) {
auto grad_weight_g = grad_weight[g].view(
{grad_weight.dim(1),
grad_weight.dim(2) * grad_weight.dim(3) * grad_weight.dim(4)});
gemm(ctx, 1, false,
grad_output[b][g].view(
{grad_output.dim(2), grad_output.dim(3) * grad_output.dim(4)}),
true, columns[g], 1, grad_weight_g);
if (with_bias) {
auto grad_bias_g = grad_bias[g].view({grad_bias.dim(1), 1});
gemm(ctx, 1, false,
grad_output[b][g].view(
{grad_output.dim(2), grad_output.dim(3) * grad_output.dim(4)}),
false, ones.view({ones.dim(0) * ones.dim(1), 1}), 1, grad_bias_g);
}
}
columns = columns.view({columns.dim(0) * columns.dim(1), columns.dim(2)});
grad_weight = grad_weight.view({grad_weight.dim(0) * grad_weight.dim(1),
grad_weight.dim(2), grad_weight.dim(3),
grad_weight.dim(4)});
if (with_bias)
grad_bias =
grad_bias.view(DArrayShape{grad_bias.dim(0) * grad_bias.dim(1)});
}
grad_output = grad_output.view({grad_output.dim(0) * grad_output.dim(1),
grad_output.dim(2), grad_output.dim(3),
grad_output.dim(4)});
}
#include "parrots_cpp_helper.hpp"
#define DIVUP(x, y) (((x) + (y)-1) / (y))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
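// The CUDA path encodes suppression decisions as a bitmask: threadsPerBlock
// equals the number of bits in an unsigned long long (64), and the device
// kernel fills a (boxes_num x DIVUP(boxes_num, 64)) mask in which bit j of
// row i, word b marks that box i suppresses box 64 * b + j. Checking whether
// box i is already removed is then a single bit probe,
//   remv_ptr[i / threadsPerBlock] & (1ULL << (i % threadsPerBlock)),
// as done in NMSCUDAKernelLauncher.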
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream);
void nms_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes_sorted = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
outs[0] = NMSCUDAKernelLauncher(boxes_sorted, order, areas, iou_threshold,
offset, ctx, stream);
}
void nms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& order = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes),
getHostProxy());
select.setZeros(syncStream());
if (boxes.size() == 0) {
outs[0] = select;
return;
}
fill(ctx, select, *toScalar(1));
auto select_ptr = select.ptr<int64_t>();
auto boxes_ptr = boxes.ptr<float>();
auto order_ptr = order.ptr<int64_t>();
auto areas_ptr = areas.ptr<float>();
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select_ptr[_i] == 0) continue;
auto i = order_ptr[_i];
auto ix1 = boxes_ptr[i * boxes_dim];
auto iy1 = boxes_ptr[i * boxes_dim + 1];
auto ix2 = boxes_ptr[i * boxes_dim + 2];
auto iy2 = boxes_ptr[i * boxes_dim + 3];
auto iarea = areas_ptr[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select_ptr[_j] == 0) continue;
auto j = order_ptr[_j];
auto xx1 = fmaxf(ix1, boxes_ptr[j * boxes_dim]);
auto yy1 = fmaxf(iy1, boxes_ptr[j * boxes_dim + 1]);
auto xx2 = fminf(ix2, boxes_ptr[j * boxes_dim + 2]);
auto yy2 = fminf(iy2, boxes_ptr[j * boxes_dim + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas_ptr[j] - inter);
if (ovr >= iou_threshold) select_ptr[_j] = 0;
}
}
outs[0] = select;
}
void softnms_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
float sigma;
float min_score;
int method;
int offset;
SSAttrs(attr)
.get<float>("iou_threshold", iou_threshold)
.get<float>("sigma", sigma)
.get<float>("min_score", min_score)
.get<int>("method", method)
.get<int>("offset", offset)
.done();
const auto& boxes = ins[0];
const auto& scores = ins[1];
const auto& areas = ins[2];
size_t nboxes = boxes.shape().dim(0);
size_t boxes_dim = boxes.shape().dim(1);
auto boxes_ptr = boxes.ptr<float>();
auto scores_ptr = scores.ptr<float>();
auto areas_ptr = areas.ptr<float>();
auto inputs = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 6)));
auto inputs_ptr = inputs.ptr<float>();
auto dets = ctx.createDArrayLite(
DArraySpec::array(Prim::Float32, DArrayShape(nboxes, 5)));
  auto dets_ptr = dets.ptr<float>();
for (size_t i = 0; i < nboxes; i++) {
inputs_ptr[i * 6 + 0] = boxes_ptr[i * boxes_dim + 0];
inputs_ptr[i * 6 + 1] = boxes_ptr[i * boxes_dim + 1];
inputs_ptr[i * 6 + 2] = boxes_ptr[i * boxes_dim + 2];
inputs_ptr[i * 6 + 3] = boxes_ptr[i * boxes_dim + 3];
inputs_ptr[i * 6 + 4] = scores_ptr[i];
inputs_ptr[i * 6 + 5] = areas_ptr[i];
}
size_t pos = 0;
auto inds_t = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, nboxes));
arange(ctx, *toScalar(0), *toScalar(nboxes), *toScalar(1), inds_t);
auto inds = inds_t.ptr<int64_t>();
auto num_out = ctx.createDArrayLite(DArraySpec::scalar(Prim::Int64));
for (size_t i = 0; i < nboxes; i++) {
auto max_score = inputs_ptr[i * 6 + 4];
auto max_pos = i;
pos = i + 1;
// get max box
while (pos < nboxes) {
if (max_score < inputs_ptr[pos * 6 + 4]) {
max_score = inputs_ptr[pos * 6 + 4];
max_pos = pos;
}
pos = pos + 1;
}
// swap
    auto ix1 = dets_ptr[i * 5 + 0] = inputs_ptr[max_pos * 6 + 0];
    auto iy1 = dets_ptr[i * 5 + 1] = inputs_ptr[max_pos * 6 + 1];
    auto ix2 = dets_ptr[i * 5 + 2] = inputs_ptr[max_pos * 6 + 2];
    auto iy2 = dets_ptr[i * 5 + 3] = inputs_ptr[max_pos * 6 + 3];
    auto iscore = dets_ptr[i * 5 + 4] = inputs_ptr[max_pos * 6 + 4];
auto iarea = inputs_ptr[max_pos * 6 + 5];
auto iind = inds[max_pos];
inputs_ptr[max_pos * 6 + 0] = inputs_ptr[i * 6 + 0];
inputs_ptr[max_pos * 6 + 1] = inputs_ptr[i * 6 + 1];
inputs_ptr[max_pos * 6 + 2] = inputs_ptr[i * 6 + 2];
inputs_ptr[max_pos * 6 + 3] = inputs_ptr[i * 6 + 3];
inputs_ptr[max_pos * 6 + 4] = inputs_ptr[i * 6 + 4];
inputs_ptr[max_pos * 6 + 5] = inputs_ptr[i * 6 + 5];
inds[max_pos] = inds[i];
inputs_ptr[i * 6 + 0] = ix1;
inputs_ptr[i * 6 + 1] = iy1;
inputs_ptr[i * 6 + 2] = ix2;
inputs_ptr[i * 6 + 3] = iy2;
inputs_ptr[i * 6 + 4] = iscore;
inputs_ptr[i * 6 + 5] = iarea;
inds[i] = iind;
pos = i + 1;
while (pos < nboxes) {
auto xx1 = fmaxf(ix1, inputs_ptr[pos * 6 + 0]);
auto yy1 = fmaxf(iy1, inputs_ptr[pos * 6 + 1]);
auto xx2 = fminf(ix2, inputs_ptr[pos * 6 + 2]);
auto yy2 = fminf(iy2, inputs_ptr[pos * 6 + 3]);
auto w = fmaxf(0.0, xx2 - xx1 + offset);
auto h = fmaxf(0.0, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + inputs_ptr[pos * 6 + 5] - inter);
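      // Soft-NMS score decay (Bodla et al., 2017): method 0 is classic hard
      // NMS (zero the score when ovr >= iou_threshold), method 1 decays
      // overlapping scores linearly by (1 - ovr), and method 2 applies a
      // Gaussian penalty exp(-ovr^2 / sigma).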
float weight = 1.;
if (method == 0) {
if (ovr >= iou_threshold) weight = 0;
} else if (method == 1) {
if (ovr >= iou_threshold) weight = 1 - ovr;
} else if (method == 2) {
weight = exp(-(ovr * ovr) / sigma);
}
inputs_ptr[pos * 6 + 4] *= weight;
      // if the box score falls below min_score, discard the box by swapping
      // it with the last box and updating the box count N
if (inputs_ptr[pos * 6 + 4] < min_score) {
inputs_ptr[pos * 6 + 0] = inputs_ptr[(nboxes - 1) * 6 + 0];
inputs_ptr[pos * 6 + 1] = inputs_ptr[(nboxes - 1) * 6 + 1];
inputs_ptr[pos * 6 + 2] = inputs_ptr[(nboxes - 1) * 6 + 2];
inputs_ptr[pos * 6 + 3] = inputs_ptr[(nboxes - 1) * 6 + 3];
inputs_ptr[pos * 6 + 4] = inputs_ptr[(nboxes - 1) * 6 + 4];
inputs_ptr[pos * 6 + 5] = inputs_ptr[(nboxes - 1) * 6 + 5];
inds[pos] = inds[nboxes - 1];
nboxes = nboxes - 1;
pos = pos - 1;
}
pos = pos + 1;
}
}
setScalar(num_out, int64_t{nboxes});
outs[0] = dets;
outs[1] = inds_t;
outs[2] = num_out;
}
void nms_match_cpu(HostContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float iou_threshold;
SSAttrs(attr).get<float>("iou_threshold", iou_threshold).done();
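  // NOTE: this CPU binding is currently a stub: iou_threshold is parsed but
  // no match groups are written to outs.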
}
PARROTS_EXTENSION_REGISTER(nms)
.attr("iou_threshold")
.attr("offset")
.input(3)
.output(1)
.apply(nms_cpu)
#ifdef PARROTS_USE_CUDA
.apply(nms_cuda)
#endif
.done();
PARROTS_EXTENSION_REGISTER(softnms)
.attr("iou_threshold")
.attr("sigma")
.attr("min_score")
.attr("method")
.attr("offset")
.input(3)
.output(3)
.apply(softnms_cpu)
.done();
PARROTS_EXTENSION_REGISTER(nms_match)
.attr("iou_threshold")
.input(1)
.output(1)
.apply(nms_match_cpu)
.done();
#include "nms_kernel.cuh"
#include "parrots_cuda_helper.hpp"
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream) {
size_t boxes_num = boxes_sorted.dim(0);
if (boxes_sorted.size() == 0) {
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, 0));
return select;
}
const size_t col_blocks = DIVUP(boxes_num, threadsPerBlock);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(boxes_num, col_blocks)));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
PARROTS_CUDA_CHECK(cudaGetLastError());
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.ptr<float>(),
(unsigned long long*)mask.ptr<int64_t>());
PARROTS_CUDA_CHECK(cudaGetLastError());
auto mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
auto mask_host = mask_cpu.ptr<int64_t>();
auto remv = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, col_blocks),
getHostProxy());
remv.setZeros(syncStream());
auto remv_ptr = remv.ptr<int64_t>();
auto keep_t = ctx.createDArrayLite(DArraySpec::array(Prim::Uint8, boxes_num),
getHostProxy());
keep_t.setZeros(syncStream());
auto keep = keep_t.ptr<uint8_t>();
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv_ptr[nblock] & (1ULL << inblock))) {
keep[i] = 1;
int64_t* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_ptr[j] |= p[j];
}
}
}
auto keep_cuda = ctx.createDArrayLite(keep_t, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
}
#include "parrots_cpp_helper.hpp"
using namespace parrots;
#include "parrots_cuda_helper.hpp"
using namespace parrots;
#include "parrots_cpp_helper.hpp"
#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
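// PSAMask (from PSANet, Zhao et al., 2018) expands a per-position attention
// mask of size h_mask x w_mask into a full h_feature x w_feature attention
// map. The "collect" and "distribute" variants below write the same values
// but swap the roles of the feature position (h, w) and the mask offset
// (hidx, widx) in the buffer index, i.e. each is the transpose of the other.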
void psamask_collect_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_distribute_forward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *mask_data,
float *buffer_data) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)] =
mask_data[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w];
}
}
}
}
}
}
void psamask_collect_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, const float *buffer_diff,
float *mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] = buffer_diff[(n * h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)) *
h_feature * w_feature +
h * w_feature + w];
}
}
}
}
}
}
void psamask_distribute_backward(const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask,
const float *buffer_diff, float *mask_diff) {
for (int n = 0; n < num_; n++) {
for (int h = 0; h < h_feature; h++) {
for (int w = 0; w < w_feature; w++) {
// effective mask region : [hstart, hend) x [wstart, wend) with
// mask-indexed
const int hstart = max(0, half_h_mask - h);
const int hend = min(h_mask, h_feature + half_h_mask - h);
const int wstart = max(0, half_w_mask - w);
const int wend = min(w_mask, w_feature + half_w_mask - w);
// (hidx, widx ) with mask-indexed
// (hidx + h - half_h_mask, widx + w - half_w_mask) with
// feature-indexed
for (int hidx = hstart; hidx < hend; hidx++) {
for (int widx = wstart; widx < wend; widx++) {
mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) *
h_feature +
h) *
w_feature +
w] =
buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
h_feature * w_feature +
(hidx + h - half_h_mask) * w_feature +
(widx + w - half_w_mask)];
}
}
}
}
}
}
void psamask_forward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
auto input_ptr = input.ptr<float>();
auto output_ptr = output.ptr<float>();
if (psa_type == 0)
psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
else
psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask, input_ptr, output_ptr);
}
void psamask_backward_cpu(HostContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
  const auto &grad_output = ins[0];
  auto &grad_input = outs[0];
  auto grad_output_ptr = grad_output.ptr<float>();
  auto grad_input_ptr = grad_input.ptr<float>();
  if (psa_type == 0)
    psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask,
                             half_h_mask, half_w_mask, grad_output_ptr,
                             grad_input_ptr);
  else
    psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask,
                                half_h_mask, half_w_mask, grad_output_ptr,
                                grad_input_ptr);
}
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void PSAMaskBackwardCUDAKernelLauncher(const int psa_type,
const DArrayLite grad_output,
DArrayLite grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask, CudaContext &ctx);
void psamask_forward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
const auto &input = ins[0];
auto &output = outs[0];
PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, ctx);
}
void psamask_backward_cuda(CudaContext &ctx, const SSElement &attr,
const OperatorBase::in_list_t &ins,
OperatorBase::out_list_t &outs) {
int psa_type, num_, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask;
SSAttrs(attr)
.get<int>("psa_type", psa_type)
.get<int>("num_", num_)
.get<int>("h_feature", h_feature)
.get<int>("w_feature", w_feature)
.get<int>("h_mask", h_mask)
.get<int>("w_mask", w_mask)
.get<int>("half_h_mask", half_h_mask)
.get<int>("half_w_mask", half_w_mask)
.done();
  const auto &grad_output = ins[0];
  auto &grad_input = outs[0];
  PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_,
                                    h_feature, w_feature, h_mask, w_mask,
                                    half_h_mask, half_w_mask, ctx);
}
PARROTS_EXTENSION_REGISTER(psamask_forward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_forward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_forward_cuda)
#endif
.done();
PARROTS_EXTENSION_REGISTER(psamask_backward)
.attr("psa_type")
.attr("num_")
.attr("h_feature")
.attr("w_feature")
.attr("h_mask")
.attr("w_mask")
.attr("half_h_mask")
.attr("half_w_mask")
.input(1)
.output(1)
.apply(psamask_backward_cpu)
#ifdef PARROTS_USE_CUDA
.apply(psamask_backward_cuda)
#endif
.done();
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "parrots_cuda_helper.hpp"
#include "psamask_cuda_kernel.cuh"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext& ctx) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_distribute_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const DArrayLite grad_output, DArrayLite grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask,
CudaContext& ctx) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_distribute_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
}
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax_y, DArrayLite argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned,
cudaStream_t stream);
void ROIAlignBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite rois,
const DArrayLite argmax_y, const DArrayLite argmax_x, DArrayLite grad_input,
int aligned_height, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream);
void roi_align_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
auto& output = outs[0];
auto& argmax_y = outs[1];
auto& argmax_x = outs[2];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignForwardCUDAKernelLauncher(
input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
void roi_align_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int aligned_height;
int aligned_width;
float spatial_scale;
int sampling_ratio;
int pool_mode;
bool aligned;
SSAttrs(attr)
.get<int>("aligned_height", aligned_height)
.get<int>("aligned_width", aligned_width)
.get<float>("spatial_scale", spatial_scale)
.get<int>("sampling_ratio", sampling_ratio)
.get<int>("pool_mode", pool_mode)
.get<bool>("aligned", aligned)
.done();
const auto& grad_output = ins[0];
const auto& rois = ins[1];
const auto& argmax_y = ins[2];
const auto& argmax_x = ins[3];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIAlignBackwardCUDAKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned, stream);
}
PARROTS_EXTENSION_REGISTER(roi_align_forward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(2)
.output(3)
.apply(roi_align_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(roi_align_backward)
.attr("aligned_height")
.attr("aligned_width")
.attr("spatial_scale")
.attr("sampling_ratio")
.attr("pool_mode")
.attr("aligned")
.input(4)
.output(1)
.apply(roi_align_backward_cuda)
.done();
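// Note: pool_mode selects max pooling (0; the sampled coordinates of the
// maximum are recorded in argmax_y / argmax_x for the backward pass) or
// average pooling (1), and `aligned` enables the half-pixel coordinate
// correction of the aligned RoIAlign variant.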
#include "parrots_cuda_helper.hpp"
#include "roi_align_kernel.cuh"
void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax_y, DArrayLite argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned,
cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
roi_align_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ROIAlignBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite rois,
const DArrayLite argmax_y, const DArrayLite argmax_x, DArrayLite grad_input,
int aligned_height, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
roi_align_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream);
void ROIPoolBackwardCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite rois,
const DArrayLite argmax,
DArrayLite grad_input, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream);
void roi_pool_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& input = ins[0];
const auto& rois = ins[1];
auto& output = outs[0];
auto& argmax = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale, stream);
}
void roi_pool_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int pooled_height;
int pooled_width;
float spatial_scale;
SSAttrs(attr)
.get<int>("pooled_height", pooled_height)
.get<int>("pooled_width", pooled_width)
.get<float>("spatial_scale", spatial_scale)
.done();
const auto& grad_output = ins[0];
const auto& rois = ins[1];
const auto& argmax = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale,
stream);
}
PARROTS_EXTENSION_REGISTER(roi_pool_forward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(2)
.output(2)
.apply(roi_pool_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(roi_pool_backward)
.attr("pooled_height")
.attr("pooled_width")
.attr("spatial_scale")
.input(3)
.output(1)
.apply(roi_pool_backward_cuda)
.done();
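// Note: the forward op records, for each pooled output element, the
// flattened h * w index of the winning input location in `argmax`; the
// backward op uses that index to scatter each grad_output element back into
// grad_input (with an atomic add, since RoIs may overlap).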
#include "parrots_cuda_helper.hpp"
#include "roi_pool_kernel.cuh"
void ROIPoolForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream) {
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(input.elemType().prim(), [&] {
roi_pool_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax.ptr<int>(), pooled_height,
pooled_width, spatial_scale, channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ROIPoolBackwardCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite rois,
const DArrayLite argmax,
DArrayLite grad_input, int pooled_height,
int pooled_width, float spatial_scale,
cudaStream_t stream) {
int output_size = grad_output.size();
  // channels / height / width describe the input feature map (grad_input),
  // not the pooled grad_output
  int channels = grad_input.dim(1);
  int height = grad_input.dim(2);
  int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(grad_output.elemType().prim(), [&] {
roi_pool_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax.ptr<int>(), grad_input.ptr<scalar_t>(), pooled_height,
pooled_width, channels, height, width);
});
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#include "parrots_cpp_helper.hpp"
void SyncBNForwardMeanCUDAKernelLauncher(const DArrayLite input,
DArrayLite mean, cudaStream_t stream);
void SyncBNForwardVarCUDAKernelLauncher(const DArrayLite input,
const DArrayLite mean, DArrayLite var,
cudaStream_t stream);
void SyncBNForwardOutputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite mean, const DArrayLite var,
DArrayLite running_mean, DArrayLite running_var, const DArrayLite weight,
const DArrayLite bias, DArrayLite norm, DArrayLite std, DArrayLite output,
const float eps, const float momentum, size_t group_size,
cudaStream_t stream);
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite norm,
DArrayLite weight_diff,
DArrayLite bias_diff,
cudaStream_t stream);
void SyncBNBackwardDataCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite weight,
const DArrayLite weight_diff, const DArrayLite bias_diff,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input,
cudaStream_t stream);
void sync_bn_forward_mean_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = ins[0];
auto& mean = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardMeanCUDAKernelLauncher(input, mean, stream);
}
void sync_bn_forward_var_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& input = ins[0];
const auto& mean = ins[1];
auto& var = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardVarCUDAKernelLauncher(input, mean, var, stream);
}
void sync_bn_forward_output_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
size_t group_size;
float eps, momentum;
SSAttrs(attr)
.get<float>("eps", eps)
.get<float>("momentum", momentum)
.get<size_t>("group_size", group_size)
.done();
const auto& input = ins[0];
const auto& mean = ins[1];
const auto& var = ins[2];
const auto& weight = ins[3];
const auto& bias = ins[4];
auto& running_mean = outs[0];
auto& running_var = outs[1];
auto& norm = outs[2];
auto& std = outs[3];
auto& output = outs[4];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNForwardOutputCUDAKernelLauncher(
input, mean, var, running_mean, running_var, weight, bias, norm, std,
output, eps, momentum, group_size, stream);
}
void sync_bn_backward_param_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& norm = ins[1];
auto& grad_weight = outs[0];
auto& grad_bias = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight,
grad_bias, stream);
}
void sync_bn_backward_data_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
const auto& grad_output = ins[0];
const auto& weight = ins[1];
const auto& grad_weight = ins[2];
const auto& grad_bias = ins[3];
const auto& norm = ins[4];
const auto& std = ins[5];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight,
grad_bias, norm, std, grad_input,
stream);
}
PARROTS_EXTENSION_REGISTER(sync_bn_forward_mean)
.input(1)
.output(1)
.apply(sync_bn_forward_mean_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_forward_var)
.input(2)
.output(1)
.apply(sync_bn_forward_var_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_forward_output)
.attr("eps")
.attr("momentum")
.attr("group_size")
.input(5)
.output(5)
.apply(sync_bn_forward_output_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_backward_param)
.input(2)
.output(2)
.apply(sync_bn_backward_param_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sync_bn_backward_data)
.input(6)
.output(1)
.apply(sync_bn_backward_data_cuda)
.done();
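
// The forward pass is deliberately split into three ops (mean -> var ->
// output) rather than one: between stages the Python caller can all-reduce
// the per-GPU statistics across the process group (whose size is passed in as
// "group_size"), which is what makes the batch norm synchronized. The same
// reasoning splits the backward pass into a parameter-gradient op and a
// data-gradient op.

// --- SyncBN CUDA kernel launchers ---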
#include "parrots_cuda_helper.hpp"
#include "sync_bn_cuda_kernel.cuh"
void SyncBNForwardMeanCUDAKernelLauncher(const DArrayLite input,
DArrayLite mean, cudaStream_t stream) {
int num = input.dim(0);
int channels = input.dim(1);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sync_bn_forward_mean_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(input.ptr<scalar_t>(),
mean.ptr<float>(), num,
channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNForwardVarCUDAKernelLauncher(const DArrayLite input,
const DArrayLite mean, DArrayLite var,
cudaStream_t stream) {
int num = input.dim(0);
int channels = input.dim(1);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sync_bn_forward_var_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.ptr<scalar_t>(), mean.ptr<float>(), var.ptr<float>(), num,
channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNForwardOutputCUDAKernelLauncher(
const DArrayLite input, const DArrayLite mean, const DArrayLite var,
DArrayLite running_mean, DArrayLite running_var, const DArrayLite weight,
const DArrayLite bias, DArrayLite norm, DArrayLite std, DArrayLite output,
float eps, float momentum, size_t group_size, cudaStream_t stream) {
int num = input.dim(0);
int channels = input.dim(1);
int spatial = input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sync_bn_forward_output_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
input.ptr<scalar_t>(), mean.ptr<float>(), var.ptr<float>(),
running_mean.ptr<float>(), running_var.ptr<float>(),
weight.ptr<float>(), bias.ptr<float>(), norm.ptr<float>(),
std.ptr<float>(), output.ptr<scalar_t>(), num, channels,
spatial, eps, momentum, group_size);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardParamCUDAKernelLauncher(const DArrayLite grad_output,
const DArrayLite norm,
DArrayLite grad_weight,
DArrayLite grad_bias,
cudaStream_t stream) {
int num = grad_output.dim(0);
int channels = grad_output.dim(1);
int spatial = grad_output.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
sync_bn_backward_param_cuda_kernel<scalar_t>
<<<channels, THREADS_PER_BLOCK, 0, stream>>>(
grad_output.ptr<scalar_t>(), norm.ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(), num, channels,
spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SyncBNBackwardDataCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite weight,
const DArrayLite grad_weight, const DArrayLite grad_bias,
const DArrayLite norm, const DArrayLite std, DArrayLite grad_input,
cudaStream_t stream) {
int output_size = grad_input.size();
int num = grad_input.dim(0);
int channels = grad_input.dim(1);
int spatial = grad_input.dim(2);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
sync_bn_backward_data_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), weight.ptr<float>(),
grad_weight.ptr<float>(), grad_bias.ptr<float>(),
norm.ptr<float>(), std.ptr<float>(), grad_input.ptr<scalar_t>(),
num, channels, spatial);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
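
// --- parrots_cpp_helper.hpp: common includes for the CPU-side bindings ---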
#ifndef PARROTS_CPP_HELPER
#define PARROTS_CPP_HELPER
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darraylite.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include <vector>
using namespace parrots;
#endif // PARROTS_CPP_HELPER
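
// --- parrots_cuda_helper.hpp: includes and macros shared by the .cu files ---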
#ifndef PARROTS_CUDA_HELPER
#define PARROTS_CUDA_HELPER
#include <cuda.h>
#include <float.h>
#include <stdio.h>   // fprintf in PARROTS_CUDA_CHECK
#include <stdlib.h>  // exit in PARROTS_CUDA_CHECK
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darrayutil.hpp>
#include <parrots/foundation/exceptions.hpp>
#include <parrots/foundation/float16.hpp>
#include <parrots/foundation/mathfunction.hpp>
#include "common_cuda_helper.hpp"
#include "parrots_cudawarpfunction.cuh"
using namespace parrots;
using phalf = float16;
#define __PHALF(x) (x.y)
#define PARROTS_CUDA_CHECK(exp) \
do { \
cudaError_t err = exp; \
if (err != cudaSuccess) { \
fprintf(stderr, "cudaCheckError() failed : %s\n", \
cudaGetErrorString(err)); \
exit(-1); \
} \
} while (0)
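// Typical use, as in the launchers above: query the launch status right after
// a kernel launch and abort on failure:
//
//   PARROTS_CUDA_CHECK(cudaGetLastError());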
#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \
case prim_type: { \
using scalar_t = type; \
return __VA_ARGS__(); \
}
#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
switch (the_type) { \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \
default: \
PARROTS_NOTSUPPORTED; \
} \
}()
#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
switch (the_type) { \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \
PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \
default: \
PARROTS_NOTSUPPORTED; \
} \
}()
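// Usage sketch for the dispatch macros (mirrors the launchers above): the
// switch binds scalar_t to double/float/float16 according to the runtime
// element type and then invokes the lambda, so one templated kernel covers
// every floating type:
//
//   PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(arr.elemType().prim(), [&] {
//     some_kernel<scalar_t><<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(
//         arr.ptr<scalar_t>(), n);
//   });
//
// An unsupported element type falls through to PARROTS_NOTSUPPORTED.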
/** atomicAdd **/
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
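// Standard polyfill from the CUDA C Programming Guide: devices older than
// sm_60 have no native double-precision atomicAdd, so emulate it with a
// 64-bit compare-and-swap loop on the same address.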
static __inline__ __device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
if (val == 0.0) return __longlong_as_double(old);
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
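// CUDA has no CAS on 16-bit words, so this half-precision atomicAdd widens to
// the enclosing 4-byte-aligned word: read it, splice the updated half into
// the high or low 16 bits depending on the address parity, and publish the
// whole word with a 32-bit atomicCAS, retrying until no other thread raced.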
static __inline__ __device__ float16 atomicAdd(float16* address, float16 val) {
unsigned int* aligned =
(unsigned int*)((size_t)address - ((size_t)address & 2));
unsigned int old = *aligned;
unsigned int assumed;
unsigned short old_as_us;
do {
assumed = old;
old_as_us =
(unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff);
#if __CUDACC_VER_MAJOR__ >= 9
float16 tmp;
tmp.x = old_as_us;
float16 sum = tmp + val;
unsigned short sum_as_us = sum.x;
// half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us))
// + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum);
#else
unsigned short sum_as_us =
__float2half_rn(__half2float(old_as_us) + (float)(val));
#endif
unsigned int sum_as_ui = (size_t)address & 2
? (sum_as_us << 16) | (old & 0xffff)
: (old & 0xffff0000) | sum_as_us;
old = atomicCAS(aligned, assumed, sum_as_ui);
} while (assumed != old);
//__half_raw raw = {old_as_us};
// return float16(raw);
return *reinterpret_cast<float16*>(&old_as_us);
}
#endif // PARROTS_CUDA_HELPER