Unverified Commit c0f5492e authored by zhuyuanhao, committed by GitHub

add ext ops, support parrots (#310)



* add ext ops, support parrots

* fix lint

* fix lint

* update op from mmdetection

* support non-pytorch env

* fix import bug

* test without importing mmcv.op

* rename mmcv.op to mmcv.ops

* fix compile warning

* 1. fix syncbn warning in PyTorch 1.5
2. support CPU-only compilation
3. add point_sample from mmdet

* fix text bug

* update docstrings

* fix line endings

* minor updates

* remove non_local from ops

* bug fix for nonlocal2d

* rename ops_ext to _ext and _ext to _flow_warp_ext

* update the doc

* try clang-format github action

* fix github action

* add ops to api.rst

* fix cpp format

* fix clang format issues

* remove .clang-format
Co-authored-by: Kai Chen <chenkaidev@gmail.com>
parent a7bf7701
// Copyright (c) 2018, SenseTime.
#include "parrots_cpp_helper.hpp"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream);
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream);
void sigmoid_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void sigmoid_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& grad_input = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input,
gamma, alpha, stream);
}
void softmax_focal_loss_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& output = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output,
gamma, alpha, stream);
}
void softmax_focal_loss_backward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
float gamma;
float alpha;
SSAttrs(attr).get<float>("gamma", gamma).get<float>("alpha", alpha).done();
// get inputs and outputs
const auto& input = ins[0];
const auto& target = ins[1];
const auto& weight = ins[2];
auto& buff = outs[0];
auto& grad_input = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff,
grad_input, gamma, alpha, stream);
}
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(sigmoid_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(sigmoid_focal_loss_backward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_forward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(1)
.apply(softmax_focal_loss_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(softmax_focal_loss_backward)
.attr("gamma")
.attr("alpha")
.input(3)
.output(2)
.apply(softmax_focal_loss_backward_cuda)
.done();
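Every op in this patch follows the same Parrots registration pattern: a host entry point with the `(CudaContext&, const SSElement&, ins, outs)` signature reads attributes through `SSAttrs`, unpacks `ins`/`outs`, and forwards to a CUDA kernel launcher, while a `PARROTS_EXTENSION_REGISTER` chain declares the attribute names and the input/output counts. The sketch below shows that skeleton for a hypothetical `example_scale_forward` op; the op name, the `scale` attribute, and the launcher are illustrative only and not part of this commit (the launcher definition would live in a matching .cu file, just like the ops above).

// Illustrative sketch of the registration pattern used throughout this commit.
#include "parrots_cpp_helper.hpp"

// Defined in a companion .cu file (hypothetical, for illustration only).
void ExampleScaleForwardCUDAKernelLauncher(const DArrayLite input,
                                           DArrayLite output, float scale,
                                           cudaStream_t stream);

void example_scale_forward_cuda(CudaContext& ctx, const SSElement& attr,
                                const OperatorBase::in_list_t& ins,
                                OperatorBase::out_list_t& outs) {
  float scale;
  SSAttrs(attr).get<float>("scale", scale).done();
  const auto& input = ins[0];
  auto& output = outs[0];
  cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
  ExampleScaleForwardCUDAKernelLauncher(input, output, scale, stream);
}

PARROTS_EXTENSION_REGISTER(example_scale_forward)
    .attr("scale")
    .input(1)
    .output(1)
    .apply(example_scale_forward_cuda)
    .done();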
#include "parrots_cuda_helper.hpp"
#include "sigmoid_focal_loss_kernel.cuh"
#include "softmax_focal_loss_kernel.cuh"
void SigmoidFocalLossForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SigmoidFocalLossBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite target, const DArrayLite weight,
DArrayLite grad_input, float gamma, float alpha, cudaStream_t stream) {
int output_size = grad_input.size();
int num_classes = input.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
sigmoid_focal_loss_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), gamma,
alpha, num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossForwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite output, float gamma, float alpha, cudaStream_t stream) {
int output_size = output.size();
int num_classes = softmax.dim(1);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
softmax.elemType().prim(), ([&] {
softmax_focal_loss_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), output.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void SoftmaxFocalLossBackwardCUDAKernelLauncher(
const DArrayLite softmax, const DArrayLite target, const DArrayLite weight,
DArrayLite buff, DArrayLite grad_input, float gamma, float alpha,
cudaStream_t stream) {
int output_size = buff.size();
int num_classes = softmax.dim(1);
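// Two-pass backward: the first kernel fills `buff` with per-sample focal
// coefficients, the second kernel consumes `buff` to write grad_input.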
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda1_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
weight.ptr<scalar_t>(), buff.ptr<scalar_t>(), gamma, alpha,
num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
output_size = grad_input.size();
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_input.elemType().prim(), ([&] {
softmax_focal_loss_backward_cuda2_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, softmax.ptr<scalar_t>(), target.ptr<int64_t>(),
buff.ptr<scalar_t>(), grad_input.ptr<scalar_t>(), num_classes);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
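All launchers in this patch size their grids the same way: one thread per output element, THREADS_PER_BLOCK threads per block, and GET_BLOCKS rounding the block count up. The sketch below is the assumed shape of that helper (the real definition lives in the shared CUDA helper header and may additionally cap the block count).

// Assumed grid-sizing helper, for illustration only.
#define THREADS_PER_BLOCK 512

inline int GET_BLOCKS(const int N) {
  // Ceiling division: enough blocks so that blocks * THREADS_PER_BLOCK >= N.
  return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
}
// e.g. N = 10000 elements -> GET_BLOCKS(10000) = 20 blocks of 512 threads.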
#include "parrots_cpp_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream);
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream);
void masked_im2col_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int kernel_h, kernel_w, pad_h, pad_w;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.done();
const auto& im = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& col = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col,
kernel_h, kernel_w, pad_h, pad_w,
stream);
}
void masked_col2im_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
// im: (n, ic, h, w), kernel size (kh, kw)
// kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
int height, width, channels;
SSAttrs(attr)
.get<int>("height", height)
.get<int>("width", width)
.get<int>("channels", channels)
.done();
const auto& col = ins[0];
const auto& mask_h_idx = ins[1];
const auto& mask_w_idx = ins[2];
auto& im = outs[0];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height,
width, channels, stream);
}
PARROTS_EXTENSION_REGISTER(masked_im2col_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("pad_h")
.attr("pad_w")
.input(3)
.output(1)
.apply(masked_im2col_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(masked_col2im_forward)
.attr("height")
.attr("width")
.attr("channels")
.input(3)
.output(1)
.apply(masked_col2im_forward_cuda)
.done();
#include "masked_conv2d_cuda_kernel.cuh"
#include "parrots_cuda_helper.hpp"
void MaskedIm2colForwardCUDAKernelLauncher(
const DArrayLite bottom_data, const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx, DArrayLite top_data, const int kernel_h,
const int kernel_w, const int pad_h, const int pad_w, cudaStream_t stream) {
int channels = bottom_data.dim(1);
int height = bottom_data.dim(2);
int width = bottom_data.dim(3);
int mask_cnt = mask_h_idx.dim(0);
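// One thread per (channel, masked position) pair; each thread copies its
// kernel_h x kernel_w patch into the column buffer.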
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedIm2colForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
kernel_h, kernel_w, pad_h, pad_w, mask_h_idx.ptr<int64_t>(),
mask_w_idx.ptr<int64_t>(), mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void MaskedCol2imForwardCUDAKernelLauncher(const DArrayLite bottom_data,
const DArrayLite mask_h_idx,
const DArrayLite mask_w_idx,
DArrayLite top_data, const int height,
const int width, const int channels,
cudaStream_t stream) {
int mask_cnt = mask_h_idx.dim(0);
int output_size = mask_cnt * channels;
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
bottom_data.elemType().prim(), ([&] {
MaskedCol2imForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, bottom_data.ptr<scalar_t>(), height, width,
channels, mask_h_idx.ptr<int64_t>(), mask_w_idx.ptr<int64_t>(),
mask_cnt, top_data.ptr<scalar_t>());
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) 2019, SenseTime.
#include "parrots_cpp_helper.hpp"
void ModulatedDeformConvForwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite output, DArrayLite columns, int kernel_h, int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, const int group,
int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void ModulatedDeformConvBackwardCUDAKernelLauncher(
const DArrayLite input, const DArrayLite weight, const DArrayLite bias,
const DArrayLite ones, const DArrayLite offset, const DArrayLite mask,
DArrayLite columns, DArrayLite grad_input, DArrayLite grad_weight,
DArrayLite grad_bias, DArrayLite grad_offset, DArrayLite grad_mask,
DArrayLite grad_output, int kernel_h, int kernel_w, int stride_h,
int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w,
int group, int deformable_group, const bool with_bias, CudaContext& ctx,
cudaStream_t stream);
void modulated_deform_conv_forward_cuda(CudaContext& ctx, const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto output = outs[0];
auto columns = outs[1];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvForwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, output, columns, kernel_h,
kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
void modulated_deform_conv_backward_cuda(CudaContext& ctx,
const SSElement& attr,
const OperatorBase::in_list_t& ins,
OperatorBase::out_list_t& outs) {
int kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h,
dilation_w, group, deformable_group, with_bias;
SSAttrs(attr)
.get<int>("kernel_h", kernel_h)
.get<int>("kernel_w", kernel_w)
.get<int>("stride_h", stride_h)
.get<int>("stride_w", stride_w)
.get<int>("pad_h", pad_h)
.get<int>("pad_w", pad_w)
.get<int>("dilation_h", dilation_h)
.get<int>("dilation_w", dilation_w)
.get<int>("group", group)
.get<int>("deformable_group", deformable_group)
.get<int>("with_bias", with_bias)
.done();
auto input = ins[0];
auto weight = ins[1];
auto bias = ins[2];
auto ones = ins[3];
auto offset = ins[4];
auto mask = ins[5];
auto columns = outs[0];
auto grad_input = outs[1];
auto grad_weight = outs[2];
auto grad_bias = outs[3];
auto grad_offset = outs[4];
auto grad_mask = outs[5];
auto grad_output = outs[6];
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
ModulatedDeformConvBackwardCUDAKernelLauncher(
input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight,
grad_bias, grad_offset, grad_mask, grad_output, kernel_h, kernel_w,
stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
deformable_group, with_bias, ctx, stream);
}
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_forward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(2)
.apply(modulated_deform_conv_forward_cuda)
.done();
PARROTS_EXTENSION_REGISTER(modulated_deform_conv_backward)
.attr("kernel_h")
.attr("kernel_w")
.attr("stride_h")
.attr("stride_w")
.attr("pad_h")
.attr("pad_w")
.attr("dilation_h")
.attr("dilation_w")
.attr("group")
.attr("deformable_group")
.attr("with_bias")
.input(6)
.output(7)
.apply(modulated_deform_conv_backward_cuda)
.done();
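The attribute list above mirrors an ordinary 2-D convolution configuration; for reference, the output spatial size implied by these attributes follows the usual convolution formula. The helper below is a worked check for illustration only, not code from this commit.

// Standard conv output-size arithmetic for the attrs registered above.
// Example: h = 64, kernel_h = 3, stride_h = 1, pad_h = 1, dilation_h = 1
//   out_h = (h + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1
//         = (64 + 2 - 3) / 1 + 1 = 64
inline int conv_out_size(int in, int kernel, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}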
#include "nms_kernel.cuh"
#include "parrots_cuda_helper.hpp"
DArrayLite NMSCUDAKernelLauncher(const DArrayLite boxes_sorted,
const DArrayLite order, const DArrayLite areas,
float iou_threshold, int offset,
CudaContext& ctx, cudaStream_t stream) {
size_t boxes_num = boxes_sorted.dim(0);
if (boxes_sorted.size() == 0) {
auto select = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, 0));
return select;
}
const size_t col_blocks = DIVUP(boxes_num, threadsPerBlock);
auto mask = ctx.createDArrayLite(
DArraySpec::array(Prim::Int64, DArrayShape(boxes_num, col_blocks)));
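// Each row i of `mask` holds col_blocks 64-bit words; bit j of word b marks an
// IoU above iou_threshold between box i and box (b * threadsPerBlock + j).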
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
PARROTS_CUDA_CHECK(cudaGetLastError());
nms_cuda<<<blocks, threads, 0, stream>>>(
boxes_num, iou_threshold, offset, boxes_sorted.ptr<float>(),
(unsigned long long*)mask.ptr<int64_t>());
PARROTS_CUDA_CHECK(cudaGetLastError());
auto mask_cpu = ctx.createDArrayLite(mask, getHostProxy());
auto mask_host = mask_cpu.ptr<int64_t>();
auto remv = ctx.createDArrayLite(DArraySpec::array(Prim::Int64, col_blocks),
getHostProxy());
remv.setZeros(syncStream());
auto remv_ptr = remv.ptr<int64_t>();
auto keep_t = ctx.createDArrayLite(DArraySpec::array(Prim::Uint8, boxes_num),
getHostProxy());
keep_t.setZeros(syncStream());
auto keep = keep_t.ptr<uint8_t>();
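// Greedy suppression on the host: keep box i unless a previously kept box has
// already suppressed it (its bit is set in remv), then fold box i's overlap
// mask into remv so it suppresses later boxes.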
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv_ptr[nblock] & (1ULL << inblock))) {
keep[i] = 1;
int64_t* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv_ptr[j] |= p[j];
}
}
}
auto keep_cuda = ctx.createDArrayLite(keep_t, ctx.getProxy());
PARROTS_CUDA_CHECK(cudaGetLastError());
return keep_cuda;
}
#include "parrots_cpp_helper.hpp"
using namespace parrots;
#include "parrots_cuda_helper.hpp"
using namespace parrots;
// Modified from
// https://github.com/hszhao/semseg/blob/master/lib/psa/src
#include "parrots_cuda_helper.hpp"
#include "psamask_cuda_kernel.cuh"
void PSAMaskForwardCUDAKernelLauncher(const int psa_type,
const DArrayLite input, DArrayLite output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask, CudaContext& ctx) {
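// psa_type selects the PSANet mask mode: 0 = COLLECT, 1 = DISTRIBUTE.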
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_collect_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(input.elemType().prim(), [&] {
psamask_distribute_forward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, input.ptr<scalar_t>(), output.ptr<scalar_t>());
});
}
void PSAMaskBackwardCUDAKernelLauncher(
const int psa_type, const DArrayLite grad_output, DArrayLite grad_input,
const int num_, const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask, const int half_w_mask,
CudaContext& ctx) {
int nthreads = num_ * h_feature * w_feature;
cudaStream_t stream = getStreamNative<CudaDevice>(ctx.getStream());
if (psa_type == 0)
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_collect_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
else
PARROTS_DISPATCH_FLOATING_TYPES(grad_input.elemType().prim(), [&] {
psamask_distribute_backward_cuda<scalar_t><<<nthreads, 512, 0, stream>>>(
nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
half_w_mask, grad_output.ptr<scalar_t>(), grad_input.ptr<scalar_t>());
});
}
#include "parrots_cuda_helper.hpp"
#include "roi_align_kernel.cuh"
void ROIAlignForwardCUDAKernelLauncher(const DArrayLite input,
const DArrayLite rois, DArrayLite output,
DArrayLite argmax_y, DArrayLite argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned,
cudaStream_t stream) {
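// pool_mode: 0 = max pooling (argmax_y/argmax_x record the sampled positions),
// 1 = average pooling; `aligned` applies the half-pixel coordinate offset.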
int output_size = output.size();
int channels = input.dim(1);
int height = input.dim(2);
int width = input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
input.elemType().prim(), ([&] {
roi_align_forward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.ptr<scalar_t>(), rois.ptr<scalar_t>(),
output.ptr<scalar_t>(), argmax_y.ptr<scalar_t>(),
argmax_x.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
void ROIAlignBackwardCUDAKernelLauncher(
const DArrayLite grad_output, const DArrayLite rois,
const DArrayLite argmax_y, const DArrayLite argmax_x, DArrayLite grad_input,
int aligned_height, int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream) {
int output_size = grad_output.size();
int channels = grad_input.dim(1);
int height = grad_input.dim(2);
int width = grad_input.dim(3);
PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(
grad_output.elemType().prim(), ([&] {
roi_align_backward_cuda_kernel<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.ptr<scalar_t>(), rois.ptr<scalar_t>(),
argmax_y.ptr<scalar_t>(), argmax_x.ptr<scalar_t>(),
grad_input.ptr<scalar_t>(), aligned_height, aligned_width,
spatial_scale, sampling_ratio, pool_mode, aligned, channels,
height, width);
}));
PARROTS_CUDA_CHECK(cudaGetLastError());
}
#ifndef PARROTS_CPP_HELPER
#define PARROTS_CPP_HELPER
#include <parrots/darray/darraymath.hpp>
#include <parrots/extension.hpp>
#include <parrots/foundation/darraylite.hpp>
#include <parrots/foundation/ssattrs.hpp>
#include <vector>
using namespace parrots;
#endif // PARROTS_CPP_HELPER